diff options
54 files changed, 1364 insertions, 458 deletions
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl index 0b1a3f97f28..d7884b13fb1 100644 --- a/Documentation/DocBook/kernel-locking.tmpl +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -1645,7 +1645,9 @@ the amount of locking which needs to be done. all the readers who were traversing the list when we deleted the element are finished. We use <function>call_rcu()</function> to register a callback which will actually destroy the object once - the readers are finished. + all pre-existing readers are finished. Alternatively, + <function>synchronize_rcu()</function> may be used to block until + all pre-existing are finished. </para> <para> But how does Read Copy Update know when the readers are @@ -1714,7 +1716,7 @@ the amount of locking which needs to be done. - object_put(obj); + list_del_rcu(&obj->list); cache_num--; -+ call_rcu(&obj->rcu, cache_delete_rcu, obj); ++ call_rcu(&obj->rcu, cache_delete_rcu); } /* Must be holding cache_lock */ @@ -1725,14 +1727,6 @@ the amount of locking which needs to be done. if (++cache_num > MAX_CACHE_SIZE) { struct object *i, *outcast = NULL; list_for_each_entry(i, &cache, list) { -@@ -85,6 +94,7 @@ - obj->popularity = 0; - atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ - spin_lock_init(&obj->lock); -+ INIT_RCU_HEAD(&obj->rcu); - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); @@ -104,12 +114,11 @@ struct object *cache_find(int id) { diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 790d1a81237..0c134f8afc6 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -218,13 +218,22 @@ over a rather long period of time, but improvements are always welcome! include: a. Keeping a count of the number of data-structure elements - used by the RCU-protected data structure, including those - waiting for a grace period to elapse. Enforce a limit - on this number, stalling updates as needed to allow - previously deferred frees to complete. - - Alternatively, limit only the number awaiting deferred - free rather than the total number of elements. + used by the RCU-protected data structure, including + those waiting for a grace period to elapse. Enforce a + limit on this number, stalling updates as needed to allow + previously deferred frees to complete. Alternatively, + limit only the number awaiting deferred free rather than + the total number of elements. + + One way to stall the updates is to acquire the update-side + mutex. (Don't try this with a spinlock -- other CPUs + spinning on the lock could prevent the grace period + from ever ending.) Another way to stall the updates + is for the updates to use a wrapper function around + the memory allocator, so that this wrapper function + simulates OOM when there is too much memory awaiting an + RCU grace period. There are of course many other + variations on this theme. b. Limiting update rate. For example, if updates occur only once per hour, then no explicit rate limiting is required, @@ -365,3 +374,26 @@ over a rather long period of time, but improvements are always welcome! and the compiler to freely reorder code into and out of RCU read-side critical sections. It is the responsibility of the RCU update-side primitives to deal with this. + +17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and + the __rcu sparse checks to validate your RCU code. These + can help find problems as follows: + + CONFIG_PROVE_RCU: check that accesses to RCU-protected data + structures are carried out under the proper RCU + read-side critical section, while holding the right + combination of locks, or whatever other conditions + are appropriate. + + CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the + same object to call_rcu() (or friends) before an RCU + grace period has elapsed since the last time that you + passed that same object to call_rcu() (or friends). + + __rcu sparse checks: tag the pointer to the RCU-protected data + structure with __rcu, and sparse will warn you if you + access that pointer without the services of one of the + variants of rcu_dereference(). + + These debugging aids can help you find problems that are + otherwise extremely difficult to spot. diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index c908c5f8364..5808731f72d 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -28,7 +28,7 @@ struct evdev { int minor; struct input_handle handle; wait_queue_head_t wait; - struct evdev_client *grab; + struct evdev_client __rcu *grab; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 29e850a7a2f..1318ee00834 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -127,7 +127,10 @@ static void handle_tx(struct vhost_net *net) size_t len, total_len = 0; int err, wmem; size_t hdr_size; - struct socket *sock = rcu_dereference(vq->private_data); + struct socket *sock; + + sock = rcu_dereference_check(vq->private_data, + lockdep_is_held(&vq->mutex)); if (!sock) return; @@ -582,7 +585,10 @@ static void vhost_net_disable_vq(struct vhost_net *n, static void vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { - struct socket *sock = vq->private_data; + struct socket *sock; + + sock = rcu_dereference_protected(vq->private_data, + lockdep_is_held(&vq->mutex)); if (!sock) return; if (vq == n->vqs + VHOST_NET_VQ_TX) { @@ -598,7 +604,8 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct socket *sock; mutex_lock(&vq->mutex); - sock = vq->private_data; + sock = rcu_dereference_protected(vq->private_data, + lockdep_is_held(&vq->mutex)); vhost_net_disable_vq(n, vq); rcu_assign_pointer(vq->private_data, NULL); mutex_unlock(&vq->mutex); @@ -736,7 +743,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) } /* start polling new socket */ - oldsock = vq->private_data; + oldsock = rcu_dereference_protected(vq->private_data, + lockdep_is_held(&vq->mutex)); if (sock != oldsock) { vhost_net_disable_vq(n, vq); rcu_assign_pointer(vq->private_data, sock); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index e05557d5299..b5c49478d20 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -284,7 +284,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev) vhost_dev_cleanup(dev); memory->nregions = 0; - dev->memory = memory; + RCU_INIT_POINTER(dev->memory, memory); return 0; } @@ -316,8 +316,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev) fput(dev->log_file); dev->log_file = NULL; /* No one will access memory at this point */ - kfree(dev->memory); - dev->memory = NULL; + kfree(rcu_dereference_protected(dev->memory, + lockdep_is_held(&dev->mutex))); + RCU_INIT_POINTER(dev->memory, NULL); if (dev->mm) mmput(dev->mm); dev->mm = NULL; @@ -401,14 +402,22 @@ static int vq_access_ok(unsigned int num, /* Caller should have device mutex but not vq mutex */ int vhost_log_access_ok(struct vhost_dev *dev) { - return memory_access_ok(dev, dev->memory, 1); + struct vhost_memory *mp; + + mp = rcu_dereference_protected(dev->memory, + lockdep_is_held(&dev->mutex)); + return memory_access_ok(dev, mp, 1); } /* Verify access for write logging. */ /* Caller should have vq mutex and device mutex */ static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base) { - return vq_memory_access_ok(log_base, vq->dev->memory, + struct vhost_memory *mp; + + mp = rcu_dereference_protected(vq->dev->memory, + lockdep_is_held(&vq->mutex)); + return vq_memory_access_ok(log_base, mp, vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) && (!vq->log_used || log_access_ok(log_base, vq->log_addr, sizeof *vq->used + @@ -448,7 +457,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) kfree(newmem); return -EFAULT; } - oldmem = d->memory; + oldmem = rcu_dereference_protected(d->memory, + lockdep_is_held(&d->mutex)); rcu_assign_pointer(d->memory, newmem); synchronize_rcu(); kfree(oldmem); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index afd77295971..af3c11ded5f 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -106,7 +106,7 @@ struct vhost_virtqueue { * vhost_work execution acts instead of rcu_read_lock() and the end of * vhost_work execution acts instead of rcu_read_lock(). * Writers use virtqueue mutex. */ - void *private_data; + void __rcu *private_data; /* Log write descriptors */ void __user *log_base; struct vhost_log log[VHOST_NET_MAX_SG]; @@ -116,7 +116,7 @@ struct vhost_dev { /* Readers use RCU to access memory table pointer * log base pointer and features. * Writers use mutex below.*/ - struct vhost_memory *memory; + struct vhost_memory __rcu *memory; struct mm_struct *mm; struct mutex mutex; unsigned acked_features; @@ -173,7 +173,11 @@ enum { static inline int vhost_has_feature(struct vhost_dev *dev, int bit) { - unsigned acked_features = rcu_dereference(dev->acked_features); + unsigned acked_features; + + acked_features = + rcu_dereference_index_check(dev->acked_features, + lockdep_is_held(&dev->mutex)); return acked_features & (1 << bit); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed3e92e41c6..3cb7d04308c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -75,7 +75,7 @@ struct cgroup_subsys_state { unsigned long flags; /* ID for this css, if possible */ - struct css_id *id; + struct css_id __rcu *id; }; /* bits in struct cgroup_subsys_state flags field */ @@ -205,7 +205,7 @@ struct cgroup { struct list_head children; /* my children */ struct cgroup *parent; /* my parent */ - struct dentry *dentry; /* cgroup fs entry, RCU protected */ + struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */ /* Private pointers for each registered subsystem */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c1a62c56a66..320d6c94ff8 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -16,7 +16,11 @@ # define __release(x) __context__(x,-1) # define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) # define __percpu __attribute__((noderef, address_space(3))) +#ifdef CONFIG_SPARSE_RCU_POINTER +# define __rcu __attribute__((noderef, address_space(4))) +#else # define __rcu +#endif extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); #else diff --git a/include/linux/cred.h b/include/linux/cred.h index 4d2c39573f3..4aaeab37644 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -84,7 +84,7 @@ struct thread_group_cred { atomic_t usage; pid_t tgid; /* thread group process ID */ spinlock_t lock; - struct key *session_keyring; /* keyring inherited over fork */ + struct key __rcu *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct rcu_head rcu; /* RCU deletion hook */ }; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index f59ed297b66..133c0ba25e3 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -31,7 +31,7 @@ struct embedded_fd_set { struct fdtable { unsigned int max_fds; - struct file ** fd; /* current fd array */ + struct file __rcu **fd; /* current fd array */ fd_set *close_on_exec; fd_set *open_fds; struct rcu_head rcu; @@ -46,7 +46,7 @@ struct files_struct { * read mostly part */ atomic_t count; - struct fdtable *fdt; + struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP @@ -55,7 +55,7 @@ struct files_struct { int next_fd; struct embedded_fd_set close_on_exec_init; struct embedded_fd_set open_fds_init; - struct file * fd_array[NR_OPEN_DEFAULT]; + struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; #define rcu_dereference_check_fdtable(files, fdtfd) \ diff --git a/include/linux/fs.h b/include/linux/fs.h index 76041b61475..aa3dc8d2043 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1380,7 +1380,7 @@ struct super_block { * Saved mount options for lazy filesystems using * generic_show_options() */ - char *s_options; + char __rcu *s_options; }; extern struct timespec current_fs_time(struct super_block *sb); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5f2f4c4d8fb..af3f06b41dc 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -129,8 +129,8 @@ struct blk_scsi_cmd_filter { struct disk_part_tbl { struct rcu_head rcu_head; int len; - struct hd_struct *last_lookup; - struct hd_struct *part[]; + struct hd_struct __rcu *last_lookup; + struct hd_struct __rcu *part[]; }; struct gendisk { @@ -149,7 +149,7 @@ struct gendisk { * non-critical accesses use RCU. Always access through * helpers. */ - struct disk_part_tbl *part_tbl; + struct disk_part_tbl __rcu *part_tbl; struct hd_struct part0; const struct block_device_operations *fops; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index d5b387669da..1f4517d55b1 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -139,7 +139,7 @@ static inline void account_system_vtime(struct task_struct *tsk) #endif #if defined(CONFIG_NO_HZ) -#if defined(CONFIG_TINY_RCU) +#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) extern void rcu_enter_nohz(void); extern void rcu_exit_nohz(void); diff --git a/include/linux/idr.h b/include/linux/idr.h index e968db71e33..cdb715e58e3 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -50,14 +50,14 @@ struct idr_layer { unsigned long bitmap; /* A zero bit means "space here" */ - struct idr_layer *ary[1<<IDR_BITS]; + struct idr_layer __rcu *ary[1<<IDR_BITS]; int count; /* When zero, we can release it */ int layer; /* distance from leaf */ struct rcu_head rcu_head; }; struct idr { - struct idr_layer *top; + struct idr_layer __rcu *top; struct idr_layer *id_free; int layers; /* only valid without concurrent changes */ int id_free_cnt; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1f43fa56f60..2fea6c8ef6b 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -82,11 +82,17 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_FULL_SET #ifdef CONFIG_TREE_PREEMPT_RCU +#define INIT_TASK_RCU_TREE_PREEMPT() \ + .rcu_blocked_node = NULL, +#else +#define INIT_TASK_RCU_TREE_PREEMPT(tsk) +#endif +#ifdef CONFIG_PREEMPT_RCU #define INIT_TASK_RCU_PREEMPT(tsk) \ .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special = 0, \ - .rcu_blocked_node = NULL, \ - .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), + .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ + INIT_TASK_RCU_TREE_PREEMPT() #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif @@ -137,8 +143,8 @@ extern struct cred init_cred; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - .real_cred = &init_cred, \ - .cred = &init_cred, \ + RCU_INIT_POINTER(.real_cred, &init_cred), \ + RCU_INIT_POINTER(.cred, &init_cred), \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(tsk.cred_guard_mutex), \ .comm = "swapper", \ diff --git a/include/linux/input.h b/include/linux/input.h index 896a92227bc..d6ae1761be9 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -1196,7 +1196,7 @@ struct input_dev { int (*flush)(struct input_dev *dev, struct file *file); int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value); - struct input_handle *grab; + struct input_handle __rcu *grab; spinlock_t event_lock; struct mutex mutex; diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 64d52913303..3e70b21884a 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -53,7 +53,7 @@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; - void *ioc_data; + void __rcu *ioc_data; }; static inline struct io_context *ioc_task_link(struct io_context *ioc) diff --git a/include/linux/key.h b/include/linux/key.h index cd50dfa1d4c..3db0adce1fd 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -178,8 +178,9 @@ struct key { */ union { unsigned long value; + void __rcu *rcudata; void *data; - struct keyring_list *subscriptions; + struct keyring_list __rcu *subscriptions; } payload; }; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c13cc48697a..ac740b26eb1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -205,7 +205,7 @@ struct kvm { struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP - struct kvm_irq_routing_table *irq_routing; + struct kvm_irq_routing_table __rcu *irq_routing; struct hlist_head mask_notifier_list; struct hlist_head irq_ack_notifier_list; #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ee7e258627f..cb57d657ce4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -299,7 +299,7 @@ struct mm_struct { * new_owner->mm == mm * new_owner->alloc_lock is held */ - struct task_struct *owner; + struct task_struct __rcu *owner; #endif #ifdef CONFIG_PROC_FS diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 508f8cf6da3..d0edf7d823a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -185,7 +185,7 @@ struct nfs_inode { struct nfs4_cached_acl *nfs4_acl; /* NFSv4 state */ struct list_head open_states; - struct nfs_delegation *delegation; + struct nfs_delegation __rcu *delegation; fmode_t delegation_state; struct rw_semaphore rwsem; #endif /* CONFIG_NFS_V4*/ diff --git a/include/linux/notifier.h b/include/linux/notifier.h index b2f1a4d8355..2026f9e1ceb 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -49,28 +49,28 @@ struct notifier_block { int (*notifier_call)(struct notifier_block *, unsigned long, void *); - struct notifier_block *next; + struct notifier_block __rcu *next; int priority; }; struct atomic_notifier_head { spinlock_t lock; - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct blocking_notifier_head { struct rw_semaphore rwsem; - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct raw_notifier_head { - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct srcu_notifier_head { struct mutex mutex; struct srcu_struct srcu; - struct notifier_block *head; + struct notifier_block __rcu *head; }; #define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 634b8e674ac..a39cbed9ee1 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -47,6 +47,8 @@ static inline void *radix_tree_indirect_to_ptr(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); } +#define radix_tree_indirect_to_ptr(ptr) \ + radix_tree_indirect_to_ptr((void __force *)(ptr)) static inline int radix_tree_is_indirect_ptr(void *ptr) { @@ -61,7 +63,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) struct radix_tree_root { unsigned int height; gfp_t gfp_mask; - struct radix_tree_node *rnode; + struct radix_tree_node __rcu *rnode; }; #define RADIX_TREE_INIT(mask) { \ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4ec3b38ce9c..f31ef61f1c6 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -10,6 +10,21 @@ #include <linux/rcupdate.h> /* + * Why is there no list_empty_rcu()? Because list_empty() serves this + * purpose. The list_empty() function fetches the RCU-protected pointer + * and compares it to the address of the list head, but neither dereferences + * this pointer itself nor provides this pointer to the caller. Therefore, + * it is not necessary to use rcu_dereference(), so that list_empty() can + * be used anywhere you would want to use a list_empty_rcu(). + */ + +/* + * return the ->next po |