[v9,03/39] drivers/base: use a worker for sysfs unbind

Submitted by Ramalingam C on Dec. 13, 2018, 4:01 a.m.

Details

Message ID 1544673701-6353-4-git-send-email-ramalingam.c@intel.com
State New
Headers show
Series "drm/i915: Implement HDCP2.2" ( rev: 12 11 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Ramalingam C Dec. 13, 2018, 4:01 a.m.
From: Daniel Vetter <daniel.vetter@intel.com>

Drivers might want to remove some sysfs files, which needs the same
locks and ends up angering lockdep. Relevant snippet of the stack
trace:

  kernfs_remove_by_name_ns+0x3b/0x80
  bus_remove_driver+0x92/0xa0
  acpi_video_unregister+0x24/0x40
  i915_driver_unload+0x42/0x130 [i915]
  i915_pci_remove+0x19/0x30 [i915]
  pci_device_remove+0x36/0xb0
  device_release_driver_internal+0x185/0x250
  unbind_store+0xaf/0x180
  kernfs_fop_write+0x104/0x190

I've stumbled over this because some new patches by Ram connect the
snd-hda-intel unload (where we do use sysfs unbind) with the locking
chains in the i915 unload code (but without creating a new loop),
which upset our CI. But the bug is already there and can be easily
reproduced by unbind i915 directly.

No idea whether this is the correct place to fix this, should at least
get CI happy again.

Note that the bus locking is already done by device_release_driver ->
device_release_driver_internal, so I dropped that part. Also note that
we don't recheck that the device is still bound by the same driver,
but neither does the current code do that without races. And I figured
that's a obscure enough corner case to not bother.

v2: Use a task work. An entirely async work leads to impressive
fireworks in our CI, notably in the vtcon bind/unbind code. Task work
will be as synchronous as the current code, and so keep all these
preexisting races neatly tugged under the rug.

Cc: Ramalingam C <ramalingam.c@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
---
 drivers/base/bus.c | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 8bfd27ec73d6..095c4a140d76 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -17,6 +17,7 @@ 
 #include <linux/string.h>
 #include <linux/mutex.h>
 #include <linux/sysfs.h>
+#include <linux/task_work.h>
 #include "base.h"
 #include "power/power.h"
 
@@ -174,22 +175,44 @@  static const struct kset_uevent_ops bus_uevent_ops = {
 
 static struct kset *bus_kset;
 
+struct unbind_work {
+	struct callback_head twork;
+	struct device *dev;
+};
+
+void unbind_work_fn(struct callback_head *work)
+{
+	struct unbind_work *unbind_work =
+		container_of(work, struct unbind_work, twork);
+
+	device_release_driver_internal(unbind_work->dev, NULL,
+				       unbind_work->dev->parent);
+	put_device(unbind_work->dev);
+	kfree(unbind_work);
+}
+
 /* Manually detach a device from its associated driver. */
 static ssize_t unbind_store(struct device_driver *drv, const char *buf,
 			    size_t count)
 {
 	struct bus_type *bus = bus_get(drv->bus);
+	struct unbind_work *unbind_work;
 	struct device *dev;
 	int err = -ENODEV;
 
 	dev = bus_find_device_by_name(bus, NULL, buf);
 	if (dev && dev->driver == drv) {
-		if (dev->parent && dev->bus->need_parent_lock)
-			device_lock(dev->parent);
-		device_release_driver(dev);
-		if (dev->parent && dev->bus->need_parent_lock)
-			device_unlock(dev->parent);
-		err = count;
+		unbind_work = kmalloc(sizeof(*unbind_work), GFP_KERNEL);
+		if (unbind_work) {
+			unbind_work->dev = dev;
+			get_device(dev);
+			init_task_work(&unbind_work->twork, unbind_work_fn);
+			task_work_add(current, &unbind_work->twork, true);
+
+			err = count;
+		} else {
+			err = -ENOMEM;
+		}
 	}
 	put_device(dev);
 	bus_put(bus);

Comments

On Thu, Dec 13, 2018 at 09:31:05AM +0530, Ramalingam C wrote:
> From: Daniel Vetter <daniel.vetter@intel.com>
> 
> Drivers might want to remove some sysfs files, which needs the same
> locks and ends up angering lockdep. Relevant snippet of the stack
> trace:
> 
>   kernfs_remove_by_name_ns+0x3b/0x80
>   bus_remove_driver+0x92/0xa0
>   acpi_video_unregister+0x24/0x40
>   i915_driver_unload+0x42/0x130 [i915]
>   i915_pci_remove+0x19/0x30 [i915]
>   pci_device_remove+0x36/0xb0
>   device_release_driver_internal+0x185/0x250
>   unbind_store+0xaf/0x180
>   kernfs_fop_write+0x104/0x190
> 
> I've stumbled over this because some new patches by Ram connect the
> snd-hda-intel unload (where we do use sysfs unbind) with the locking
> chains in the i915 unload code (but without creating a new loop),
> which upset our CI. But the bug is already there and can be easily
> reproduced by unbind i915 directly.
> 
> No idea whether this is the correct place to fix this, should at least
> get CI happy again.
> 
> Note that the bus locking is already done by device_release_driver ->
> device_release_driver_internal, so I dropped that part. Also note that
> we don't recheck that the device is still bound by the same driver,
> but neither does the current code do that without races. And I figured
> that's a obscure enough corner case to not bother.
> 
> v2: Use a task work. An entirely async work leads to impressive
> fireworks in our CI, notably in the vtcon bind/unbind code. Task work
> will be as synchronous as the current code, and so keep all these
> preexisting races neatly tugged under the rug.
> 
> Cc: Ramalingam C <ramalingam.c@intel.com>
> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>

Revised version of this just landed, so we won't need this anymore for
merging. I'll put my patch into topic/core-for-CI, so you can drop this
for the next version.
-Daniel

> ---
>  drivers/base/bus.c | 35 +++++++++++++++++++++++++++++------
>  1 file changed, 29 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/base/bus.c b/drivers/base/bus.c
> index 8bfd27ec73d6..095c4a140d76 100644
> --- a/drivers/base/bus.c
> +++ b/drivers/base/bus.c
> @@ -17,6 +17,7 @@
>  #include <linux/string.h>
>  #include <linux/mutex.h>
>  #include <linux/sysfs.h>
> +#include <linux/task_work.h>
>  #include "base.h"
>  #include "power/power.h"
>  
> @@ -174,22 +175,44 @@ static const struct kset_uevent_ops bus_uevent_ops = {
>  
>  static struct kset *bus_kset;
>  
> +struct unbind_work {
> +	struct callback_head twork;
> +	struct device *dev;
> +};
> +
> +void unbind_work_fn(struct callback_head *work)
> +{
> +	struct unbind_work *unbind_work =
> +		container_of(work, struct unbind_work, twork);
> +
> +	device_release_driver_internal(unbind_work->dev, NULL,
> +				       unbind_work->dev->parent);
> +	put_device(unbind_work->dev);
> +	kfree(unbind_work);
> +}
> +
>  /* Manually detach a device from its associated driver. */
>  static ssize_t unbind_store(struct device_driver *drv, const char *buf,
>  			    size_t count)
>  {
>  	struct bus_type *bus = bus_get(drv->bus);
> +	struct unbind_work *unbind_work;
>  	struct device *dev;
>  	int err = -ENODEV;
>  
>  	dev = bus_find_device_by_name(bus, NULL, buf);
>  	if (dev && dev->driver == drv) {
> -		if (dev->parent && dev->bus->need_parent_lock)
> -			device_lock(dev->parent);
> -		device_release_driver(dev);
> -		if (dev->parent && dev->bus->need_parent_lock)
> -			device_unlock(dev->parent);
> -		err = count;
> +		unbind_work = kmalloc(sizeof(*unbind_work), GFP_KERNEL);
> +		if (unbind_work) {
> +			unbind_work->dev = dev;
> +			get_device(dev);
> +			init_task_work(&unbind_work->twork, unbind_work_fn);
> +			task_work_add(current, &unbind_work->twork, true);
> +
> +			err = count;
> +		} else {
> +			err = -ENOMEM;
> +		}
>  	}
>  	put_device(dev);
>  	bus_put(bus);
> -- 
> 2.7.4
>