[i-g-t] tests/gem_shrink: Exercise OOM and other routes to shrinking in reasonable time

Submitted by Tvrtko Ursulin on Jan. 4, 2019, 3:37 p.m.

Details

Message ID 20190104153709.19350-1-tvrtko.ursulin@linux.intel.com
State New
Series "tests/gem_shrink: Exercise OOM and other routes to shrinking in reasonable time"
Headers show

Commit Message

Tvrtko Ursulin Jan. 4, 2019, 3:37 p.m.
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

A set of subtests which exercises different paths to our shrinker code
(including the OOM killer) in predictable and reasonable time budget.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 lib/igt_core.c                        |  19 ++
 lib/igt_core.h                        |   1 +
 tests/i915/gem_shrink.c               | 399 ++++++++++++++++++++++++++
 tests/intel-ci/blacklist.txt          |   1 +
 tests/intel-ci/fast-feedback.testlist |   3 +
 5 files changed, 423 insertions(+)

Patch hide | download patch | download mbox

diff --git a/lib/igt_core.c b/lib/igt_core.c
index 50d6008f6c82..351da0b4e020 100644
--- a/lib/igt_core.c
+++ b/lib/igt_core.c
@@ -1685,6 +1685,25 @@  void igt_stop_helper(struct igt_helper_process *proc)
 	assert(helper_was_alive(proc, status));
 }
 
+/**
+ * igt_try_stop_helper:
+ * @proc: #igt_helper_process structure
+ *
+ * Terminates a helper process if it is still running and returns true, or false
+ * if the process wasn't running.
+ */
+bool igt_try_stop_helper(struct igt_helper_process *proc)
+{
+	int status;
+
+	/* failure here means the pid is already dead and so waiting is safe */
+	kill(proc->pid, proc->use_SIGKILL ? SIGKILL : SIGTERM);
+
+	status = igt_wait_helper(proc);
+
+	return helper_was_alive(proc, status);
+}
+
 static void children_exit_handler(int sig)
 {
 	int status;
diff --git a/lib/igt_core.h b/lib/igt_core.h
index 6f8c3852a686..ed5ceebf1205 100644
--- a/lib/igt_core.h
+++ b/lib/igt_core.h
@@ -795,6 +795,7 @@  bool __igt_fork_helper(struct igt_helper_process *proc);
 	for (; __igt_fork_helper(proc); exit(0))
 int igt_wait_helper(struct igt_helper_process *proc);
 void igt_stop_helper(struct igt_helper_process *proc);
+bool igt_try_stop_helper(struct igt_helper_process *proc);
 
 /* exit handler code */
 
diff --git a/tests/i915/gem_shrink.c b/tests/i915/gem_shrink.c
index c8e05814ee70..7c002de0ef1f 100644
--- a/tests/i915/gem_shrink.c
+++ b/tests/i915/gem_shrink.c
@@ -26,6 +26,10 @@ 
  *
  * Exercise the shrinker by overallocating GEM objects
  */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
 
 #include "igt.h"
 #include "igt_gt.h"
@@ -366,6 +370,376 @@  static void reclaim(unsigned engine, int timeout)
 	close(fd);
 }
 
+static unsigned long get_meminfo(const char *info, const char *tag)
+{
+	const char *str;
+	unsigned long val;
+
+	str = strstr(info, tag);
+	if (str && sscanf(str + strlen(tag), " %lu", &val) == 1)
+		return val >> 10;
+
+	igt_warn("Unrecognised /proc/meminfo field: '%s'\n", tag);
+	return 0;
+}
+
+static unsigned long get_avail_ram_mb(void)
+{
+	int fd;
+	int ret;
+	char buf[4096];
+	unsigned long ram;
+
+	fd = open("/proc/meminfo", O_RDONLY);
+	igt_assert_fd(fd);
+
+	ret = read(fd, buf, sizeof(buf));
+	igt_assert(ret >= 0);
+
+	close(fd);
+
+	ram = get_meminfo(buf, "MemAvailable:");
+	ram += get_meminfo(buf, "Buffers:");
+	ram += get_meminfo(buf, "Cached:");
+	ram += get_meminfo(buf, "SwapCached:");
+
+	return ram;
+}
+
+struct test {
+#define TEST_BO		(1)
+#define TEST_USERPTR	(2)
+	unsigned int flags;
+	int fd;
+};
+
+static uint32_t __get_pages(int fd, unsigned long alloc)
+{
+	uint32_t handle = gem_create(fd, alloc);
+
+	gem_set_domain(fd, handle, I915_GEM_DOMAIN_GTT, 0);
+	gem_madvise(fd, handle, I915_MADV_DONTNEED);
+
+	return handle;
+}
+
+struct test_obj {
+	void *ptr;
+	uint32_t handle;
+};
+
+#define PAGE_SIZE 4096
+static void
+__get_userptr(int fd, struct test_obj *obj, unsigned long sz)
+{
+	struct local_i915_gem_userptr userptr = { };
+	void *ptr;
+
+	igt_assert_eq(sz & 4095, 0);
+
+	ptr = mmap(NULL, sz, PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	assert(ptr != MAP_FAILED);
+
+	for (size_t page = 0; page < sz; page += PAGE_SIZE)
+		*(volatile uint32_t *)((unsigned char *)ptr + page) = 0;
+
+	userptr.user_size = sz;
+	userptr.user_ptr = to_user_pointer(ptr);
+	do_ioctl(fd, LOCAL_IOCTL_I915_GEM_USERPTR, &userptr);
+
+	gem_set_domain(fd, userptr.handle, I915_GEM_DOMAIN_GTT, 0);
+	gem_madvise(fd, userptr.handle, I915_MADV_DONTNEED);
+
+	obj->ptr = ptr;
+	obj->handle = userptr.handle;
+}
+
+/*
+ * Use a specific way of using up memory until we are below a certain threshold.
+ */
+static void *mempressure(void *arg)
+{
+	const unsigned int free_threshold_mb = 256;
+	struct test_obj *list = NULL;
+	struct test *test = arg;
+	const unsigned int sz_mb = 2;
+	const unsigned int sz = sz_mb << 20;
+	unsigned int n = 0, max = 0;
+	unsigned int blocks;
+
+	for (;;) {
+		unsigned long ram_mb = get_avail_ram_mb();
+
+		if (!list) {
+			/*
+			 * On first pass estimate how many sz_mb sized blocks
+			 * we could need to use up all RAM.
+			 */
+			blocks = ram_mb / sz_mb;
+			list = calloc(blocks, sizeof(*list));
+			igt_assert(list);
+		} else if (ram_mb < free_threshold_mb) {
+			/* Limit the working set once under the threshold. */
+			blocks = max + 1;
+		}
+
+		/* Free the oldest block once the working set wrapped. */
+		if (list[n].ptr || list[n].handle) {
+			if (test->flags & TEST_USERPTR) {
+				munmap(list[n].ptr, sz);
+				gem_close(test->fd, list[n].handle);
+			} else if (test->flags & TEST_BO) {
+				gem_close(test->fd, list[n].handle);
+			} else {
+				munmap(list[n].ptr, sz);
+			}
+		}
+
+		/*
+		 * Allocate memory blocks and record the current working set
+		 * size.
+		 */
+		if (test->flags & TEST_BO) {
+			list[n].handle = __get_pages(test->fd, sz);
+		} else if (test->flags & TEST_USERPTR) {
+			__get_userptr(test->fd, &list[n], sz);
+		} else {
+			list[n].ptr = mmap(NULL, sz, PROT_WRITE,
+					   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+			assert(list[n].ptr != MAP_FAILED);
+
+			madvise(list[n].ptr, sz, MADV_HUGEPAGE);
+
+			for (size_t page = 0; page < sz; page += PAGE_SIZE)
+				*(volatile uint32_t *)((unsigned char *)list[n].ptr + page) = 0;
+		}
+
+		if (n > max)
+			max = n;
+
+		n++;
+
+		/*
+		 * Start freeing the oldest and reallocating once maximum
+		 * working set size has been reached.
+		 */
+		if (n >= blocks)
+			n = 0;
+	}
+
+	return NULL;
+}
+
+static void oom_adjust(const char *score)
+{
+	int fd;
+
+	fd = open("/proc/self/oom_score_adj", O_WRONLY);
+	igt_assert_fd(fd);
+	igt_assert(write(fd, score, sizeof(score)) == sizeof(score));
+	close(fd);
+}
+
+static void trigger_oom(void)
+{
+	const char *cmd = "f";
+	int fd;
+
+	fd = open("/proc/sysrq-trigger", O_WRONLY);
+	igt_assert_fd(fd);
+	igt_assert(write(fd, cmd, sizeof(cmd)) == sizeof(cmd));
+	close(fd);
+}
+
+static bool has_sysrq_trigger(void)
+{
+	int fd;
+
+	fd = open("/proc/sysrq-trigger", O_WRONLY);
+	close(fd);
+
+	return fd >= 0;
+}
+
+/*
+ * Exercise different paths to our shrinker code, including OOM, in predictable
+ * and reasonable time budget.
+ */
+static void reclaim_oom(unsigned int flags, unsigned int passes)
+{
+	unsigned int count = 0;
+
+	oom_adjust("-1000");
+
+	do {
+		struct igt_helper_process gem_child = { .use_SIGKILL = true };
+		struct igt_helper_process mem_child = { .use_SIGKILL = true };
+		struct igt_helper_process eb_child = { .use_SIGKILL = true };
+		struct igt_helper_process drop_child = { .use_SIGKILL = true };
+
+		igt_debug("Iteration %u...\n", ++count);
+
+		/*
+		 * Apply either anon backed or shmem backed memory pressure
+		 * to the amount to use up almost all available RAM.
+		 */
+		igt_fork_helper(&mem_child) {
+			struct test test = { };
+
+			if ((flags & (TEST_BO | TEST_USERPTR)) ==
+			    (TEST_BO | TEST_USERPTR))
+				test.flags = TEST_BO;
+
+			/* Sacrifice the memory hog if it comes to that. */
+			oom_adjust("500");
+
+			if (test.flags == TEST_BO) {
+				test.fd = drm_open_driver_render(DRIVER_INTEL);
+				igt_require_gem(test.fd);
+			}
+
+			mempressure(&test);
+
+			if (test.flags == TEST_BO)
+				close(test.fd);
+		}
+
+		/*
+		 * Apply either userptr backed or shmem backed memory pressure
+		 * to the amount to use up almost all available RAM.
+		 */
+		igt_fork_helper(&gem_child) {
+			struct test test = { .flags = flags };
+
+			if ((flags & (TEST_BO | TEST_USERPTR)) ==
+			    (TEST_BO | TEST_USERPTR))
+				test.flags = TEST_USERPTR;
+
+			/* Sacrifice the memory hog if it comes to that. */
+			oom_adjust("500");
+
+			test.fd = drm_open_driver_render(DRIVER_INTEL);
+			igt_require_gem(test.fd);
+
+			mempressure(&test);
+
+			close(test.fd);
+		}
+
+		/*
+		 * Apply an execbuf load to exercise the request allocation and
+		 * direct reclaim from this path.
+		 *
+		 * Occasionaly sync with execution and pause for a little bit to
+		 * avoid hogging to much from this client.
+		 */
+		igt_fork_helper(&eb_child) {
+			struct test test = { .flags = flags };
+			const uint32_t bbe = MI_BATCH_BUFFER_END;
+			struct drm_i915_gem_exec_object2 obj = { };
+			struct drm_i915_gem_execbuffer2 execbuf = { };
+
+			execbuf.buffers_ptr = to_user_pointer(&obj);
+			execbuf.buffer_count = 1;
+
+			test.fd = drm_open_driver_render(DRIVER_INTEL);
+			igt_require_gem(test.fd);
+
+			for (;;) {
+				unsigned long eb = 0;
+				struct timespec ts = { };
+				unsigned long start;
+
+				igt_nsec_elapsed(&ts);
+				start = igt_nsec_elapsed(&ts) / 1e6;
+
+				for (;;) {
+					unsigned long now;
+
+					obj.handle = gem_create(test.fd, 4096);
+					gem_write(test.fd, obj.handle, 0, &bbe,
+						sizeof(bbe));
+					gem_execbuf(test.fd, &execbuf);
+					eb++;
+					now = igt_nsec_elapsed(&ts) / 1e6;
+					if (now > (start + 1000)) {
+						gem_sync(test.fd, obj.handle);
+						if (now > (start + 2000)) {
+							gem_close(test.fd,
+								  obj.handle);
+							break;
+						}
+					}
+					gem_close(test.fd, obj.handle);
+				}
+
+				igt_debug("%lu execbufs\n", eb);
+				usleep(500e3);
+			}
+
+			close(test.fd);
+		}
+
+		/*
+		 * Manually drop cached with the DROP_ACTIVE flag set every now
+		 * and then in order to exercise this path as well.
+		 */
+		igt_fork_helper(&drop_child) {
+			int fd;
+
+			fd = drm_open_driver(DRIVER_INTEL);
+			igt_require_gem(fd);
+
+			for (;;) {
+				usleep(334e3);
+				igt_debug("Dropping caches...\n");
+				igt_drop_caches_set(fd, DROP_ACTIVE);
+			}
+
+			close(fd);
+		}
+
+		/*
+		 * When memory pressure clients have managed to use up all
+		 * available RAM, let them run for a brief moment yet and then
+		 * manually trigger the OOM condition.
+		 */
+		for (unsigned long ram_mb = 0;
+		     (ram_mb = get_avail_ram_mb()) > 512;) {
+			int status;
+			pid_t pid;
+
+			igt_debug("[%u] %lu MiB free\n", count, ram_mb);
+
+			pid = waitpid(mem_child.pid, &status, WNOHANG);
+			if (pid)
+				break;
+
+			pid = waitpid(gem_child.pid, &status, WNOHANG);
+			if (pid)
+				break;
+
+			pid = waitpid(eb_child.pid, &status, WNOHANG);
+			igt_assert_eq(pid, 0);
+
+			pid = waitpid(drop_child.pid, &status, WNOHANG);
+			igt_assert_eq(pid, 0);
+
+			sleep(1);
+		}
+
+		igt_debug("Triggering OOM...\n");
+		trigger_oom();
+
+		sleep(1);
+
+		igt_try_stop_helper(&mem_child);
+		igt_try_stop_helper(&gem_child);
+		igt_stop_helper(&eb_child);
+		igt_stop_helper(&drop_child);
+	} while (count < passes);
+}
+
 igt_main
 {
 	const struct test {
@@ -432,6 +806,31 @@  igt_main
 	igt_subtest("reclaim")
 		reclaim(I915_EXEC_DEFAULT, 2);
 
+	igt_subtest_group {
+		struct {
+			const char *name;
+			unsigned int flags;
+			unsigned int passes;
+		} *p, passes[] = {
+			{ "reclaims-and-oom-basic", TEST_BO, 1 },
+			{ "reclaims-and-oom", TEST_BO, 3 },
+			{ "reclaims-and-oom-userptr-basic", TEST_USERPTR, 1 },
+			{ "reclaims-and-oom-userptr", TEST_USERPTR, 3 },
+			{ "reclaims-and-oom-both-basic", TEST_BO | TEST_USERPTR, 1 },
+			{ "reclaims-and-oom-both", TEST_BO | TEST_USERPTR, 3 },
+			{ NULL, 0, 0 },
+		};
+
+		igt_fixture {
+			igt_require(has_sysrq_trigger());
+		}
+
+		for (p = passes; p->name; p++) {
+			igt_subtest(p->name)
+				reclaim_oom(p->flags, p->passes);
+		}
+	}
+
 	for(const struct test *t = tests; t->name; t++) {
 		for(const struct mode *m = modes; m->suffix; m++) {
 			igt_subtest_f("%s%s", t->name, m->suffix)
diff --git a/tests/intel-ci/blacklist.txt b/tests/intel-ci/blacklist.txt
index 73d127603d28..d76a4b3b1c71 100644
--- a/tests/intel-ci/blacklist.txt
+++ b/tests/intel-ci/blacklist.txt
@@ -60,6 +60,7 @@  igt@gem_ring_sync_copy(@.*)?
 igt@gem_ring_sync_loop(@.*)?
 igt@gem_seqno_wrap(@.*)?
 igt@gem_shrink@(?!reclaim$).*
+igt@gem_shrink@(?!reclaims-and-oom).*
 igt@gem_softpin@.*(hang|S4).*
 igt@gem_spin_batch(@.*)?
 igt@gem_stolen@.*hibernate.*
diff --git a/tests/intel-ci/fast-feedback.testlist b/tests/intel-ci/fast-feedback.testlist
index 6d42792c67f7..0df7cc2fd9fd 100644
--- a/tests/intel-ci/fast-feedback.testlist
+++ b/tests/intel-ci/fast-feedback.testlist
@@ -124,6 +124,9 @@  igt@gem_ringfill@basic-default
 igt@gem_ringfill@basic-default-interruptible
 igt@gem_ringfill@basic-default-forked
 igt@gem_ringfill@basic-default-fd
+igt@gem_shrink@reclaims-and-oom-basic
+igt@gem_shrink@reclaims-and-oom-userptr-basic
+igt@gem_shrink@reclaims-and-oom-both-basic
 igt@gem_sync@basic-all
 igt@gem_sync@basic-each
 igt@gem_sync@basic-many-each

Comments

Petri Latvala Jan. 7, 2019, 11:01 a.m.
On Fri, Jan 04, 2019 at 03:37:09PM +0000, Tvrtko Ursulin wrote:
> diff --git a/tests/intel-ci/blacklist.txt b/tests/intel-ci/blacklist.txt
> index 73d127603d28..d76a4b3b1c71 100644
> --- a/tests/intel-ci/blacklist.txt
> +++ b/tests/intel-ci/blacklist.txt
> @@ -60,6 +60,7 @@ igt@gem_ring_sync_copy(@.*)?
>  igt@gem_ring_sync_loop(@.*)?
>  igt@gem_seqno_wrap(@.*)?
>  igt@gem_shrink@(?!reclaim$).*
> +igt@gem_shrink@(?!reclaims-and-oom).*
>  igt@gem_softpin@.*(hang|S4).*
>  igt@gem_spin_batch(@.*)?
>  igt@gem_stolen@.*hibernate.*


In case you didn't notice: The first gem_shrink line removes the
reclaims-and-oom* subtests, and the second line (the one you added)
then removes the 'reclaim' subtest, resulting in all gem_shrink
subtests blacklisted.
Tvrtko Ursulin Jan. 7, 2019, 11:12 a.m.
On 07/01/2019 11:01, Petri Latvala wrote:
> On Fri, Jan 04, 2019 at 03:37:09PM +0000, Tvrtko Ursulin wrote:
>> diff --git a/tests/intel-ci/blacklist.txt b/tests/intel-ci/blacklist.txt
>> index 73d127603d28..d76a4b3b1c71 100644
>> --- a/tests/intel-ci/blacklist.txt
>> +++ b/tests/intel-ci/blacklist.txt
>> @@ -60,6 +60,7 @@ igt@gem_ring_sync_copy(@.*)?
>>   igt@gem_ring_sync_loop(@.*)?
>>   igt@gem_seqno_wrap(@.*)?
>>   igt@gem_shrink@(?!reclaim$).*
>> +igt@gem_shrink@(?!reclaims-and-oom).*
>>   igt@gem_softpin@.*(hang|S4).*
>>   igt@gem_spin_batch(@.*)?
>>   igt@gem_stolen@.*hibernate.*
> 
> 
> In case you didn't notice: The first gem_shrink line removes the
> reclaims-and-oom* subtests, and the second line (the one you added)
> then removes the 'reclaim' subtest, resulting in all gem_shrink
> subtests blacklisted.

No I haven't thank you. Interestingly the reclaim subtest still did run. 
Okay, but in essence should this work then:

   igt@gem_shrink@(?!reclaim).*

To blacklist everything apart from reclaims and reclaims-and-oom.* subtests?

Regards,

Tvrtko
Petri Latvala Jan. 7, 2019, 11:28 a.m.
On Mon, Jan 07, 2019 at 11:12:30AM +0000, Tvrtko Ursulin wrote:
> 
> On 07/01/2019 11:01, Petri Latvala wrote:
> > On Fri, Jan 04, 2019 at 03:37:09PM +0000, Tvrtko Ursulin wrote:
> > > diff --git a/tests/intel-ci/blacklist.txt b/tests/intel-ci/blacklist.txt
> > > index 73d127603d28..d76a4b3b1c71 100644
> > > --- a/tests/intel-ci/blacklist.txt
> > > +++ b/tests/intel-ci/blacklist.txt
> > > @@ -60,6 +60,7 @@ igt@gem_ring_sync_copy(@.*)?
> > >   igt@gem_ring_sync_loop(@.*)?
> > >   igt@gem_seqno_wrap(@.*)?
> > >   igt@gem_shrink@(?!reclaim$).*
> > > +igt@gem_shrink@(?!reclaims-and-oom).*
> > >   igt@gem_softpin@.*(hang|S4).*
> > >   igt@gem_spin_batch(@.*)?
> > >   igt@gem_stolen@.*hibernate.*
> > 
> > 
> > In case you didn't notice: The first gem_shrink line removes the
> > reclaims-and-oom* subtests, and the second line (the one you added)
> > then removes the 'reclaim' subtest, resulting in all gem_shrink
> > subtests blacklisted.
> 
> No I haven't thank you. Interestingly the reclaim subtest still did run.

https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_2185/shards-all.html
it's NOTRUN on all machines on patched IGT.

> Okay, but in essence should this work then:
> 
>   igt@gem_shrink@(?!reclaim).*
> 
> To blacklist everything apart from reclaims and reclaims-and-oom.*
> subtests?

Yeah that looks correct.
Michał Winiarski Jan. 7, 2019, 12:27 p.m.
On Fri, Jan 04, 2019 at 03:37:09PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> A set of subtests which exercises different paths to our shrinker code
> (including the OOM killer) in predictable and reasonable time budget.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  lib/igt_core.c                        |  19 ++
>  lib/igt_core.h                        |   1 +
>  tests/i915/gem_shrink.c               | 399 ++++++++++++++++++++++++++
>  tests/intel-ci/blacklist.txt          |   1 +
>  tests/intel-ci/fast-feedback.testlist |   3 +
>  5 files changed, 423 insertions(+)

[snip]

> diff --git a/tests/i915/gem_shrink.c b/tests/i915/gem_shrink.c
> index c8e05814ee70..7c002de0ef1f 100644
> --- a/tests/i915/gem_shrink.c
> +++ b/tests/i915/gem_shrink.c
> @@ -26,6 +26,10 @@
>   *
>   * Exercise the shrinker by overallocating GEM objects
>   */
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <sys/wait.h>
> +#include <fcntl.h>
>  
>  #include "igt.h"
>  #include "igt_gt.h"
> @@ -366,6 +370,376 @@ static void reclaim(unsigned engine, int timeout)
>  	close(fd);
>  }
>  
> +static unsigned long get_meminfo(const char *info, const char *tag)
> +{
> +	const char *str;
> +	unsigned long val;
> +
> +	str = strstr(info, tag);
> +	if (str && sscanf(str + strlen(tag), " %lu", &val) == 1)
> +		return val >> 10;
> +
> +	igt_warn("Unrecognised /proc/meminfo field: '%s'\n", tag);
> +	return 0;
> +}
> +
> +static unsigned long get_avail_ram_mb(void)
> +{
> +	int fd;
> +	int ret;
> +	char buf[4096];
> +	unsigned long ram;
> +
> +	fd = open("/proc/meminfo", O_RDONLY);
> +	igt_assert_fd(fd);
> +
> +	ret = read(fd, buf, sizeof(buf));
> +	igt_assert(ret >= 0);
> +
> +	close(fd);
> +
> +	ram = get_meminfo(buf, "MemAvailable:");
> +	ram += get_meminfo(buf, "Buffers:");
> +	ram += get_meminfo(buf, "Cached:");
> +	ram += get_meminfo(buf, "SwapCached:");
> +
> +	return ram;
> +}

What's wrong with ones from intel_os.c?

-Michał
Chris Wilson Jan. 7, 2019, 12:31 p.m.
Quoting Michał Winiarski (2019-01-07 12:27:07)
> On Fri, Jan 04, 2019 at 03:37:09PM +0000, Tvrtko Ursulin wrote:
> > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > 
> > A set of subtests which exercises different paths to our shrinker code
> > (including the OOM killer) in predictable and reasonable time budget.
> > 
> > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >  lib/igt_core.c                        |  19 ++
> >  lib/igt_core.h                        |   1 +
> >  tests/i915/gem_shrink.c               | 399 ++++++++++++++++++++++++++
> >  tests/intel-ci/blacklist.txt          |   1 +
> >  tests/intel-ci/fast-feedback.testlist |   3 +
> >  5 files changed, 423 insertions(+)
> 
> [snip]
> 
> > diff --git a/tests/i915/gem_shrink.c b/tests/i915/gem_shrink.c
> > index c8e05814ee70..7c002de0ef1f 100644
> > --- a/tests/i915/gem_shrink.c
> > +++ b/tests/i915/gem_shrink.c
> > @@ -26,6 +26,10 @@
> >   *
> >   * Exercise the shrinker by overallocating GEM objects
> >   */
> > +#include <sys/types.h>
> > +#include <sys/stat.h>
> > +#include <sys/wait.h>
> > +#include <fcntl.h>
> >  
> >  #include "igt.h"
> >  #include "igt_gt.h"
> > @@ -366,6 +370,376 @@ static void reclaim(unsigned engine, int timeout)
> >       close(fd);
> >  }
> >  
> > +static unsigned long get_meminfo(const char *info, const char *tag)
> > +{
> > +     const char *str;
> > +     unsigned long val;
> > +
> > +     str = strstr(info, tag);
> > +     if (str && sscanf(str + strlen(tag), " %lu", &val) == 1)
> > +             return val >> 10;
> > +
> > +     igt_warn("Unrecognised /proc/meminfo field: '%s'\n", tag);
> > +     return 0;
> > +}
> > +
> > +static unsigned long get_avail_ram_mb(void)
> > +{
> > +     int fd;
> > +     int ret;
> > +     char buf[4096];
> > +     unsigned long ram;
> > +
> > +     fd = open("/proc/meminfo", O_RDONLY);
> > +     igt_assert_fd(fd);
> > +
> > +     ret = read(fd, buf, sizeof(buf));
> > +     igt_assert(ret >= 0);
> > +
> > +     close(fd);
> > +
> > +     ram = get_meminfo(buf, "MemAvailable:");
> > +     ram += get_meminfo(buf, "Buffers:");
> > +     ram += get_meminfo(buf, "Cached:");
> > +     ram += get_meminfo(buf, "SwapCached:");
> > +
> > +     return ram;
> > +}
> 
> What's wrong with ones from intel_os.c?

They pull in both an i915 and mm purge, which iirc, had to be avoided
here.
-Chris
Tvrtko Ursulin Jan. 7, 2019, 12:52 p.m.
On 07/01/2019 12:31, Chris Wilson wrote:
> Quoting Michał Winiarski (2019-01-07 12:27:07)
>> On Fri, Jan 04, 2019 at 03:37:09PM +0000, Tvrtko Ursulin wrote:
>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>
>>> A set of subtests which exercises different paths to our shrinker code
>>> (including the OOM killer) in predictable and reasonable time budget.
>>>
>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>   lib/igt_core.c                        |  19 ++
>>>   lib/igt_core.h                        |   1 +
>>>   tests/i915/gem_shrink.c               | 399 ++++++++++++++++++++++++++
>>>   tests/intel-ci/blacklist.txt          |   1 +
>>>   tests/intel-ci/fast-feedback.testlist |   3 +
>>>   5 files changed, 423 insertions(+)
>>
>> [snip]
>>
>>> diff --git a/tests/i915/gem_shrink.c b/tests/i915/gem_shrink.c
>>> index c8e05814ee70..7c002de0ef1f 100644
>>> --- a/tests/i915/gem_shrink.c
>>> +++ b/tests/i915/gem_shrink.c
>>> @@ -26,6 +26,10 @@
>>>    *
>>>    * Exercise the shrinker by overallocating GEM objects
>>>    */
>>> +#include <sys/types.h>
>>> +#include <sys/stat.h>
>>> +#include <sys/wait.h>
>>> +#include <fcntl.h>
>>>   
>>>   #include "igt.h"
>>>   #include "igt_gt.h"
>>> @@ -366,6 +370,376 @@ static void reclaim(unsigned engine, int timeout)
>>>        close(fd);
>>>   }
>>>   
>>> +static unsigned long get_meminfo(const char *info, const char *tag)
>>> +{
>>> +     const char *str;
>>> +     unsigned long val;
>>> +
>>> +     str = strstr(info, tag);
>>> +     if (str && sscanf(str + strlen(tag), " %lu", &val) == 1)
>>> +             return val >> 10;
>>> +
>>> +     igt_warn("Unrecognised /proc/meminfo field: '%s'\n", tag);
>>> +     return 0;
>>> +}
>>> +
>>> +static unsigned long get_avail_ram_mb(void)
>>> +{
>>> +     int fd;
>>> +     int ret;
>>> +     char buf[4096];
>>> +     unsigned long ram;
>>> +
>>> +     fd = open("/proc/meminfo", O_RDONLY);
>>> +     igt_assert_fd(fd);
>>> +
>>> +     ret = read(fd, buf, sizeof(buf));
>>> +     igt_assert(ret >= 0);
>>> +
>>> +     close(fd);
>>> +
>>> +     ram = get_meminfo(buf, "MemAvailable:");
>>> +     ram += get_meminfo(buf, "Buffers:");
>>> +     ram += get_meminfo(buf, "Cached:");
>>> +     ram += get_meminfo(buf, "SwapCached:");
>>> +
>>> +     return ram;
>>> +}
>>
>> What's wrong with ones from intel_os.c?
> 
> They pull in both an i915 and mm purge, which iirc, had to be avoided
> here.

Yep. I can sense a suggestion of adding a lighter weight version to the 
library now.. :)

Regards,

Tvrtko
Chris Wilson Jan. 7, 2019, 12:56 p.m.
Quoting Tvrtko Ursulin (2019-01-07 12:52:28)
> 
> On 07/01/2019 12:31, Chris Wilson wrote:
> > Quoting Michał Winiarski (2019-01-07 12:27:07)
> >> On Fri, Jan 04, 2019 at 03:37:09PM +0000, Tvrtko Ursulin wrote:
> >>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>>
> >>> A set of subtests which exercises different paths to our shrinker code
> >>> (including the OOM killer) in predictable and reasonable time budget.
> >>>
> >>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>> ---
> >>>   lib/igt_core.c                        |  19 ++
> >>>   lib/igt_core.h                        |   1 +
> >>>   tests/i915/gem_shrink.c               | 399 ++++++++++++++++++++++++++
> >>>   tests/intel-ci/blacklist.txt          |   1 +
> >>>   tests/intel-ci/fast-feedback.testlist |   3 +
> >>>   5 files changed, 423 insertions(+)
> >>
> >> [snip]
> >>
> >>> diff --git a/tests/i915/gem_shrink.c b/tests/i915/gem_shrink.c
> >>> index c8e05814ee70..7c002de0ef1f 100644
> >>> --- a/tests/i915/gem_shrink.c
> >>> +++ b/tests/i915/gem_shrink.c
> >>> @@ -26,6 +26,10 @@
> >>>    *
> >>>    * Exercise the shrinker by overallocating GEM objects
> >>>    */
> >>> +#include <sys/types.h>
> >>> +#include <sys/stat.h>
> >>> +#include <sys/wait.h>
> >>> +#include <fcntl.h>
> >>>   
> >>>   #include "igt.h"
> >>>   #include "igt_gt.h"
> >>> @@ -366,6 +370,376 @@ static void reclaim(unsigned engine, int timeout)
> >>>        close(fd);
> >>>   }
> >>>   
> >>> +static unsigned long get_meminfo(const char *info, const char *tag)
> >>> +{
> >>> +     const char *str;
> >>> +     unsigned long val;
> >>> +
> >>> +     str = strstr(info, tag);
> >>> +     if (str && sscanf(str + strlen(tag), " %lu", &val) == 1)
> >>> +             return val >> 10;
> >>> +
> >>> +     igt_warn("Unrecognised /proc/meminfo field: '%s'\n", tag);
> >>> +     return 0;
> >>> +}
> >>> +
> >>> +static unsigned long get_avail_ram_mb(void)
> >>> +{
> >>> +     int fd;
> >>> +     int ret;
> >>> +     char buf[4096];
> >>> +     unsigned long ram;
> >>> +
> >>> +     fd = open("/proc/meminfo", O_RDONLY);
> >>> +     igt_assert_fd(fd);
> >>> +
> >>> +     ret = read(fd, buf, sizeof(buf));
> >>> +     igt_assert(ret >= 0);
> >>> +
> >>> +     close(fd);
> >>> +
> >>> +     ram = get_meminfo(buf, "MemAvailable:");
> >>> +     ram += get_meminfo(buf, "Buffers:");
> >>> +     ram += get_meminfo(buf, "Cached:");
> >>> +     ram += get_meminfo(buf, "SwapCached:");
> >>> +
> >>> +     return ram;
> >>> +}
> >>
> >> What's wrong with ones from intel_os.c?
> > 
> > They pull in both an i915 and mm purge, which iirc, had to be avoided
> > here.
> 
> Yep. I can sense a suggestion of adding a lighter weight version to the 
> library now.. :)

Nah, I refrained because I hope for the same leniency (many times over).
Anyway, the third user has to refactor ;)

To serendipity,
-Chris
Tvrtko Ursulin Jan. 7, 2019, 5:33 p.m.
On 07/01/2019 11:28, Petri Latvala wrote:
> On Mon, Jan 07, 2019 at 11:12:30AM +0000, Tvrtko Ursulin wrote:
>>
>> On 07/01/2019 11:01, Petri Latvala wrote:
>>> On Fri, Jan 04, 2019 at 03:37:09PM +0000, Tvrtko Ursulin wrote:
>>>> diff --git a/tests/intel-ci/blacklist.txt b/tests/intel-ci/blacklist.txt
>>>> index 73d127603d28..d76a4b3b1c71 100644
>>>> --- a/tests/intel-ci/blacklist.txt
>>>> +++ b/tests/intel-ci/blacklist.txt
>>>> @@ -60,6 +60,7 @@ igt@gem_ring_sync_copy(@.*)?
>>>>    igt@gem_ring_sync_loop(@.*)?
>>>>    igt@gem_seqno_wrap(@.*)?
>>>>    igt@gem_shrink@(?!reclaim$).*
>>>> +igt@gem_shrink@(?!reclaims-and-oom).*
>>>>    igt@gem_softpin@.*(hang|S4).*
>>>>    igt@gem_spin_batch(@.*)?
>>>>    igt@gem_stolen@.*hibernate.*
>>>
>>>
>>> In case you didn't notice: The first gem_shrink line removes the
>>> reclaims-and-oom* subtests, and the second line (the one you added)
>>> then removes the 'reclaim' subtest, resulting in all gem_shrink
>>> subtests blacklisted.
>>
>> No I haven't thank you. Interestingly the reclaim subtest still did run.
> 
> https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_2185/shards-all.html
> it's NOTRUN on all machines on patched IGT.

My fail while looking at the pages.. I saw the test mentioned in the 
list but fail to spot that was only because it was run in the baseline. :I

>> Okay, but in essence should this work then:
>>
>>    igt@gem_shrink@(?!reclaim).*
>>
>> To blacklist everything apart from reclaims and reclaims-and-oom.*
>> subtests?
> 
> Yeah that looks correct.

Thanks, I'll send a v2 shortly.

Regards,

Tvrtko
Tvrtko Ursulin Jan. 7, 2019, 6:25 p.m.
On 07/01/2019 18:00, Patchwork wrote:
> == Series Details ==
> 
> Series: tests/gem_shrink: Exercise OOM and other routes to shrinking in reasonable time (rev2)
> URL   : https://patchwork.freedesktop.org/series/54746/
> State : failure
> 
> == Summary ==
> 
> CI Bug Log - changes from CI_DRM_5369 -> IGTPW_2191
> ====================================================
> 
> Summary
> -------
> 
>    **FAILURE**
> 
>    Serious unknown changes coming with IGTPW_2191 absolutely need to be
>    verified manually.
>    
>    If you think the reported changes have nothing to do with the changes
>    introduced in IGTPW_2191, please notify your bug team to allow them
>    to document this new failure mode, which will reduce false positives in CI.
> 
>    External URL: https://patchwork.freedesktop.org/api/1.0/series/54746/revisions/2/mbox/
> 
> Possible new issues
> -------------------
> 
>    Here are the unknown changes that may have been introduced in IGTPW_2191:
> 
> ### IGT changes ###
> 
> #### Possible regressions ####
> 
>    * igt@i915_selftest@live_gtt:
>      - fi-kbl-r:           PASS -> DMESG-WARN

No idea what we are looking at here!? :)

> 
>    
> #### Suppressed ####
> 
>    The following results come from untrusted machines, tests, or statuses.
>    They do not affect the overall result.
> 
>    * {igt@gem_shrink@reclaims-and-oom-both-basic}:
>      - fi-ivb-3520m:       NOTRUN -> DMESG-WARN
>      - fi-snb-2520m:       NOTRUN -> DMESG-WARN
>      - fi-kbl-8809g:       NOTRUN -> DMESG-WARN
>      - fi-glk-j4005:       NOTRUN -> DMESG-WARN
>      - fi-cfl-8700k:       NOTRUN -> DMESG-WARN
>      - fi-icl-u2:          NOTRUN -> DMESG-WARN
>      - fi-kbl-7500u:       NOTRUN -> DMESG-WARN
>      - fi-bdw-gvtdvm:      NOTRUN -> DMESG-WARN
>      - fi-kbl-x1275:       NOTRUN -> DMESG-WARN
>      - fi-kbl-guc:         NOTRUN -> DMESG-WARN
>      - fi-kbl-7567u:       NOTRUN -> DMESG-WARN
>      - fi-icl-u3:          NOTRUN -> DMESG-WARN
>      - fi-skl-guc:         NOTRUN -> DMESG-WARN
>      - fi-bxt-j4205:       NOTRUN -> DMESG-WARN
>      - fi-skl-6700hq:      NOTRUN -> DMESG-WARN
>      - fi-bsw-kefka:       NOTRUN -> DMESG-WARN
>      - fi-ivb-3770:        NOTRUN -> DMESG-WARN
>      - fi-skl-6700k2:      NOTRUN -> DMESG-WARN
>      - fi-hsw-4770:        NOTRUN -> DMESG-WARN
>      - fi-kbl-7560u:       NOTRUN -> DMESG-WARN
>      - fi-skl-6600u:       NOTRUN -> DMESG-WARN
>      - fi-apl-guc:         NOTRUN -> DMESG-WARN
>      - fi-kbl-r:           NOTRUN -> DMESG-WARN
>      - fi-byt-j1900:       NOTRUN -> DMESG-WARN
>      - fi-blb-e6850:       NOTRUN -> DMESG-WARN
>      - fi-cfl-8109u:       NOTRUN -> DMESG-WARN
>      - fi-bdw-5557u:       NOTRUN -> DMESG-WARN
>      - fi-elk-e7500:       NOTRUN -> DMESG-WARN
>      - fi-hsw-4770r:       NOTRUN -> DMESG-WARN
>      - fi-byt-n2820:       NOTRUN -> DMESG-WARN
>      - fi-skl-6260u:       NOTRUN -> DMESG-WARN
>      - fi-skl-iommu:       NOTRUN -> DMESG-WARN
>      - fi-ilk-650:         NOTRUN -> DMESG-WARN
>      - fi-skl-6770hq:      NOTRUN -> DMESG-WARN
>      - fi-whl-u:           NOTRUN -> DMESG-WARN
>      - fi-cfl-guc:         NOTRUN -> DMESG-WARN

Yay, test found something! :)

> 
>    * {igt@gem_shrink@reclaims-and-oom-userptr-basic}:
>      - fi-bwr-2160:        NOTRUN -> FAIL +1

Fixed in v3.

Regards,

Tvrtko

>    
> Known issues
> ------------
> 
>    Here are the changes found in IGTPW_2191 that come from known issues:
> 
> ### IGT changes ###
> 
> #### Issues hit ####
> 
>    * igt@kms_pipe_crc_basic@nonblocking-crc-pipe-a-frame-sequence:
>      - fi-byt-clapper:     PASS -> FAIL [fdo#103191] / [fdo#107362]
> 
>    * igt@prime_vgem@basic-fence-flip:
>      - fi-gdg-551:         PASS -> FAIL [fdo#103182]
> 
>    
> #### Possible fixes ####
> 
>    * igt@gem_mmap_gtt@basic-small-copy:
>      - fi-glk-dsi:         INCOMPLETE [fdo#103359] / [k.org#198133] -> PASS
> 
>    * igt@i915_selftest@live_coherency:
>      - fi-gdg-551:         DMESG-FAIL [fdo#107164] -> PASS
> 
>    * igt@i915_selftest@live_hangcheck:
>      - fi-bwr-2160:        DMESG-FAIL [fdo#108735] -> PASS
> 
>    
>    {name}: This element is suppressed. This means it is ignored when computing
>            the status of the difference (SUCCESS, WARNING, or FAILURE).
> 
>    [fdo#103182]: https://bugs.freedesktop.org/show_bug.cgi?id=103182
>    [fdo#103191]: https://bugs.freedesktop.org/show_bug.cgi?id=103191
>    [fdo#103359]: https://bugs.freedesktop.org/show_bug.cgi?id=103359
>    [fdo#104108]: https://bugs.freedesktop.org/show_bug.cgi?id=104108
>    [fdo#107164]: https://bugs.freedesktop.org/show_bug.cgi?id=107164
>    [fdo#107362]: https://bugs.freedesktop.org/show_bug.cgi?id=107362
>    [fdo#108735]: https://bugs.freedesktop.org/show_bug.cgi?id=108735
>    [fdo#109241]: https://bugs.freedesktop.org/show_bug.cgi?id=109241
>    [k.org#198133]: https://bugzilla.kernel.org/show_bug.cgi?id=198133
> 
> 
> Participating hosts (49 -> 44)
> ------------------------------
> 
>    Additional (1): fi-pnv-d510
>    Missing    (6): fi-kbl-soraka fi-ilk-m540 fi-byt-squawks fi-bsw-cyan fi-icl-y fi-snb-2600
> 
> 
> Build changes
> -------------
> 
>      * IGT: IGT_4756 -> IGTPW_2191
> 
>    CI_DRM_5369: 4d637a8d160356f01d22695ec1a76858bfb55758 @ git://anongit.freedesktop.org/gfx-ci/linux
>    IGTPW_2191: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_2191/
>    IGT_4756: 75081c6bfb9998bd7cbf35a7ac0578c683fe55a8 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
> 
> 
> 
> == Testlist changes ==
> 
> +igt@gem_shrink@reclaims-and-oom
> +igt@gem_shrink@reclaims-and-oom-basic
> +igt@gem_shrink@reclaims-and-oom-both
> +igt@gem_shrink@reclaims-and-oom-both-basic
> +igt@gem_shrink@reclaims-and-oom-userptr
> +igt@gem_shrink@reclaims-and-oom-userptr-basic
> 
> == Logs ==
> 
> For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_2191/
>
Chris Wilson Jan. 7, 2019, 8:53 p.m.
Quoting Tvrtko Ursulin (2019-01-07 18:25:34)
> 
> On 07/01/2019 18:00, Patchwork wrote:
> > 
> >    * {igt@gem_shrink@reclaims-and-oom-both-basic}:
> >      - fi-ivb-3520m:       NOTRUN -> DMESG-WARN
> 
> Yay, test found something! :)

Test requirement not met in function __real_main745, file ../tests/i915/gem_shrink.c:827:
Test requirement: has_sysrq_trigger()

Boo.
-Chris
Chris Wilson Jan. 7, 2019, 9:43 p.m.
Quoting Chris Wilson (2019-01-07 20:53:48)
> Quoting Tvrtko Ursulin (2019-01-07 18:25:34)
> > 
> > On 07/01/2019 18:00, Patchwork wrote:
> > > 
> > >    * {igt@gem_shrink@reclaims-and-oom-both-basic}:
> > >      - fi-ivb-3520m:       NOTRUN -> DMESG-WARN
> > 
> > Yay, test found something! :)
> 
> Test requirement not met in function __real_main745, file ../tests/i915/gem_shrink.c:827:
> Test requirement: has_sysrq_trigger()

Fwiw, with whatever changed in 5.0, the oom_trigger is not required for
the lockdep warning.

(I'm not sure if the oom_trigger adds anything useful; the nasty oom
conditions will be from inside page_alloc after direct reclaim fails.
Indeed, I don't think oom itself does anything that would trigger the
mmu_notifier? Do you know of a path?)
-Chris