[07/13] mm: remove the page_shift member from struct hmm_range

Submitted by Christoph Hellwig on July 30, 2019, 5:51 a.m.

Details

Message ID 20190730055203.28467-8-hch@lst.de
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Christoph Hellwig July 30, 2019, 5:51 a.m.
All users pass PAGE_SIZE here, and if we wanted to support single
entries for huge pages we should really just add a HMM_FAULT_HUGEPAGE
flag instead that uses the huge page size instead of having the
caller calculate that size once, just for the hmm code to verify it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  1 -
 drivers/gpu/drm/nouveau/nouveau_svm.c   |  1 -
 include/linux/hmm.h                     | 22 -------------
 mm/hmm.c                                | 42 ++++++-------------------
 4 files changed, 9 insertions(+), 57 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 71d6e7087b0b..8bf79288c4e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -818,7 +818,6 @@  int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
 				0 : range->flags[HMM_PFN_WRITE];
 	range->pfn_flags_mask = 0;
 	range->pfns = pfns;
-	range->page_shift = PAGE_SHIFT;
 	range->start = start;
 	range->end = start + ttm->num_pages * PAGE_SIZE;
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 40e706234554..e7068ce46949 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -680,7 +680,6 @@  nouveau_svm_fault(struct nvif_notify *notify)
 			 args.i.p.addr + args.i.p.size, fn - fi);
 
 		/* Have HMM fault pages within the fault window to the GPU. */
-		range.page_shift = PAGE_SHIFT;
 		range.start = args.i.p.addr;
 		range.end = args.i.p.addr + args.i.p.size;
 		range.pfns = args.phys;
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index c5b51376b453..51e18fbb8953 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -158,7 +158,6 @@  enum hmm_pfn_value_e {
  * @values: pfn value for some special case (none, special, error, ...)
  * @default_flags: default flags for the range (write, read, ... see hmm doc)
  * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
- * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT)
  * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
  * @valid: pfns array did not change since it has been fill by an HMM function
  */
@@ -172,31 +171,10 @@  struct hmm_range {
 	const uint64_t		*values;
 	uint64_t		default_flags;
 	uint64_t		pfn_flags_mask;
-	uint8_t			page_shift;
 	uint8_t			pfn_shift;
 	bool			valid;
 };
 
-/*
- * hmm_range_page_shift() - return the page shift for the range
- * @range: range being queried
- * Return: page shift (page size = 1 << page shift) for the range
- */
-static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
-{
-	return range->page_shift;
-}
-
-/*
- * hmm_range_page_size() - return the page size for the range
- * @range: range being queried
- * Return: page size for the range in bytes
- */
-static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
-{
-	return 1UL << hmm_range_page_shift(range);
-}
-
 /*
  * hmm_range_wait_until_valid() - wait for range to be valid
  * @range: range affected by invalidation to wait on
diff --git a/mm/hmm.c b/mm/hmm.c
index 926735a3aef9..f26d6abc4ed2 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -344,13 +344,12 @@  static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
 	uint64_t *pfns = range->pfns;
-	unsigned long i, page_size;
+	unsigned long i;
 
 	hmm_vma_walk->last = addr;
-	page_size = hmm_range_page_size(range);
-	i = (addr - range->start) >> range->page_shift;
+	i = (addr - range->start) >> PAGE_SHIFT;
 
-	for (; addr < end; addr += page_size, i++) {
+	for (; addr < end; addr += PAGE_SIZE, i++) {
 		pfns[i] = range->values[HMM_PFN_NONE];
 		if (fault || write_fault) {
 			int ret;
@@ -772,7 +771,7 @@  static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 				      struct mm_walk *walk)
 {
 #ifdef CONFIG_HUGETLB_PAGE
-	unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+	unsigned long addr = start, i, pfn, mask;
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
 	struct vm_area_struct *vma = walk->vma;
@@ -783,24 +782,12 @@  static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	pte_t entry;
 	int ret = 0;
 
-	size = huge_page_size(h);
-	mask = size - 1;
-	if (range->page_shift != PAGE_SHIFT) {
-		/* Make sure we are looking at a full page. */
-		if (start & mask)
-			return -EINVAL;
-		if (end < (start + size))
-			return -EINVAL;
-		pfn_inc = size >> PAGE_SHIFT;
-	} else {
-		pfn_inc = 1;
-		size = PAGE_SIZE;
-	}
+	mask = huge_page_size(h) - 1;
 
 	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
 	entry = huge_ptep_get(pte);
 
-	i = (start - range->start) >> range->page_shift;
+	i = (start - range->start) >> PAGE_SHIFT;
 	orig_pfn = range->pfns[i];
 	range->pfns[i] = range->values[HMM_PFN_NONE];
 	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
@@ -812,8 +799,8 @@  static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 		goto unlock;
 	}
 
-	pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
-	for (; addr < end; addr += size, i++, pfn += pfn_inc)
+	pfn = pte_pfn(entry) + ((start & mask) >> PAGE_SHIFT);
+	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
 		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
 				 cpu_flags;
 	hmm_vma_walk->last = end;
@@ -850,14 +837,13 @@  static void hmm_pfns_clear(struct hmm_range *range,
  */
 int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror)
 {
-	unsigned long mask = ((1UL << range->page_shift) - 1UL);
 	struct hmm *hmm = mirror->hmm;
 	unsigned long flags;
 
 	range->valid = false;
 	range->hmm = NULL;
 
-	if ((range->start & mask) || (range->end & mask))
+	if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1)))
 		return -EINVAL;
 	if (range->start >= range->end)
 		return -EINVAL;
@@ -964,16 +950,6 @@  long hmm_range_fault(struct hmm_range *range, unsigned int flags)
 		if (vma == NULL || (vma->vm_flags & device_vma))
 			return -EFAULT;
 
-		if (is_vm_hugetlb_page(vma)) {
-			if (huge_page_shift(hstate_vma(vma)) !=
-			    range->page_shift &&
-			    range->page_shift != PAGE_SHIFT)
-				return -EINVAL;
-		} else {
-			if (range->page_shift != PAGE_SHIFT)
-				return -EINVAL;
-		}
-
 		if (!(vma->vm_flags & VM_READ)) {
 			/*
 			 * If vma do not allow read access, then assume that it

Comments

On Tue, Jul 30, 2019 at 08:51:57AM +0300, Christoph Hellwig wrote:
> All users pass PAGE_SIZE here, and if we wanted to support single
> entries for huge pages we should really just add a HMM_FAULT_HUGEPAGE
> flag instead that uses the huge page size instead of having the
> caller calculate that size once, just for the hmm code to verify it.

I suspect this was added for the ODP conversion that does use both
page sizes. I think the ODP code for this is kind of broken, but I
haven't delved into that..

The challenge is that the driver needs to know what page size to
configure the hardware before it does any range stuff.

The other challenge is that the HW is configured to do only one page
size, and if the underlying CPU page size changes it goes south.

What I would prefer is if the driver could somehow dynamically adjust
the page size after each dma map, but I don't know if ODP HW can
do that.

Since this is all driving toward making ODP use this maybe we should
keep this API? 

I'm not sure I can lose the crappy huge page support in ODP.

Jason
On Tue, Jul 30, 2019 at 12:55:17PM +0000, Jason Gunthorpe wrote:
> I suspect this was added for the ODP conversion that does use both
> page sizes. I think the ODP code for this is kind of broken, but I
> haven't delved into that..
> 
> The challenge is that the driver needs to know what page size to
> configure the hardware before it does any range stuff.
> 
> The other challenge is that the HW is configured to do only one page
> size, and if the underlying CPU page side changes it goes south.
> 
> What I would prefer is if the driver could somehow dynamically adjust
> the the page size after each dma map, but I don't know if ODP HW can
> do that.
> 
> Since this is all driving toward making ODP use this maybe we should
> keep this API? 
> 
> I'm not sure I can loose the crappy huge page support in ODP.

The problem is that I see no way how to use the current API.  To know
the huge page size you need to have the vma, and the current API
doesn't require a vma to be passed in.

That's why I suggested an api where we pass in a flag that huge pages
are ok into hmm_range_fault, and it then could pass the shift out, and
limits itself to a single vma (which it normally doesn't, that is an
additional complication).  But all this seems really awkward in terms
of an API still.  AFAIK ODP is only used by mlx5, and mlx5 unlike other
IB HCAs can use scatterlist style MRs with variable length per entry,
so even if we pass multiple pages per entry from hmm it could coalesce
them.  The best API for mlx5 would of course be to pass a biovec-style
variable length structure that hmm_fault could fill out, but that would
be a major restructure.
On Tue, Jul 30, 2019 at 03:14:30PM +0200, Christoph Hellwig wrote:
> On Tue, Jul 30, 2019 at 12:55:17PM +0000, Jason Gunthorpe wrote:
> > I suspect this was added for the ODP conversion that does use both
> > page sizes. I think the ODP code for this is kind of broken, but I
> > haven't delved into that..
> > 
> > The challenge is that the driver needs to know what page size to
> > configure the hardware before it does any range stuff.
> > 
> > The other challenge is that the HW is configured to do only one page
> > size, and if the underlying CPU page side changes it goes south.
> > 
> > What I would prefer is if the driver could somehow dynamically adjust
> > the the page size after each dma map, but I don't know if ODP HW can
> > do that.
> > 
> > Since this is all driving toward making ODP use this maybe we should
> > keep this API? 
> > 
> > I'm not sure I can loose the crappy huge page support in ODP.
> 
> The problem is that I see no way how to use the current API.  To know
> the huge page size you need to have the vma, and the current API
> doesn't require a vma to be passed in.

The way ODP seems to work is once in hugetlb mode the dma addresses
must give huge pages or the page fault will be failed. I think that is
a terrible design, but this is how the driver is ..

So, from this HMM perspective if the caller asked for huge pages then
the results have to be all huge pages or a hard failure.

It is not negotiated as an optimization like you are thinking.

[note, I haven't yet checked carefully how this works in ODP, every
 time I look at parts of it the thing seems crazy]

> That's why I suggested an api where we pass in a flag that huge pages
> are ok into hmm_range_fault, and it then could pass the shift out, and
> limits itself to a single vma (which it normally doesn't, that is an
> additional complication).  But all this seems really awkward in terms
> of an API still.  AFAIK ODP is only used by mlx5, and mlx5 unlike other
> IB HCAs can use scatterlist style MRs with variable length per entry,
> so even if we pass multiple pages per entry from hmm it could coalesce
> them.  

When the driver takes faults it has to repair the MR mapping, and
fixing a page in the middle of a variable length SGL would be pretty
complicated. Even so, I don't think the SG_GAPs feature and ODP are
compatible - I'm pretty sure ODP has to be page lists not SGL..

However, what ODP can maybe do is represent a full multi-level page
table, so we could have 2M entries that map to a single DMA or to
another page table w/ 4k pages (have to check on this)

But the driver isn't set up to do that right now.

> The best API for mlx4 would of course be to pass a biovec-style
> variable length structure that hmm_fault could fill out, but that would
> be a major restructure.

It would work, but the driver has to expand that into a page list
right away anyhow.

We can't even dma map the biovec with today's dma API as it needs the
ability to remap on a page granularity.

Jason
On 2019-07-30 1:51 a.m., Christoph Hellwig wrote:
> All users pass PAGE_SIZE here, and if we wanted to support single

> entries for huge pages we should really just add a HMM_FAULT_HUGEPAGE

> flag instead that uses the huge page size instead of having the

> caller calculate that size once, just for the hmm code to verify it.


Maybe this was meant to support device page size != native page size? 
Anyway, looks like we didn't use it that way.

Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>



>

> Signed-off-by: Christoph Hellwig <hch@lst.de>

> ---

>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  1 -

>   drivers/gpu/drm/nouveau/nouveau_svm.c   |  1 -

>   include/linux/hmm.h                     | 22 -------------

>   mm/hmm.c                                | 42 ++++++-------------------

>   4 files changed, 9 insertions(+), 57 deletions(-)

>

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

> index 71d6e7087b0b..8bf79288c4e2 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

> @@ -818,7 +818,6 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)

>   				0 : range->flags[HMM_PFN_WRITE];

>   	range->pfn_flags_mask = 0;

>   	range->pfns = pfns;

> -	range->page_shift = PAGE_SHIFT;

>   	range->start = start;

>   	range->end = start + ttm->num_pages * PAGE_SIZE;

>   

> diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c

> index 40e706234554..e7068ce46949 100644

> --- a/drivers/gpu/drm/nouveau/nouveau_svm.c

> +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c

> @@ -680,7 +680,6 @@ nouveau_svm_fault(struct nvif_notify *notify)

>   			 args.i.p.addr + args.i.p.size, fn - fi);

>   

>   		/* Have HMM fault pages within the fault window to the GPU. */

> -		range.page_shift = PAGE_SHIFT;

>   		range.start = args.i.p.addr;

>   		range.end = args.i.p.addr + args.i.p.size;

>   		range.pfns = args.phys;

> diff --git a/include/linux/hmm.h b/include/linux/hmm.h

> index c5b51376b453..51e18fbb8953 100644

> --- a/include/linux/hmm.h

> +++ b/include/linux/hmm.h

> @@ -158,7 +158,6 @@ enum hmm_pfn_value_e {

>    * @values: pfn value for some special case (none, special, error, ...)

>    * @default_flags: default flags for the range (write, read, ... see hmm doc)

>    * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter

> - * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT)

>    * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)

>    * @valid: pfns array did not change since it has been fill by an HMM function

>    */

> @@ -172,31 +171,10 @@ struct hmm_range {

>   	const uint64_t		*values;

>   	uint64_t		default_flags;

>   	uint64_t		pfn_flags_mask;

> -	uint8_t			page_shift;

>   	uint8_t			pfn_shift;

>   	bool			valid;

>   };

>   

> -/*

> - * hmm_range_page_shift() - return the page shift for the range

> - * @range: range being queried

> - * Return: page shift (page size = 1 << page shift) for the range

> - */

> -static inline unsigned hmm_range_page_shift(const struct hmm_range *range)

> -{

> -	return range->page_shift;

> -}

> -

> -/*

> - * hmm_range_page_size() - return the page size for the range

> - * @range: range being queried

> - * Return: page size for the range in bytes

> - */

> -static inline unsigned long hmm_range_page_size(const struct hmm_range *range)

> -{

> -	return 1UL << hmm_range_page_shift(range);

> -}

> -

>   /*

>    * hmm_range_wait_until_valid() - wait for range to be valid

>    * @range: range affected by invalidation to wait on

> diff --git a/mm/hmm.c b/mm/hmm.c

> index 926735a3aef9..f26d6abc4ed2 100644

> --- a/mm/hmm.c

> +++ b/mm/hmm.c

> @@ -344,13 +344,12 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,

>   	struct hmm_vma_walk *hmm_vma_walk = walk->private;

>   	struct hmm_range *range = hmm_vma_walk->range;

>   	uint64_t *pfns = range->pfns;

> -	unsigned long i, page_size;

> +	unsigned long i;

>   

>   	hmm_vma_walk->last = addr;

> -	page_size = hmm_range_page_size(range);

> -	i = (addr - range->start) >> range->page_shift;

> +	i = (addr - range->start) >> PAGE_SHIFT;

>   

> -	for (; addr < end; addr += page_size, i++) {

> +	for (; addr < end; addr += PAGE_SIZE, i++) {

>   		pfns[i] = range->values[HMM_PFN_NONE];

>   		if (fault || write_fault) {

>   			int ret;

> @@ -772,7 +771,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,

>   				      struct mm_walk *walk)

>   {

>   #ifdef CONFIG_HUGETLB_PAGE

> -	unsigned long addr = start, i, pfn, mask, size, pfn_inc;

> +	unsigned long addr = start, i, pfn, mask;

>   	struct hmm_vma_walk *hmm_vma_walk = walk->private;

>   	struct hmm_range *range = hmm_vma_walk->range;

>   	struct vm_area_struct *vma = walk->vma;

> @@ -783,24 +782,12 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,

>   	pte_t entry;

>   	int ret = 0;

>   

> -	size = huge_page_size(h);

> -	mask = size - 1;

> -	if (range->page_shift != PAGE_SHIFT) {

> -		/* Make sure we are looking at a full page. */

> -		if (start & mask)

> -			return -EINVAL;

> -		if (end < (start + size))

> -			return -EINVAL;

> -		pfn_inc = size >> PAGE_SHIFT;

> -	} else {

> -		pfn_inc = 1;

> -		size = PAGE_SIZE;

> -	}

> +	mask = huge_page_size(h) - 1;

>   

>   	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);

>   	entry = huge_ptep_get(pte);

>   

> -	i = (start - range->start) >> range->page_shift;

> +	i = (start - range->start) >> PAGE_SHIFT;

>   	orig_pfn = range->pfns[i];

>   	range->pfns[i] = range->values[HMM_PFN_NONE];

>   	cpu_flags = pte_to_hmm_pfn_flags(range, entry);

> @@ -812,8 +799,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,

>   		goto unlock;

>   	}

>   

> -	pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);

> -	for (; addr < end; addr += size, i++, pfn += pfn_inc)

> +	pfn = pte_pfn(entry) + ((start & mask) >> PAGE_SHIFT);

> +	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)

>   		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |

>   				 cpu_flags;

>   	hmm_vma_walk->last = end;

> @@ -850,14 +837,13 @@ static void hmm_pfns_clear(struct hmm_range *range,

>    */

>   int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror)

>   {

> -	unsigned long mask = ((1UL << range->page_shift) - 1UL);

>   	struct hmm *hmm = mirror->hmm;

>   	unsigned long flags;

>   

>   	range->valid = false;

>   	range->hmm = NULL;

>   

> -	if ((range->start & mask) || (range->end & mask))

> +	if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1)))

>   		return -EINVAL;

>   	if (range->start >= range->end)

>   		return -EINVAL;

> @@ -964,16 +950,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)

>   		if (vma == NULL || (vma->vm_flags & device_vma))

>   			return -EFAULT;

>   

> -		if (is_vm_hugetlb_page(vma)) {

> -			if (huge_page_shift(hstate_vma(vma)) !=

> -			    range->page_shift &&

> -			    range->page_shift != PAGE_SHIFT)

> -				return -EINVAL;

> -		} else {

> -			if (range->page_shift != PAGE_SHIFT)

> -				return -EINVAL;

> -		}

> -

>   		if (!(vma->vm_flags & VM_READ)) {

>   			/*

>   			 * If vma do not allow read access, then assume that it
On Tue, Jul 30, 2019 at 05:50:16PM +0000, Jason Gunthorpe wrote:
> The way ODP seems to work is once in hugetlb mode the dma addresses
> must give huge pages or the page fault will be failed. I think that is
> a terrible design, but this is how the driver is ..
> 
> So, from this HMM perspective if the caller asked for huge pages then
> the results have to be all huge pages or a hard failure.

Which isn't how the page_shift member works at the moment.  It still
allows non-hugetlb mappings even with the member.

> It is not negotiated as an optimization like you are thinking.
> 
> [note, I haven't yet checked carefully how this works in ODP, every
>  time I look at parts of it the thing seems crazy]

This seems pretty crazy.  Especially as hugetlb use in applications
seems to fade in favour of THP, for which this ODP scheme does not seem
to work at all.

> > The best API for mlx4 would of course be to pass a biovec-style
> > variable length structure that hmm_fault could fill out, but that would
> > be a major restructure.
> 
> It would work, but the driver has to expand that into a page list
> right awayhow.
> 
> We can't even dma map the biovec with today's dma API as it needs the
> ability to remap on a page granularity.

We can do dma_map_page loops over each biovec entry pretty trivially,
and that won't be any worse than the current loop over each page in
the hmm dma helpers.  Once I get around the work to have a better
API for iommu mappings for bio_vecs we could coalesce it similar to
how we do it with scatterlist (but without all the mess of a new
structure).  That work is going to take a little longer though, as
it needs the amd and intel iommu drivers to be converted to dma-iommu
which isn't making progress as far as I hoped.

Let me know if you want to keep this code for now despite the issues,
or if we'd rather reimplement it once you've made sense of the ODP
code.