[Mesa-dev,1/3] util/disk_cache: use LRU eviction rather than random eviction (v2)

Submitted by Alan Swanson on March 7, 2017, 1:25 a.m.

Details

Message ID 20170307012503.4047-1-reiver@improbability.net
State New
Headers show
Series "Series without cover letter" ( rev: 2 ) in Mesa

Not browsing as part of any series.

Commit Message

Alan Swanson March 7, 2017, 1:25 a.m.
Still using random selection of two-character subdirectory in which
to check cache files rather than scanning entire cache.

v2: Factor out double strlen call
---
 src/util/disk_cache.c | 78 +++++++++++++++++++++++----------------------------
 1 file changed, 35 insertions(+), 43 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/util/disk_cache.c b/src/util/disk_cache.c
index 31a9336582..2a923be3dc 100644
--- a/src/util/disk_cache.c
+++ b/src/util/disk_cache.c
@@ -438,70 +438,60 @@  make_cache_file_directory(struct disk_cache *cache, const cache_key key)
    free(dir);
 }
 
-/* Given a directory path and predicate function, count all entries in
- * that directory for which the predicate returns true. Then choose a
- * random entry from among those counted.
+/* Given a directory path and predicate function, find the entry with
+ * the oldest access time in that directory for which the predicate
+ * returns true.
  *
  * Returns: A malloc'ed string for the path to the chosen file, (or
  * NULL on any error). The caller should free the string when
  * finished.
  */
 static char *
-choose_random_file_matching(const char *dir_path,
-                            bool (*predicate)(const struct dirent *,
-                                              const char *dir_path))
+choose_lru_file_matching(const char *dir_path,
+                         bool (*predicate)(const struct dirent *,
+                                           const char *dir_path))
 {
    DIR *dir;
    struct dirent *entry;
-   unsigned int count, victim;
+   struct stat sb;
+   char *tmp, *lru_name = NULL;
+   size_t len;
+   time_t lru_atime = 0;
    char *filename;
 
    dir = opendir(dir_path);
    if (dir == NULL)
       return NULL;
 
-   count = 0;
-
-   while (1) {
-      entry = readdir(dir);
-      if (entry == NULL)
-         break;
-      if (!predicate(entry, dir_path))
-         continue;
-
-      count++;
-   }
-
-   if (count == 0) {
-      closedir(dir);
-      return NULL;
-   }
-
-   victim = rand() % count;
-
-   rewinddir(dir);
-   count = 0;
-
    while (1) {
       entry = readdir(dir);
       if (entry == NULL)
          break;
       if (!predicate(entry, dir_path))
          continue;
-      if (count == victim)
-         break;
 
-      count++;
+      if (fstatat(dirfd(dir), entry->d_name, &sb, 0) == 0) {
+         if (!lru_atime || (sb.st_atime < lru_atime)) {
+            len = strlen(entry->d_name) + 1;
+            tmp = realloc(lru_name, len);
+            if (tmp) {
+               lru_name = tmp;
+               memcpy(lru_name, entry->d_name, len);
+               lru_atime = sb.st_atime;
+            }
+         }
+      }
    }
 
-   if (entry == NULL) {
+   if (lru_name == NULL) {
       closedir(dir);
       return NULL;
    }
 
-   if (asprintf(&filename, "%s/%s", dir_path, entry->d_name) < 0)
+   if (asprintf(&filename, "%s/%s", dir_path, lru_name) < 0)
       filename = NULL;
 
+   free(lru_name);
    closedir(dir);
 
    return filename;
@@ -533,12 +523,12 @@  is_regular_non_tmp_file(const struct dirent *entry, const char *path)
 
 /* Returns the size of the deleted file, (or 0 on any error). */
 static size_t
-unlink_random_file_from_directory(const char *path)
+unlink_lru_file_from_directory(const char *path)
 {
    struct stat sb;
    char *filename;
 
-   filename = choose_random_file_matching(path, is_regular_non_tmp_file);
+   filename = choose_lru_file_matching(path, is_regular_non_tmp_file);
    if (filename == NULL)
       return 0;
 
@@ -581,7 +571,7 @@  is_two_character_sub_directory(const struct dirent *entry, const char *path)
 }
 
 static void
-evict_random_item(struct disk_cache *cache)
+evict_lru_item(struct disk_cache *cache)
 {
    const char hex[] = "0123456789abcde";
    char *dir_path;
@@ -591,6 +581,7 @@  evict_random_item(struct disk_cache *cache)
    /* With a reasonably-sized, full cache, (and with keys generated
     * from a cryptographic hash), we can choose two random hex digits
     * and reasonably expect the directory to exist with a file in it.
+    * Provides pseudo-LRU eviction to reduce checking all cache files.
     */
    a = rand() % 16;
    b = rand() % 16;
@@ -598,7 +589,7 @@  evict_random_item(struct disk_cache *cache)
    if (asprintf(&dir_path, "%s/%c%c", cache->path, hex[a], hex[b]) < 0)
       return;
 
-   size = unlink_random_file_from_directory(dir_path);
+   size = unlink_lru_file_from_directory(dir_path);
 
    free(dir_path);
 
@@ -608,18 +599,19 @@  evict_random_item(struct disk_cache *cache)
    }
 
    /* In the case where the random choice of directory didn't find
-    * something, we choose randomly from the existing directories.
+    * something, we choose the least recently accessed from the
+    * existing directories.
     *
     * Really, the only reason this code exists is to allow the unit
     * tests to work, (which use an artificially-small cache to be able
     * to force a single cached item to be evicted).
     */
-   dir_path = choose_random_file_matching(cache->path,
-                                          is_two_character_sub_directory);
+   dir_path = choose_lru_file_matching(cache->path,
+                                       is_two_character_sub_directory);
    if (dir_path == NULL)
       return;
 
-   size = unlink_random_file_from_directory(dir_path);
+   size = unlink_lru_file_from_directory(dir_path);
 
    free(dir_path);
 
@@ -788,7 +780,7 @@  disk_cache_put(struct disk_cache *cache,
     * else first.
     */
    if (*cache->size + size > cache->max_size)
-      evict_random_item(cache);
+      evict_lru_item(cache);
 
    /* Create CRC of the data and store at the start of the file. We will
     * read this when restoring the cache and use it to check for corruption.

Comments

On 07/03/17 12:25, Alan Swanson wrote:
> Still using random selection of two-character subdirectory in which
> to check cache files rather than scanning entire cache.
>
> v2: Factor out double strlen call
> ---
>  src/util/disk_cache.c | 78 +++++++++++++++++++++++----------------------------
>  1 file changed, 35 insertions(+), 43 deletions(-)
>
> diff --git a/src/util/disk_cache.c b/src/util/disk_cache.c
> index 31a9336582..2a923be3dc 100644
> --- a/src/util/disk_cache.c
> +++ b/src/util/disk_cache.c
> @@ -438,70 +438,60 @@ make_cache_file_directory(struct disk_cache *cache, const cache_key key)
>     free(dir);
>  }
>
> -/* Given a directory path and predicate function, count all entries in
> - * that directory for which the predicate returns true. Then choose a
> - * random entry from among those counted.
> +/* Given a directory path and predicate function, find the entry with
> + * the oldest access time in that directory for which the predicate
> + * returns true.
>   *
>   * Returns: A malloc'ed string for the path to the chosen file, (or
>   * NULL on any error). The caller should free the string when
>   * finished.
>   */
>  static char *
> -choose_random_file_matching(const char *dir_path,
> -                            bool (*predicate)(const struct dirent *,
> -                                              const char *dir_path))
> +choose_lru_file_matching(const char *dir_path,
> +                         bool (*predicate)(const struct dirent *,
> +                                           const char *dir_path))
>  {
>     DIR *dir;
>     struct dirent *entry;
> -   unsigned int count, victim;
> +   struct stat sb;
> +   char *tmp, *lru_name = NULL;
> +   size_t len;
> +   time_t lru_atime = 0;

Please try to declare variables where they are used. The original 
version of this file was written before we could use C99 features in Mesa.


>     char *filename;
>
>     dir = opendir(dir_path);
>     if (dir == NULL)
>        return NULL;
>
> -   count = 0;
> -
> -   while (1) {
> -      entry = readdir(dir);
> -      if (entry == NULL)
> -         break;
> -      if (!predicate(entry, dir_path))
> -         continue;
> -
> -      count++;
> -   }
> -
> -   if (count == 0) {
> -      closedir(dir);
> -      return NULL;
> -   }
> -
> -   victim = rand() % count;
> -
> -   rewinddir(dir);
> -   count = 0;
> -
>     while (1) {
>        entry = readdir(dir);
>        if (entry == NULL)
>           break;
>        if (!predicate(entry, dir_path))
>           continue;
> -      if (count == victim)
> -         break;
>
> -      count++;
> +      if (fstatat(dirfd(dir), entry->d_name, &sb, 0) == 0) {
> +         if (!lru_atime || (sb.st_atime < lru_atime)) {
> +            len = strlen(entry->d_name) + 1;
> +            tmp = realloc(lru_name, len);
> +            if (tmp) {
> +               lru_name = tmp;

I don't think we need tmp here. Why not use lru_name directly?

With these couple of nits addressed and the make check test updated for 
patch 2 this series looks good to me. Thanks for writing it.

If you can send these updates I'll push it once we have moved this stuff 
off into a thread [1].

[1] https://lists.freedesktop.org/archives/mesa-dev/2017-March/147442.html

> +               memcpy(lru_name, entry->d_name, len);
> +               lru_atime = sb.st_atime;
> +            }
> +         }
> +      }
>     }
>
> -   if (entry == NULL) {
> +   if (lru_name == NULL) {
>        closedir(dir);
>        return NULL;
>     }
>
> -   if (asprintf(&filename, "%s/%s", dir_path, entry->d_name) < 0)
> +   if (asprintf(&filename, "%s/%s", dir_path, lru_name) < 0)
>        filename = NULL;
>
> +   free(lru_name);
>     closedir(dir);
>
>     return filename;
> @@ -533,12 +523,12 @@ is_regular_non_tmp_file(const struct dirent *entry, const char *path)
>
>  /* Returns the size of the deleted file, (or 0 on any error). */
>  static size_t
> -unlink_random_file_from_directory(const char *path)
> +unlink_lru_file_from_directory(const char *path)
>  {
>     struct stat sb;
>     char *filename;
>
> -   filename = choose_random_file_matching(path, is_regular_non_tmp_file);
> +   filename = choose_lru_file_matching(path, is_regular_non_tmp_file);
>     if (filename == NULL)
>        return 0;
>
> @@ -581,7 +571,7 @@ is_two_character_sub_directory(const struct dirent *entry, const char *path)
>  }
>
>  static void
> -evict_random_item(struct disk_cache *cache)
> +evict_lru_item(struct disk_cache *cache)
>  {
>     const char hex[] = "0123456789abcde";
>     char *dir_path;
> @@ -591,6 +581,7 @@ evict_random_item(struct disk_cache *cache)
>     /* With a reasonably-sized, full cache, (and with keys generated
>      * from a cryptographic hash), we can choose two random hex digits
>      * and reasonably expect the directory to exist with a file in it.
> +    * Provides pseudo-LRU eviction to reduce checking all cache files.
>      */
>     a = rand() % 16;
>     b = rand() % 16;
> @@ -598,7 +589,7 @@ evict_random_item(struct disk_cache *cache)
>     if (asprintf(&dir_path, "%s/%c%c", cache->path, hex[a], hex[b]) < 0)
>        return;
>
> -   size = unlink_random_file_from_directory(dir_path);
> +   size = unlink_lru_file_from_directory(dir_path);
>
>     free(dir_path);
>
> @@ -608,18 +599,19 @@ evict_random_item(struct disk_cache *cache)
>     }
>
>     /* In the case where the random choice of directory didn't find
> -    * something, we choose randomly from the existing directories.
> +    * something, we choose the least recently accessed from the
> +    * existing directories.
>      *
>      * Really, the only reason this code exists is to allow the unit
>      * tests to work, (which use an artificially-small cache to be able
>      * to force a single cached item to be evicted).
>      */
> -   dir_path = choose_random_file_matching(cache->path,
> -                                          is_two_character_sub_directory);
> +   dir_path = choose_lru_file_matching(cache->path,
> +                                       is_two_character_sub_directory);
>     if (dir_path == NULL)
>        return;
>
> -   size = unlink_random_file_from_directory(dir_path);
> +   size = unlink_lru_file_from_directory(dir_path);
>
>     free(dir_path);
>
> @@ -788,7 +780,7 @@ disk_cache_put(struct disk_cache *cache,
>      * else first.
>      */
>     if (*cache->size + size > cache->max_size)
> -      evict_random_item(cache);
> +      evict_lru_item(cache);
>
>     /* Create CRC of the data and store at the start of the file. We will
>      * read this when restoring the cache and use it to check for corruption.
>
On Fri, Mar 10, 2017 at 6:23 AM, Timothy Arceri <tarceri@itsqueeze.com> wrote:
> On 07/03/17 12:25, Alan Swanson wrote:
>>
>> Still using random selection of two-character subdirectory in which
>> to check cache files rather than scanning entire cache.
>>
>> v2: Factor out double strlen call
>> ---
>>  src/util/disk_cache.c | 78
>> +++++++++++++++++++++++----------------------------
>>  1 file changed, 35 insertions(+), 43 deletions(-)
>>
>> diff --git a/src/util/disk_cache.c b/src/util/disk_cache.c
>> index 31a9336582..2a923be3dc 100644
>> --- a/src/util/disk_cache.c
>> +++ b/src/util/disk_cache.c
>> @@ -438,70 +438,60 @@ make_cache_file_directory(struct disk_cache *cache,
>> const cache_key key)
>>     free(dir);
>>  }
>>
>> -/* Given a directory path and predicate function, count all entries in
>> - * that directory for which the predicate returns true. Then choose a
>> - * random entry from among those counted.
>> +/* Given a directory path and predicate function, find the entry with
>> + * the oldest access time in that directory for which the predicate
>> + * returns true.
>>   *
>>   * Returns: A malloc'ed string for the path to the chosen file, (or
>>   * NULL on any error). The caller should free the string when
>>   * finished.
>>   */
>>  static char *
>> -choose_random_file_matching(const char *dir_path,
>> -                            bool (*predicate)(const struct dirent *,
>> -                                              const char *dir_path))
>> +choose_lru_file_matching(const char *dir_path,
>> +                         bool (*predicate)(const struct dirent *,
>> +                                           const char *dir_path))
>>  {
>>     DIR *dir;
>>     struct dirent *entry;
>> -   unsigned int count, victim;
>> +   struct stat sb;
>> +   char *tmp, *lru_name = NULL;
>> +   size_t len;
>> +   time_t lru_atime = 0;
>
>
> Please try to declare variables where they are used. The original version of
> this file was written before we could use C99 features in Mesa.
>
>
>
>>     char *filename;
>>
>>     dir = opendir(dir_path);
>>     if (dir == NULL)
>>        return NULL;
>>
>> -   count = 0;
>> -
>> -   while (1) {
>> -      entry = readdir(dir);
>> -      if (entry == NULL)
>> -         break;
>> -      if (!predicate(entry, dir_path))
>> -         continue;
>> -
>> -      count++;
>> -   }
>> -
>> -   if (count == 0) {
>> -      closedir(dir);
>> -      return NULL;
>> -   }
>> -
>> -   victim = rand() % count;
>> -
>> -   rewinddir(dir);
>> -   count = 0;
>> -
>>     while (1) {
>>        entry = readdir(dir);
>>        if (entry == NULL)
>>           break;
>>        if (!predicate(entry, dir_path))
>>           continue;
>> -      if (count == victim)
>> -         break;
>>
>> -      count++;
>> +      if (fstatat(dirfd(dir), entry->d_name, &sb, 0) == 0) {
>> +         if (!lru_atime || (sb.st_atime < lru_atime)) {
>> +            len = strlen(entry->d_name) + 1;
>> +            tmp = realloc(lru_name, len);
>> +            if (tmp) {
>> +               lru_name = tmp;
>
>
> I don't think we need tmp here. Why not use lru_name directly?

It *is* needed, otherwise if realloc fails, you'll leak old memory.

Gražvydas
On 10 March 2017 at 10:51, Grazvydas Ignotas <notasas@gmail.com> wrote:
> On Fri, Mar 10, 2017 at 6:23 AM, Timothy Arceri <tarceri@itsqueeze.com> wrote:
>> On 07/03/17 12:25, Alan Swanson wrote:
>>>
>>> Still using random selection of two-character subdirectory in which
>>> to check cache files rather than scanning entire cache.
>>>
>>> v2: Factor out double strlen call
>>> ---
>>>  src/util/disk_cache.c | 78
>>> +++++++++++++++++++++++----------------------------
>>>  1 file changed, 35 insertions(+), 43 deletions(-)
>>>
>>> diff --git a/src/util/disk_cache.c b/src/util/disk_cache.c
>>> index 31a9336582..2a923be3dc 100644
>>> --- a/src/util/disk_cache.c
>>> +++ b/src/util/disk_cache.c
>>> @@ -438,70 +438,60 @@ make_cache_file_directory(struct disk_cache *cache,
>>> const cache_key key)
>>>     free(dir);
>>>  }
>>>
>>> -/* Given a directory path and predicate function, count all entries in
>>> - * that directory for which the predicate returns true. Then choose a
>>> - * random entry from among those counted.
>>> +/* Given a directory path and predicate function, find the entry with
>>> + * the oldest access time in that directory for which the predicate
>>> + * returns true.
>>>   *
>>>   * Returns: A malloc'ed string for the path to the chosen file, (or
>>>   * NULL on any error). The caller should free the string when
>>>   * finished.
>>>   */
>>>  static char *
>>> -choose_random_file_matching(const char *dir_path,
>>> -                            bool (*predicate)(const struct dirent *,
>>> -                                              const char *dir_path))
>>> +choose_lru_file_matching(const char *dir_path,
>>> +                         bool (*predicate)(const struct dirent *,
>>> +                                           const char *dir_path))
>>>  {
>>>     DIR *dir;
>>>     struct dirent *entry;
>>> -   unsigned int count, victim;
>>> +   struct stat sb;
>>> +   char *tmp, *lru_name = NULL;
>>> +   size_t len;
>>> +   time_t lru_atime = 0;
>>
>>
>> Please try to declare variables where they are used. The original version of
>> this file was written before we could use C99 features in Mesa.
>>
>>
>>
>>>     char *filename;
>>>
>>>     dir = opendir(dir_path);
>>>     if (dir == NULL)
>>>        return NULL;
>>>
>>> -   count = 0;
>>> -
>>> -   while (1) {
>>> -      entry = readdir(dir);
>>> -      if (entry == NULL)
>>> -         break;
>>> -      if (!predicate(entry, dir_path))
>>> -         continue;
>>> -
>>> -      count++;
>>> -   }
>>> -
>>> -   if (count == 0) {
>>> -      closedir(dir);
>>> -      return NULL;
>>> -   }
>>> -
>>> -   victim = rand() % count;
>>> -
>>> -   rewinddir(dir);
>>> -   count = 0;
>>> -
>>>     while (1) {
>>>        entry = readdir(dir);
>>>        if (entry == NULL)
>>>           break;
>>>        if (!predicate(entry, dir_path))
>>>           continue;
>>> -      if (count == victim)
>>> -         break;
>>>
>>> -      count++;
>>> +      if (fstatat(dirfd(dir), entry->d_name, &sb, 0) == 0) {
>>> +         if (!lru_atime || (sb.st_atime < lru_atime)) {
>>> +            len = strlen(entry->d_name) + 1;
>>> +            tmp = realloc(lru_name, len);
>>> +            if (tmp) {
>>> +               lru_name = tmp;
>>
>>
>> I don't think we need tmp here. Why not use lru_name directly?
>
> It *is* needed, otherwise if realloc fails, you'll leak old memory.
>
Indeed. Alternatively one can avoid all the strlen/realloc/free dances
by using NAME_MAX.

As per the POSIX manual
"The array d_name is of unspecified size, but shall contain a filename
of at most {NAME_MAX} bytes followed by a terminating null byte."

Not 100% sure if it'll play well with Hurd and similar platforms :-\

-Emil
On Sat, 2017-03-11 at 17:08 +0000, Emil Velikov wrote:
> On 10 March 2017 at 10:51, Grazvydas Ignotas <notasas@gmail.com>
> wrote:
> > 
> > On Fri, Mar 10, 2017 at 6:23 AM, Timothy Arceri <tarceri@itsqueeze.
> > com> wrote:
> > > 
> > > On 07/03/17 12:25, Alan Swanson wrote:
> > > > 
> > > > 
> > > > Still using random selection of two-character subdirectory in
> > > > which
> > > > to check cache files rather than scanning entire cache.
> > > > 
> > > > v2: Factor out double strlen call
> > > > ---
> > > >  src/util/disk_cache.c | 78
> > > > +++++++++++++++++++++++----------------------------
> > > >  1 file changed, 35 insertions(+), 43 deletions(-)
> > > > 
> > > > diff --git a/src/util/disk_cache.c b/src/util/disk_cache.c
> > > > index 31a9336582..2a923be3dc 100644
> > > > --- a/src/util/disk_cache.c
> > > > +++ b/src/util/disk_cache.c

[SNIP]

> > > >     char *filename;
> > > > 
> > > >     dir = opendir(dir_path);
> > > >     if (dir == NULL)
> > > >        return NULL;
> > > > 
> > > > -   count = 0;
> > > > -
> > > > -   while (1) {
> > > > -      entry = readdir(dir);
> > > > -      if (entry == NULL)
> > > > -         break;
> > > > -      if (!predicate(entry, dir_path))
> > > > -         continue;
> > > > -
> > > > -      count++;
> > > > -   }
> > > > -
> > > > -   if (count == 0) {
> > > > -      closedir(dir);
> > > > -      return NULL;
> > > > -   }
> > > > -
> > > > -   victim = rand() % count;
> > > > -
> > > > -   rewinddir(dir);
> > > > -   count = 0;
> > > > -
> > > >     while (1) {
> > > >        entry = readdir(dir);
> > > >        if (entry == NULL)
> > > >           break;
> > > >        if (!predicate(entry, dir_path))
> > > >           continue;
> > > > -      if (count == victim)
> > > > -         break;
> > > > 
> > > > -      count++;
> > > > +      if (fstatat(dirfd(dir), entry->d_name, &sb, 0) == 0) {
> > > > +         if (!lru_atime || (sb.st_atime < lru_atime)) {
> > > > +            len = strlen(entry->d_name) + 1;
> > > > +            tmp = realloc(lru_name, len);
> > > > +            if (tmp) {
> > > > +               lru_name = tmp;
> > > 
> > > 
> > > I don't think we need tmp here. Why not use lru_name directly?
> > 
> > It *is* needed, otherwise if realloc fails, you'll leak old memory.
> > 
> Indeed. Alternatively one can avoid all the strlen/realloc/free
> dances
> by using NAME_MAX.
> 
> As per the POSIX manual
> "The array d_name is of unspecified size, but shall contain a
> filename
> of at most {NAME_MAX} bytes followed by a terminating null byte."
> 
> Not 100% sure if it'll play well with Hurd and similar platforms :-\

But here it's a very simple dance. The realloc should mostly become a
noop, at least with glibc, after the first entry in cache and just
activating for any .tmp files. Using MAX always seems like a lazy over
allocation to me.

A follow on patch will pass the sb and len to predicate functions
instead saving them from calling fstat and strlen on same entry again.
Will post that this weekend when I get back home.

-- 
Alan.