From 3acdfd280fe7d807237f2cb7a09d6f8f7f1b484f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 24 Jul 2017 06:22:15 -0400
Subject: errseq: rename __errseq_set to errseq_set

Nothing calls this wrapper anymore, so just remove it and rename the
old function to get rid of the double underscore prefix.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 include/linux/errseq.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/errseq.h b/include/linux/errseq.h
index 9e0d444ac88d..784f0860527b 100644
--- a/include/linux/errseq.h
+++ b/include/linux/errseq.h
@@ -5,14 +5,7 @@
 
 typedef u32	errseq_t;
 
-errseq_t __errseq_set(errseq_t *eseq, int err);
-static inline void errseq_set(errseq_t *eseq, int err)
-{
-	/* Optimize for the common case of no error */
-	if (unlikely(err))
-		__errseq_set(eseq, err);
-}
-
+errseq_t errseq_set(errseq_t *eseq, int err);
 errseq_t errseq_sample(errseq_t *eseq);
 int errseq_check(errseq_t *eseq, errseq_t since);
 int errseq_check_and_advance(errseq_t *eseq, errseq_t *since);
-- 
cgit v1.3


From 80aafd50b6a4fa6b6bba4b451b553d5d221f59ff Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 24 Jul 2017 06:22:16 -0400
Subject: Documentation: add some docs for errseq_t

...and fix up a few comments in the code.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 Documentation/errseq.rst | 149 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/errseq.h   |   5 +-
 include/linux/fs.h       |   2 -
 3 files changed, 152 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/errseq.rst

(limited to 'include')

diff --git a/Documentation/errseq.rst b/Documentation/errseq.rst
new file mode 100644
index 000000000000..4c29bd5afbc5
--- /dev/null
+++ b/Documentation/errseq.rst
@@ -0,0 +1,149 @@
+The errseq_t datatype
+=====================
+An errseq_t is a way of recording errors in one place, and allowing any
+number of "subscribers" to tell whether it has changed since a previous
+point where it was sampled.
+
+The initial use case for this is tracking errors for file
+synchronization syscalls (fsync, fdatasync, msync and sync_file_range),
+but it may be usable in other situations.
+
+It's implemented as an unsigned 32-bit value.  The low order bits are
+designated to hold an error code (between 1 and MAX_ERRNO).  The upper bits
+are used as a counter.  This is done with atomics instead of locking so that
+these functions can be called from any context.
+
+Note that there is a risk of collisions if new errors are being recorded
+frequently, since we have so few bits to use as a counter.
+
+To mitigate this, the bit between the error value and counter is used as
+a flag to tell whether the value has been sampled since a new value was
+recorded.  That allows us to avoid bumping the counter if no one has
+sampled it since the last time an error was recorded.
+
+Thus we end up with a value that looks something like this::
+
+    bit:  31..13        12        11..0
+    +-----------------+----+----------------+
+    |     counter     | SF |      errno     |
+    +-----------------+----+----------------+
+
+The general idea is for "watchers" to sample an errseq_t value and keep
+it as a running cursor.  That value can later be used to tell whether
+any new errors have occurred since that sampling was done, and atomically
+record the state at the time that it was checked.  This allows us to
+record errors in one place, and then have a number of "watchers" that
+can tell whether the value has changed since they last checked it.
+
+A new errseq_t should always be zeroed out.  An errseq_t value of all zeroes
+is the special (but common) case where there has never been an error. An all
+zero value thus serves as the "epoch" if one wishes to know whether there
+has ever been an error set since it was first initialized.
+
+API usage
+=========
+Let me tell you a story about a worker drone.  Now, he's a good worker
+overall, but the company is a little...management heavy.  He has to
+report to 77 supervisors today, and tomorrow the "big boss" is coming in
+from out of town and he's sure to test the poor fellow too.
+
+They're all handing him work to do -- so much he can't keep track of who
+handed him what, but that's not really a big problem.  The supervisors
+just want to know when he's finished all of the work they've handed him so
+far and whether he made any mistakes since they last asked.
+
+He might have made the mistake on work they didn't actually hand him,
+but he can't keep track of things at that level of detail, all he can
+remember is the most recent mistake that he made.
+
+Here's our worker_drone representation::
+
+        struct worker_drone {
+                errseq_t        wd_err; /* for recording errors */
+        };
+
+Every day, the worker_drone starts out with a blank slate::
+
+        struct worker_drone wd;
+
+        wd.wd_err = (errseq_t)0;
+
+The supervisors come in and get an initial read for the day.  They
+don't care about anything that happened before their watch begins::
+
+        struct supervisor {
+                errseq_t        s_wd_err; /* private "cursor" for wd_err */
+                spinlock_t      s_wd_err_lock; /* protects s_wd_err */
+        }
+
+        struct supervisor       su;
+
+        su.s_wd_err = errseq_sample(&wd.wd_err);
+        spin_lock_init(&su.s_wd_err_lock);
+
+Now they start handing him tasks to do.  Every few minutes they ask him to
+finish up all of the work they've handed him so far.  Then they ask him
+whether he made any mistakes on any of it::
+
+        spin_lock(&su.su_wd_err_lock);
+        err = errseq_check_and_advance(&wd.wd_err, &su.s_wd_err);
+        spin_unlock(&su.su_wd_err_lock);
+
+Up to this point, that just keeps returning 0.
+
+Now, the owners of this company are quite miserly and have given him
+substandard equipment with which to do his job. Occasionally it
+glitches and he makes a mistake.  He sighs a heavy sigh, and marks it
+down::
+
+        errseq_set(&wd.wd_err, -EIO);
+
+...and then gets back to work.  The supervisors eventually poll again
+and they each get the error when they next check.  Subsequent calls will
+return 0, until another error is recorded, at which point it's reported
+to each of them once.
+
+Note that the supervisors can't tell how many mistakes he made, only
+whether one was made since they last checked, and the latest value
+recorded.
+
+Occasionally the big boss comes in for a spot check and asks the worker
+to do a one-off job for him. He's not really watching the worker
+full-time like the supervisors, but he does need to know whether a
+mistake occurred while his job was processing.
+
+He can just sample the current errseq_t in the worker, and then use that
+to tell whether an error has occurred later::
+
+        errseq_t since = errseq_sample(&wd.wd_err);
+        /* submit some work and wait for it to complete */
+        err = errseq_check(&wd.wd_err, since);
+
+Since he's just going to discard "since" after that point, he doesn't
+need to advance it here. He also doesn't need any locking since it's
+not usable by anyone else.
+
+Serializing errseq_t cursor updates
+===================================
+Note that the errseq_t API does not protect the errseq_t cursor during a
+check_and_advance_operation. Only the canonical error code is handled
+atomically.  In a situation where more than one task might be using the
+same errseq_t cursor at the same time, it's important to serialize
+updates to that cursor.
+
+If that's not done, then it's possible for the cursor to go backward
+in which case the same error could be reported more than once.
+
+Because of this, it's often advantageous to first do an errseq_check to
+see if anything has changed, and only later do an
+errseq_check_and_advance after taking the lock. e.g.::
+
+        if (errseq_check(&wd.wd_err, READ_ONCE(su.s_wd_err)) {
+                /* su.s_wd_err is protected by s_wd_err_lock */
+                spin_lock(&su.s_wd_err_lock);
+                err = errseq_check_and_advance(&wd.wd_err, &su.s_wd_err);
+                spin_unlock(&su.s_wd_err_lock);
+        }
+
+That avoids the spinlock in the common case where nothing has changed
+since the last time it was checked.
diff --git a/include/linux/errseq.h b/include/linux/errseq.h
index 784f0860527b..f746bd8fe4d0 100644
--- a/include/linux/errseq.h
+++ b/include/linux/errseq.h
@@ -1,8 +1,9 @@
+/*
+ * See Documentation/errseq.rst and lib/errseq.c
+ */
 #ifndef _LINUX_ERRSEQ_H
 #define _LINUX_ERRSEQ_H
 
-/* See lib/errseq.c for more info */
-
 typedef u32	errseq_t;
 
 errseq_t errseq_set(errseq_t *eseq, int err);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7b5d6816542b..21e7df1ad613 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2571,8 +2571,6 @@ extern int __must_check file_write_and_wait_range(struct file *file,
  * When a writeback error occurs, most filesystems will want to call
  * filemap_set_wb_err to record the error in the mapping so that it will be
  * automatically reported whenever fsync is called on the file.
- *
- * FIXME: mention FS_* flag here?
  */
 static inline void filemap_set_wb_err(struct address_space *mapping, int err)
 {
-- 
cgit v1.3


From a823e4589e68996436d16db4ede9a43b646332f9 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Jul 2017 07:24:43 -0400
Subject: mm: add file_fdatawait_range and file_write_and_wait

Necessary now for gfs2_fsync and sync_file_range, but there will
eventually be other callers.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 include/linux/fs.h | 11 ++++++++++-
 mm/filemap.c       | 23 +++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 21e7df1ad613..af592ca3d509 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2544,6 +2544,8 @@ extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
 				   loff_t lend);
 extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
 				  loff_t lend);
+extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
+						loff_t lend);
 extern int filemap_write_and_wait(struct address_space *mapping);
 extern int filemap_write_and_wait_range(struct address_space *mapping,
 				        loff_t lstart, loff_t lend);
@@ -2552,12 +2554,19 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping,
 extern int filemap_fdatawrite_range(struct address_space *mapping,
 				loff_t start, loff_t end);
 extern int filemap_check_errors(struct address_space *mapping);
-
 extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+
+extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
+						loff_t lend);
 extern int __must_check file_check_and_advance_wb_err(struct file *file);
 extern int __must_check file_write_and_wait_range(struct file *file,
 						loff_t start, loff_t end);
 
+static inline int file_write_and_wait(struct file *file)
+{
+	return file_write_and_wait_range(file, 0, LLONG_MAX);
+}
+
 /**
  * filemap_set_wb_err - set a writeback error on an address_space
  * @mapping: mapping in which to set writeback error
diff --git a/mm/filemap.c b/mm/filemap.c
index 72e46e6f0d9a..394bb5e96f87 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -475,6 +475,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
 }
 EXPORT_SYMBOL(filemap_fdatawait_range);
 
+/**
+ * file_fdatawait_range - wait for writeback to complete
+ * @file:		file pointing to address space structure to wait for
+ * @start_byte:		offset in bytes where the range starts
+ * @end_byte:		offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the address space that file
+ * refers to, in the given range and wait for all of them.  Check error
+ * status of the address space vs. the file->f_wb_err cursor and return it.
+ *
+ * Since the error status of the file is advanced by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ */
+int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	__filemap_fdatawait_range(mapping, start_byte, end_byte);
+	return file_check_and_advance_wb_err(file);
+}
+EXPORT_SYMBOL(file_fdatawait_range);
+
 /**
  * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
  * @mapping: address space structure to wait for
-- 
cgit v1.3


From ffb959bbdf923b4f89a08a04aba2501b1b16d164 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 31 Jul 2017 10:29:38 -0400
Subject: mm: remove optimizations based on i_size in mapping writeback waits

Marcelo added this i_size based optimization with a patch in 2004
(commitid is from the linux-history tree):

    commit 765dad09b4ac101a32d87af2bb793c3060497d3c
    Author: Marcelo Tosatti <marcelo.tosatti@cyclades.com>
    Date:   Tue Sep 7 17:51:17 2004 -0700

	small wait_on_page_writeback_range() optimization

	filemap_fdatawait() calls wait_on_page_writeback_range() with -1
	as "end" parameter.  This is not needed since we know the EOF
	from the inode.  Use that instead.

There may be races here, particularly with clustered or network
filesystems. It also seems like a bit of a layering violation since
we're operating on an address_space here, not an inode.

Finally, it's also questionable whether this optimization really helps
on workloads that we care about. Should we be optimizing for writeback
vs. truncate races in a codepath where we expect to wait anyway? It
doesn't seem worth the risk.

Remove this optimization from the filemap_fdatawait codepaths. This
means that filemap_fdatawait becomes a trivial wrapper around
filemap_fdatawait_range.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 include/linux/fs.h |  7 ++++++-
 mm/filemap.c       | 30 +-----------------------------
 2 files changed, 7 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index af592ca3d509..909210bd6366 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2538,10 +2538,15 @@ extern int invalidate_inode_pages2_range(struct address_space *mapping,
 extern int write_inode_now(struct inode *, int);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
-extern int filemap_fdatawait(struct address_space *);
 extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
 extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
 				   loff_t lend);
+
+static inline int filemap_fdatawait(struct address_space *mapping)
+{
+	return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
+}
+
 extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
 				  loff_t lend);
 extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
diff --git a/mm/filemap.c b/mm/filemap.c
index 394bb5e96f87..85dfe3bee324 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -512,39 +512,11 @@ EXPORT_SYMBOL(file_fdatawait_range);
  */
 int filemap_fdatawait_keep_errors(struct address_space *mapping)
 {
-	loff_t i_size = i_size_read(mapping->host);
-
-	if (i_size == 0)
-		return 0;
-
-	__filemap_fdatawait_range(mapping, 0, i_size - 1);
+	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
 	return filemap_check_and_keep_errors(mapping);
 }
 EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
 
-/**
- * filemap_fdatawait - wait for all under-writeback pages to complete
- * @mapping: address space structure to wait for
- *
- * Walk the list of under-writeback pages of the given address space
- * and wait for all of them.  Check error status of the address space
- * and return it.
- *
- * Since the error status of the address space is cleared by this function,
- * callers are responsible for checking the return value and handling and/or
- * reporting the error.
- */
-int filemap_fdatawait(struct address_space *mapping)
-{
-	loff_t i_size = i_size_read(mapping->host);
-
-	if (i_size == 0)
-		return 0;
-
-	return filemap_fdatawait_range(mapping, 0, i_size - 1);
-}
-EXPORT_SYMBOL(filemap_fdatawait);
-
 static bool mapping_needs_writeback(struct address_space *mapping)
 {
 	return (!dax_mapping(mapping) && mapping->nrpages) ||
-- 
cgit v1.3