Diffstat (limited to 'fs/xfs/xfs_log.c')
-rw-r--r--	fs/xfs/xfs_log.c	251
1 file changed, 165 insertions(+), 86 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 36fa2650b081..60ac5fd63f1e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -78,13 +78,12 @@ xlog_verify_iclog(
 STATIC void
 xlog_verify_tail_lsn(
 	struct xlog		*log,
-	struct xlog_in_core	*iclog,
-	xfs_lsn_t		tail_lsn);
+	struct xlog_in_core	*iclog);
 #else
 #define xlog_verify_dest_ptr(a,b)
 #define xlog_verify_grant_tail(a)
 #define xlog_verify_iclog(a,b,c)
-#define xlog_verify_tail_lsn(a,b,c)
+#define xlog_verify_tail_lsn(a,b)
 #endif
 
 STATIC int
@@ -487,51 +486,80 @@ out_error:
 	return error;
 }
 
-static bool
-__xlog_state_release_iclog(
-	struct xlog		*log,
-	struct xlog_in_core	*iclog)
-{
-	lockdep_assert_held(&log->l_icloglock);
-
-	if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
-		/* update tail before writing to iclog */
-		xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
-
-		iclog->ic_state = XLOG_STATE_SYNCING;
-		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
-		xlog_verify_tail_lsn(log, iclog, tail_lsn);
-		/* cycle incremented when incrementing curr_block */
-		trace_xlog_iclog_syncing(iclog, _RET_IP_);
-		return true;
-	}
-
-	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
-	return false;
-}
-
 /*
  * Flush iclog to disk if this is the last reference to the given iclog and the
  * it is in the WANT_SYNC state.
+ *
+ * If the caller passes in a non-zero @old_tail_lsn and the current log tail
+ * does not match, there may be metadata on disk that must be persisted before
+ * this iclog is written.  To satisfy that requirement, set the
+ * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
+ * log tail value.
+ *
+ * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
+ * log tail is updated correctly. NEED_FUA indicates that the iclog will be
+ * written to stable storage, and implies that a commit record is contained
+ * within the iclog. We need to ensure that the log tail does not move beyond
+ * the tail that the first commit record in the iclog ordered against, otherwise
+ * correct recovery of that checkpoint becomes dependent on future operations
+ * performed on this iclog.
+ *
+ * Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the
+ * current tail into iclog. Once the iclog tail is set, future operations must
+ * not modify it, otherwise they potentially violate ordering constraints for
+ * the checkpoint commit that wrote the initial tail lsn value. The tail lsn in
+ * the iclog will get zeroed on activation of the iclog after sync, so we
+ * always capture the tail lsn on the iclog on the first NEED_FUA release
+ * regardless of the number of active reference counts on this iclog.
  */
+
 int
 xlog_state_release_iclog(
 	struct xlog		*log,
-	struct xlog_in_core	*iclog)
+	struct xlog_in_core	*iclog,
+	xfs_lsn_t		old_tail_lsn)
 {
+	xfs_lsn_t		tail_lsn;
 	lockdep_assert_held(&log->l_icloglock);
 
 	trace_xlog_iclog_release(iclog, _RET_IP_);
 	if (iclog->ic_state == XLOG_STATE_IOERROR)
 		return -EIO;
 
-	if (atomic_dec_and_test(&iclog->ic_refcnt) &&
-	    __xlog_state_release_iclog(log, iclog)) {
-		spin_unlock(&log->l_icloglock);
-		xlog_sync(log, iclog);
-		spin_lock(&log->l_icloglock);
+	/*
+	 * Grabbing the current log tail needs to be atomic w.r.t. the writing
+	 * of the tail LSN into the iclog so we guarantee that the log tail does
+	 * not move between deciding if a cache flush is required and writing
+	 * the LSN into the iclog below.
+	 */
+	if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+		tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+
+		if (old_tail_lsn && tail_lsn != old_tail_lsn)
+			iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+
+		if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
+		    !iclog->ic_header.h_tail_lsn)
+			iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
 	}
 
+	if (!atomic_dec_and_test(&iclog->ic_refcnt))
+		return 0;
+
+	if (iclog->ic_state != XLOG_STATE_WANT_SYNC) {
+		ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+		return 0;
+	}
+
+	iclog->ic_state = XLOG_STATE_SYNCING;
+	if (!iclog->ic_header.h_tail_lsn)
+		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+	xlog_verify_tail_lsn(log, iclog);
+	trace_xlog_iclog_syncing(iclog, _RET_IP_);
+
+	spin_unlock(&log->l_icloglock);
+	xlog_sync(log, iclog);
+	spin_lock(&log->l_icloglock);
 	return 0;
 }
 
@@ -774,6 +802,21 @@ xfs_log_mount_cancel(
 }
 
 /*
+ * Flush out the iclog to disk ensuring that device caches are flushed and
+ * the iclog hits stable storage before any completion waiters are woken.
+ */
+static inline int
+xlog_force_iclog(
+	struct xlog_in_core	*iclog)
+{
+	atomic_inc(&iclog->ic_refcnt);
+	iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+	if (iclog->ic_state == XLOG_STATE_ACTIVE)
+		xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
+	return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
+}
+
+/*
  * Wait for the iclog and all prior iclogs to be written disk as required by the
  * log force state machine. Waiting on ic_force_wait ensures iclog completions
  * have been ordered and callbacks run before we are woken here, hence
@@ -827,13 +870,6 @@ xlog_write_unmount_record(
 	/* account for space used by record data */
 	ticket->t_curr_res -= sizeof(ulf);
 
-	/*
-	 * For external log devices, we need to flush the data device cache
-	 * first to ensure all metadata writeback is on stable storage before we
-	 * stamp the tail LSN into the unmount record.
-	 */
-	if (log->l_targ != log->l_mp->m_ddev_targp)
-		blkdev_issue_flush(log->l_targ->bt_bdev);
 	return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS);
 }
 
@@ -865,18 +901,7 @@ out_err:
 
 	spin_lock(&log->l_icloglock);
 	iclog = log->l_iclog;
-	atomic_inc(&iclog->ic_refcnt);
-	if (iclog->ic_state == XLOG_STATE_ACTIVE)
-		xlog_state_switch_iclogs(log, iclog, 0);
-	else
-		ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
-		       iclog->ic_state == XLOG_STATE_IOERROR);
-	/*
-	 * Ensure the journal is fully flushed and on stable storage once the
-	 * iclog containing the unmount record is written.
-	 */
-	iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
-	error = xlog_state_release_iclog(log, iclog);
+	error = xlog_force_iclog(iclog);
 	xlog_wait_on_iclog(iclog);
 
 	if (tic) {
@@ -1796,10 +1821,20 @@ xlog_write_iclog(
 	 * metadata writeback and causing priority inversions.
 	 */
 	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
-	if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH)
+	if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
 		iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+		/*
+		 * For external log devices, we also need to flush the data
+		 * device cache first to ensure all metadata writeback covered
+		 * by the LSN in this iclog is on stable storage. This is slow,
+		 * but it *must* complete before we issue the external log IO.
+		 */
+		if (log->l_targ != log->l_mp->m_ddev_targp)
+			blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev);
+	}
 	if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
 		iclog->ic_bio.bi_opf |= REQ_FUA;
+
 	iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
 
 	if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
@@ -2310,7 +2345,7 @@ xlog_write_copy_finish(
 	return 0;
 
 release_iclog:
-	error = xlog_state_release_iclog(log, iclog);
+	error = xlog_state_release_iclog(log, iclog, 0);
 	spin_unlock(&log->l_icloglock);
 	return error;
 }
@@ -2529,7 +2564,7 @@ next_lv:
 		ASSERT(optype & XLOG_COMMIT_TRANS);
 		*commit_iclog = iclog;
 	} else {
-		error = xlog_state_release_iclog(log, iclog);
+		error = xlog_state_release_iclog(log, iclog, 0);
 	}
 	spin_unlock(&log->l_icloglock);
 
@@ -2567,6 +2602,7 @@ xlog_state_activate_iclog(
 	memset(iclog->ic_header.h_cycle_data, 0,
 		sizeof(iclog->ic_header.h_cycle_data));
 	iclog->ic_header.h_lsn = 0;
+	iclog->ic_header.h_tail_lsn = 0;
 }
 
 /*
@@ -2967,7 +3003,7 @@ restart:
 		 * reference to the iclog.
 		 */
 		if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
-			error = xlog_state_release_iclog(log, iclog);
+			error = xlog_state_release_iclog(log, iclog, 0);
 		spin_unlock(&log->l_icloglock);
 		if (error)
 			return error;
@@ -3132,6 +3168,35 @@ xlog_state_switch_iclogs(
 }
 
 /*
+ * Force the iclog to disk and check if the iclog has been completed before
+ * xlog_force_iclog() returns. This can happen on synchronous (e.g.
+ * pmem) or fast async storage because we drop the icloglock to issue the IO.
+ * If completion has already occurred, tell the caller so that it can avoid an
+ * unnecessary wait on the iclog.
+ */
+static int
+xlog_force_and_check_iclog(
+	struct xlog_in_core	*iclog,
+	bool			*completed)
+{
+	xfs_lsn_t		lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+	int			error;
+
+	*completed = false;
+	error = xlog_force_iclog(iclog);
+	if (error)
+		return error;
+
+	/*
+	 * If the iclog has already been completed and reused the header LSN
+	 * will have been rewritten by completion
+	 */
+	if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
+		*completed = true;
+	return 0;
+}
+
+/*
  * Write out all data in the in-core log as of this exact moment in time.
  *
  * Data may be written to the in-core log during this call.  However,
@@ -3165,7 +3230,6 @@ xfs_log_force(
 {
 	struct xlog		*log = mp->m_log;
 	struct xlog_in_core	*iclog;
-	xfs_lsn_t		lsn;
 
 	XFS_STATS_INC(mp, xs_log_force);
 	trace_xfs_log_force(mp, 0, _RET_IP_);
@@ -3193,39 +3257,33 @@ xfs_log_force(
 		iclog = iclog->ic_prev;
 	} else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
 		if (atomic_read(&iclog->ic_refcnt) == 0) {
-			/*
-			 * We are the only one with access to this iclog.
-			 *
-			 * Flush it out now.  There should be a roundoff of zero
-			 * to show that someone has already taken care of the
-			 * roundoff from the previous sync.
-			 */
-			atomic_inc(&iclog->ic_refcnt);
-			lsn = be64_to_cpu(iclog->ic_header.h_lsn);
-			xlog_state_switch_iclogs(log, iclog, 0);
-			if (xlog_state_release_iclog(log, iclog))
+			/* We have exclusive access to this iclog. */
+			bool	completed;
+
+			if (xlog_force_and_check_iclog(iclog, &completed))
 				goto out_error;
 
-			if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
+			if (completed)
 				goto out_unlock;
 		} else {
 			/*
-			 * Someone else is writing to this iclog.
-			 *
-			 * Use its call to flush out the data.  However, the
-			 * other thread may not force out this LR, so we mark
-			 * it WANT_SYNC.
+			 * Someone else is still writing to this iclog, so we
+			 * need to ensure that when they release the iclog it
+			 * gets synced immediately as we may be waiting on it.
 			 */
 			xlog_state_switch_iclogs(log, iclog, 0);
 		}
-	} else {
-		/*
-		 * If the head iclog is not active nor dirty, we just attach
-		 * ourselves to the head and go to sleep if necessary.
-		 */
-		;
 	}
 
+	/*
+	 * The iclog we are about to wait on may contain the checkpoint pushed
+	 * by the above xlog_cil_force() call, but it may not have been pushed
+	 * to disk yet. Like the ACTIVE case above, we need to make sure caches
+	 * are flushed when this iclog is written.
+	 */
+	if (iclog->ic_state == XLOG_STATE_WANT_SYNC)
+		iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+
 	if (flags & XFS_LOG_SYNC)
 		return xlog_wait_on_iclog(iclog);
 out_unlock:
@@ -3245,6 +3303,7 @@ xlog_force_lsn(
 	bool			already_slept)
 {
 	struct xlog_in_core	*iclog;
+	bool			completed;
 
 	spin_lock(&log->l_icloglock);
 	iclog = log->l_iclog;
@@ -3258,7 +3317,8 @@ xlog_force_lsn(
 			goto out_unlock;
 	}
 
-	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+	switch (iclog->ic_state) {
+	case XLOG_STATE_ACTIVE:
 		/*
 		 * We sleep here if we haven't already slept (e.g. this is the
 		 * first time we've looked at the correct iclog buf) and the
@@ -3281,12 +3341,31 @@ xlog_force_lsn(
 					&log->l_icloglock);
 			return -EAGAIN;
 		}
-		atomic_inc(&iclog->ic_refcnt);
-		xlog_state_switch_iclogs(log, iclog, 0);
-		if (xlog_state_release_iclog(log, iclog))
+		if (xlog_force_and_check_iclog(iclog, &completed))
 			goto out_error;
 		if (log_flushed)
 			*log_flushed = 1;
+		if (completed)
+			goto out_unlock;
+		break;
+	case XLOG_STATE_WANT_SYNC:
+		/*
+		 * This iclog may contain the checkpoint pushed by the
+		 * xlog_cil_force_seq() call, but there are other writers still
+		 * accessing it so it hasn't been pushed to disk yet. Like the
+		 * ACTIVE case above, we need to make sure caches are flushed
+		 * when this iclog is written.
+		 */
+		iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+		break;
+	default:
+		/*
+		 * The entire checkpoint was written by the CIL force and is on
+		 * its way to disk already. It will be stable when it
+		 * completes, so we don't need to manipulate caches here at all.
+		 * We just need to wait for completion if necessary.
+		 */
+		break;
 	}
 
 	if (flags & XFS_LOG_SYNC)
@@ -3559,10 +3638,10 @@ xlog_verify_grant_tail(
 STATIC void
 xlog_verify_tail_lsn(
 	struct xlog		*log,
-	struct xlog_in_core	*iclog,
-	xfs_lsn_t		tail_lsn)
+	struct xlog_in_core	*iclog)
 {
-    int blocks;
+	xfs_lsn_t	tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
+	int		blocks;
 
     if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
 	blocks =
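The xlog_state_release_iclog() rework above hinges on an ordering rule: sample the log tail once under l_icloglock, request a cache flush if the tail has moved past the value the caller ordered its commit record against, and stamp the tail LSN into the iclog header only on the first opportunity so later releases cannot move it. Below is a minimal userspace C sketch of that decision logic only; it is not kernel code, and the fake_iclog structure, flag values and demo LSNs are hypothetical stand-ins for the real xlog structures.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NEED_FLUSH	(1u << 0)	/* mirrors XLOG_ICL_NEED_FLUSH */
#define NEED_FUA	(1u << 1)	/* mirrors XLOG_ICL_NEED_FUA */

/* Hypothetical stand-in for struct xlog_in_core. */
struct fake_iclog {
	unsigned int	flags;
	uint64_t	header_tail_lsn;	/* 0 means "not yet stamped" */
	bool		want_sync;
};

/*
 * Sketch of the release-time ordering decision: @cur_tail is the tail LSN
 * sampled once while "locked", @old_tail is the value the caller ordered
 * its commit record against (0 if it does not care).
 */
static void release_decision(struct fake_iclog *ic, uint64_t old_tail,
			     uint64_t cur_tail)
{
	if (!old_tail && !ic->want_sync)
		return;			/* nothing to order against yet */

	/* Tail moved since the caller sampled it: metadata must be stable first. */
	if (old_tail && cur_tail != old_tail)
		ic->flags |= NEED_FLUSH;

	/* Stamp the tail only once so later releases cannot move it. */
	if ((ic->flags & NEED_FUA) && !ic->header_tail_lsn)
		ic->header_tail_lsn = cur_tail;
}

int main(void)
{
	struct fake_iclog ic = { .flags = NEED_FUA, .want_sync = true };

	release_decision(&ic, 100, 120);	/* tail moved: expect NEED_FLUSH */
	release_decision(&ic, 0, 150);		/* later release must not restamp */

	printf("flags=%#x tail_lsn=%llu\n", ic.flags,
	       (unsigned long long)ic.header_tail_lsn);
	return 0;
}

Running this prints flags=0x3 tail_lsn=120: the flush is requested because the tail moved, and the second release leaves the already-stamped tail untouched, which is the invariant the patch's comment block describes.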

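xlog_force_and_check_iclog() above relies on the fact that an iclog's header LSN is rewritten when the buffer completes and is reactivated for reuse, so comparing the LSN sampled before the force with the LSN afterwards tells the caller whether it can skip the wait. A tiny illustrative sketch of that idea follows, again with hypothetical userspace types rather than the real xlog structures.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fake_iclog {
	uint64_t	header_lsn;	/* rewritten when the iclog is reused */
};

/* Pretend the iclog completed and was reactivated for the next record. */
static void complete_and_reuse(struct fake_iclog *ic, uint64_t next_lsn)
{
	ic->header_lsn = next_lsn;
}

/*
 * Force the iclog and report whether it already completed: if the header
 * LSN changed underneath us, the buffer has been recycled and there is
 * nothing left to wait for.
 */
static bool force_and_check(struct fake_iclog *ic, bool io_is_synchronous)
{
	uint64_t lsn = ic->header_lsn;

	/* "Issue" the IO; on pmem-like storage it may finish immediately. */
	if (io_is_synchronous)
		complete_and_reuse(ic, lsn + 1);

	return ic->header_lsn != lsn;	/* true: caller can skip the wait */
}

int main(void)
{
	struct fake_iclog ic = { .header_lsn = 42 };

	printf("async : completed=%d\n", force_and_check(&ic, false));
	printf("sync  : completed=%d\n", force_and_check(&ic, true));
	return 0;
}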