diff options
Diffstat (limited to 'block/bfq-iosched.c')
| -rw-r--r-- | block/bfq-iosched.c | 529 | 
1 files changed, 374 insertions, 155 deletions
| diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index bcb6d21baf12..47e6ec7427c4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10;  /* Default timeout values, in jiffies, approximating CFQ defaults. */  const int bfq_timeout = HZ / 8; +/* + * Time limit for merging (see comments in bfq_setup_cooperator). Set + * to the slowest value that, in our tests, proved to be effective in + * removing false positives, while not causing true positives to miss + * queue merging. + * + * As can be deduced from the low time limit below, queue merging, if + * successful, happens at the very beggining of the I/O of the involved + * cooperating processes, as a consequence of the arrival of the very + * first requests from each cooperator.  After that, there is very + * little chance to find cooperators. + */ +static const unsigned long bfq_merge_time_limit = HZ/10; +  static struct kmem_cache *bfq_pool;  /* Below this threshold (in ns), we consider thinktime immediate. */ @@ -178,7 +192,7 @@ static struct kmem_cache *bfq_pool;  #define BFQQ_SEEK_THR		(sector_t)(8 * 100)  #define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)  #define BFQQ_CLOSE_THR		(sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8) +#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 19)  /* Min number of samples required to perform peak-rate update */  #define BFQ_RATE_MIN_SAMPLES	32 @@ -195,15 +209,17 @@ static struct kmem_cache *bfq_pool;   * interactive applications automatically, using the following formula:   * duration = (R / r) * T, where r is the peak rate of the device, and   * R and T are two reference parameters. - * In particular, R is the peak rate of the reference device (see below), - * and T is a reference time: given the systems that are likely to be - * installed on the reference device according to its speed class, T is - * about the maximum time needed, under BFQ and while reading two files in - * parallel, to load typical large applications on these systems. - * In practice, the slower/faster the device at hand is, the more/less it - * takes to load applications with respect to the reference device. - * Accordingly, the longer/shorter BFQ grants weight raising to interactive - * applications. + * In particular, R is the peak rate of the reference device (see + * below), and T is a reference time: given the systems that are + * likely to be installed on the reference device according to its + * speed class, T is about the maximum time needed, under BFQ and + * while reading two files in parallel, to load typical large + * applications on these systems (see the comments on + * max_service_from_wr below, for more details on how T is obtained). + * In practice, the slower/faster the device at hand is, the more/less + * it takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to + * interactive applications.   *   * BFQ uses four different reference pairs (R, T), depending on:   * . whether the device is rotational or non-rotational; @@ -240,6 +256,60 @@ static int T_slow[2];  static int T_fast[2];  static int device_speed_thresh[2]; +/* + * BFQ uses the above-detailed, time-based weight-raising mechanism to + * privilege interactive tasks. This mechanism is vulnerable to the + * following false positives: I/O-bound applications that will go on + * doing I/O for much longer than the duration of weight + * raising. These applications have basically no benefit from being + * weight-raised at the beginning of their I/O. On the opposite end, + * while being weight-raised, these applications + * a) unjustly steal throughput to applications that may actually need + * low latency; + * b) make BFQ uselessly perform device idling; device idling results + * in loss of device throughput with most flash-based storage, and may + * increase latencies when used purposelessly. + * + * BFQ tries to reduce these problems, by adopting the following + * countermeasure. To introduce this countermeasure, we need first to + * finish explaining how the duration of weight-raising for + * interactive tasks is computed. + * + * For a bfq_queue deemed as interactive, the duration of weight + * raising is dynamically adjusted, as a function of the estimated + * peak rate of the device, so as to be equal to the time needed to + * execute the 'largest' interactive task we benchmarked so far. By + * largest task, we mean the task for which each involved process has + * to do more I/O than for any of the other tasks we benchmarked. This + * reference interactive task is the start-up of LibreOffice Writer, + * and in this task each process/bfq_queue needs to have at most ~110K + * sectors transferred. + * + * This last piece of information enables BFQ to reduce the actual + * duration of weight-raising for at least one class of I/O-bound + * applications: those doing sequential or quasi-sequential I/O. An + * example is file copy. In fact, once started, the main I/O-bound + * processes of these applications usually consume the above 110K + * sectors in much less time than the processes of an application that + * is starting, because these I/O-bound processes will greedily devote + * almost all their CPU cycles only to their target, + * throughput-friendly I/O operations. This is even more true if BFQ + * happens to be underestimating the device peak rate, and thus + * overestimating the duration of weight raising. But, according to + * our measurements, once transferred 110K sectors, these processes + * have no right to be weight-raised any longer. + * + * Basing on the last consideration, BFQ ends weight-raising for a + * bfq_queue if the latter happens to have received an amount of + * service at least equal to the following constant. The constant is + * set to slightly more than 110K, to have a minimum safety margin. + * + * This early ending of weight-raising reduces the amount of time + * during which interactive false positives cause the two problems + * described at the beginning of these comments. + */ +static const unsigned long max_service_from_wr = 120000; +  #define RQ_BIC(rq)		icq_to_bic((rq)->elv.priv[0])  #define RQ_BFQQ(rq)		((rq)->elv.priv[1]) @@ -403,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,  	}  } +/* + * See the comments on bfq_limit_depth for the purpose of + * the depths set in the function. + */ +static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) +{ +	bfqd->sb_shift = bt->sb.shift; + +	/* +	 * In-word depths if no bfq_queue is being weight-raised: +	 * leaving 25% of tags only for sync reads. +	 * +	 * In next formulas, right-shift the value +	 * (1U<<bfqd->sb_shift), instead of computing directly +	 * (1U<<(bfqd->sb_shift - something)), to be robust against +	 * any possible value of bfqd->sb_shift, without having to +	 * limit 'something'. +	 */ +	/* no more than 50% of tags for async I/O */ +	bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U); +	/* +	 * no more than 75% of tags for sync writes (25% extra tags +	 * w.r.t. async I/O, to prevent async I/O from starving sync +	 * writes) +	 */ +	bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U); + +	/* +	 * In-word depths in case some bfq_queue is being weight- +	 * raised: leaving ~63% of tags for sync reads. This is the +	 * highest percentage for which, in our tests, application +	 * start-up times didn't suffer from any regression due to tag +	 * shortage. +	 */ +	/* no more than ~18% of tags for async I/O */ +	bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U); +	/* no more than ~37% of tags for sync writes (~20% extra tags) */ +	bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U); +} + +/* + * Async I/O can easily starve sync I/O (both sync reads and sync + * writes), by consuming all tags. Similarly, storms of sync writes, + * such as those that sync(2) may trigger, can starve sync reads. + * Limit depths of async I/O and sync writes so as to counter both + * problems. + */ +static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ +	struct blk_mq_tags *tags = blk_mq_tags_from_data(data); +	struct bfq_data *bfqd = data->q->elevator->elevator_data; +	struct sbitmap_queue *bt; + +	if (op_is_sync(op) && !op_is_write(op)) +		return; + +	if (data->flags & BLK_MQ_REQ_RESERVED) { +		if (unlikely(!tags->nr_reserved_tags)) { +			WARN_ON_ONCE(1); +			return; +		} +		bt = &tags->breserved_tags; +	} else +		bt = &tags->bitmap_tags; + +	if (unlikely(bfqd->sb_shift != bt->sb.shift)) +		bfq_update_depths(bfqd, bt); + +	data->shallow_depth = +		bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + +	bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", +			__func__, bfqd->wr_busy_queues, op_is_sync(op), +			data->shallow_depth); +} +  static struct bfq_queue *  bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,  		     sector_t sector, struct rb_node **ret_parent, @@ -444,6 +590,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,  	return bfqq;  } +static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) +{ +	return bfqq->service_from_backlogged > 0 && +		time_is_before_jiffies(bfqq->first_IO_time + +				       bfq_merge_time_limit); +} +  void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)  {  	struct rb_node **p, *parent; @@ -454,6 +607,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)  		bfqq->pos_root = NULL;  	} +	/* +	 * bfqq cannot be merged any longer (see comments in +	 * bfq_setup_cooperator): no point in adding bfqq into the +	 * position tree. +	 */ +	if (bfq_too_late_for_merging(bfqq)) +		return; +  	if (bfq_class_idle(bfqq))  		return;  	if (!bfqq->next_rq) @@ -1247,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,  	if (old_wr_coeff == 1 && wr_or_deserves_wr) {  		/* start a weight-raising period */  		if (interactive) { +			bfqq->service_from_wr = 0;  			bfqq->wr_coeff = bfqd->bfq_wr_coeff;  			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);  		} else { @@ -1627,6 +1789,8 @@ static void bfq_remove_request(struct request_queue *q,  			rb_erase(&bfqq->pos_node, bfqq->pos_root);  			bfqq->pos_root = NULL;  		} +	} else { +		bfq_pos_tree_add_move(bfqd, bfqq);  	}  	if (rq->cmd_flags & REQ_META) @@ -1933,6 +2097,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)  static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,  					struct bfq_queue *new_bfqq)  { +	if (bfq_too_late_for_merging(new_bfqq)) +		return false; +  	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||  	    (bfqq->ioprio_class != new_bfqq->ioprio_class))  		return false; @@ -1957,20 +2124,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,  }  /* - * If this function returns true, then bfqq cannot be merged. The idea - * is that true cooperation happens very early after processes start - * to do I/O. Usually, late cooperations are just accidental false - * positives. In case bfqq is weight-raised, such false positives - * would evidently degrade latency guarantees for bfqq. - */ -static bool wr_from_too_long(struct bfq_queue *bfqq) -{ -	return bfqq->wr_coeff > 1 && -		time_is_before_jiffies(bfqq->last_wr_start_finish + -				       msecs_to_jiffies(100)); -} - -/*   * Attempt to schedule a merge of bfqq with the currently in-service   * queue or with a close queue among the scheduled queues.  Return   * NULL if no merge was scheduled, a pointer to the shared bfq_queue @@ -1983,11 +2136,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq)   * to maintain. Besides, in such a critical condition as an out of memory,   * the benefits of queue merging may be little relevant, or even negligible.   * - * Weight-raised queues can be merged only if their weight-raising - * period has just started. In fact cooperating processes are usually - * started together. Thus, with this filter we avoid false positives - * that would jeopardize low-latency guarantees. - *   * WARNING: queue merging may impair fairness among non-weight raised   * queues, for at least two reasons: 1) the original weight of a   * merged queue may change during the merged state, 2) even being the @@ -2001,12 +2149,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,  {  	struct bfq_queue *in_service_bfqq, *new_bfqq; +	/* +	 * Prevent bfqq from being merged if it has been created too +	 * long ago. The idea is that true cooperating processes, and +	 * thus their associated bfq_queues, are supposed to be +	 * created shortly after each other. This is the case, e.g., +	 * for KVM/QEMU and dump I/O threads. Basing on this +	 * assumption, the following filtering greatly reduces the +	 * probability that two non-cooperating processes, which just +	 * happen to do close I/O for some short time interval, have +	 * their queues merged by mistake. +	 */ +	if (bfq_too_late_for_merging(bfqq)) +		return NULL; +  	if (bfqq->new_bfqq)  		return bfqq->new_bfqq; -	if (!io_struct || -	    wr_from_too_long(bfqq) || -	    unlikely(bfqq == &bfqd->oom_bfqq)) +	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))  		return NULL;  	/* If there is only one backlogged queue, don't search. */ @@ -2015,12 +2175,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,  	in_service_bfqq = bfqd->in_service_queue; -	if (!in_service_bfqq || in_service_bfqq == bfqq -	    || wr_from_too_long(in_service_bfqq) || -	    unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -		goto check_scheduled; - -	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && +	if (in_service_bfqq && in_service_bfqq != bfqq && +	    likely(in_service_bfqq != &bfqd->oom_bfqq) && +	    bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&  	    bfqq->entity.parent == in_service_bfqq->entity.parent &&  	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {  		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); @@ -2032,12 +2189,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,  	 * queues. The only thing we need is that the bio/request is not  	 * NULL, as we need it to establish whether a cooperator exists.  	 */ -check_scheduled:  	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,  			bfq_io_struct_pos(io_struct, request)); -	if (new_bfqq && !wr_from_too_long(new_bfqq) && -	    likely(new_bfqq != &bfqd->oom_bfqq) && +	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&  	    bfq_may_be_close_cooperator(bfqq, new_bfqq))  		return bfq_setup_merge(bfqq, new_bfqq); @@ -2062,7 +2217,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)  	bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);  	bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);  	if (unlikely(bfq_bfqq_just_created(bfqq) && -		     !bfq_bfqq_in_large_burst(bfqq))) { +		     !bfq_bfqq_in_large_burst(bfqq) && +		     bfqq->bfqd->low_latency)) {  		/*  		 * bfqq being merged right after being created: bfqq  		 * would have deserved interactive weight raising, but @@ -2917,45 +3073,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,   * whereas soft_rt_next_start is set to infinity for applications that do   * not.   * - * Unfortunately, even a greedy application may happen to behave in an - * isochronous way if the CPU load is high. In fact, the application may - * stop issuing requests while the CPUs are busy serving other processes, - * then restart, then stop again for a while, and so on. In addition, if - * the disk achieves a low enough throughput with the request pattern - * issued by the application (e.g., because the request pattern is random - * and/or the device is slow), then the application may meet the above - * bandwidth requirement too. To prevent such a greedy application to be - * deemed as soft real-time, a further rule is used in the computation of - * soft_rt_next_start: soft_rt_next_start must be higher than the current - * time plus the maximum time for which the arrival of a request is waited - * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. - * This filters out greedy applications, as the latter issue instead their - * next request as soon as possible after the last one has been completed - * (in contrast, when a batch of requests is completed, a soft real-time - * application spends some time processing data). + * Unfortunately, even a greedy (i.e., I/O-bound) application may + * happen to meet, occasionally or systematically, both the above + * bandwidth and isochrony requirements. This may happen at least in + * the following circumstances. First, if the CPU load is high. The + * application may stop issuing requests while the CPUs are busy + * serving other processes, then restart, then stop again for a while, + * and so on. The other circumstances are related to the storage + * device: the storage device is highly loaded or reaches a low-enough + * throughput with the I/O of the application (e.g., because the I/O + * is random and/or the device is slow). In all these cases, the + * I/O of the application may be simply slowed down enough to meet + * the bandwidth and isochrony requirements. To reduce the probability + * that greedy applications are deemed as soft real-time in these + * corner cases, a further rule is used in the computation of + * soft_rt_next_start: the return value of this function is forced to + * be higher than the maximum between the following two quantities. + * + * (a) Current time plus: (1) the maximum time for which the arrival + *     of a request is waited for when a sync queue becomes idle, + *     namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We + *     postpone for a moment the reason for adding a few extra + *     jiffies; we get back to it after next item (b).  Lower-bounding + *     the return value of this function with the current time plus + *     bfqd->bfq_slice_idle tends to filter out greedy applications, + *     because the latter issue their next request as soon as possible + *     after the last one has been completed. In contrast, a soft + *     real-time application spends some time processing data, after a + *     batch of its requests has been completed.   * - * Unfortunately, the last filter may easily generate false positives if - * only bfqd->bfq_slice_idle is used as a reference time interval and one - * or both the following cases occur: - * 1) HZ is so low that the duration of a jiffy is comparable to or higher - *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with - *    HZ=100. + * (b) Current value of bfqq->soft_rt_next_start. As pointed out + *     above, greedy applications may happen to meet both the + *     bandwidth and isochrony requirements under heavy CPU or + *     storage-device load. In more detail, in these scenarios, these + *     applications happen, only for limited time periods, to do I/O + *     slowly enough to meet all the requirements described so far, + *     including the filtering in above item (a). These slow-speed + *     time intervals are usually interspersed between other time + *     intervals during which these applications do I/O at a very high + *     speed. Fortunately, exactly because of the high speed of the + *     I/O in the high-speed intervals, the values returned by this + *     function happen to be so high, near the end of any such + *     high-speed interval, to be likely to fall *after* the end of + *     the low-speed time interval that follows. These high values are + *     stored in bfqq->soft_rt_next_start after each invocation of + *     this function. As a consequence, if the last value of + *     bfqq->soft_rt_next_start is constantly used to lower-bound the + *     next value that this function may return, then, from the very + *     beginning of a low-speed interval, bfqq->soft_rt_next_start is + *     likely to be constantly kept so high that any I/O request + *     issued during the low-speed interval is considered as arriving + *     to soon for the application to be deemed as soft + *     real-time. Then, in the high-speed interval that follows, the + *     application will not be deemed as soft real-time, just because + *     it will do I/O at a high speed. And so on. + * + * Getting back to the filtering in item (a), in the following two + * cases this filtering might be easily passed by a greedy + * application, if the reference quantity was just + * bfqd->bfq_slice_idle: + * 1) HZ is so low that the duration of a jiffy is comparable to or + *    higher than bfqd->bfq_slice_idle. This happens, e.g., on slow + *    devices with HZ=100. The time granularity may be so coarse + *    that the approximation, in jiffies, of bfqd->bfq_slice_idle + *    is rather lower than the exact value.   * 2) jiffies, instead of increasing at a constant rate, may stop increasing   *    for a while, then suddenly 'jump' by several units to recover the lost   *    increments. This seems to happen, e.g., inside virtual machines. - * To address this issue, we do not use as a reference time interval just - * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In - * particular we add the minimum number of jiffies for which the filter - * seems to be quite precise also in embedded systems and KVM/QEMU virtual - * machines. + * To address this issue, in the filtering in (a) we do not use as a + * reference time interval just bfqd->bfq_slice_idle, but + * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the + * minimum number of jiffies for which the filter seems to be quite + * precise also in embedded systems and KVM/QEMU virtual machines.   */  static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,  						struct bfq_queue *bfqq)  { -	return max(bfqq->last_idle_bklogged + -		   HZ * bfqq->service_from_backlogged / -		   bfqd->bfq_wr_max_softrt_rate, -		   jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); +	return max3(bfqq->soft_rt_next_start, +		    bfqq->last_idle_bklogged + +		    HZ * bfqq->service_from_backlogged / +		    bfqd->bfq_wr_max_softrt_rate, +		    jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);  }  /** @@ -3000,17 +3198,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,  	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);  	/* -	 * Increase service_from_backlogged before next statement, -	 * because the possible next invocation of -	 * bfq_bfqq_charge_time would likely inflate -	 * entity->service. In contrast, service_from_backlogged must -	 * contain real service, to enable the soft real-time -	 * heuristic to correctly compute the bandwidth consumed by -	 * bfqq. -	 */ -	bfqq->service_from_backlogged += entity->service; - -	/*  	 * As above explained, charge slow (typically seeky) and  	 * timed-out queues with the time and not the service  	 * received, to favor sequential workloads. @@ -3535,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)  				bfqq->entity.prio_changed = 1;  			}  		} +		if (bfqq->wr_coeff > 1 && +		    bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && +		    bfqq->service_from_wr > max_service_from_wr) { +			/* see comments on max_service_from_wr */ +			bfq_bfqq_end_wr(bfqq); +		}  	}  	/*  	 * To improve latency (for this or other queues), immediately @@ -3630,8 +3823,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)  		}  		/* -		 * We exploit the put_rq_private hook to decrement -		 * rq_in_driver, but put_rq_private will not be +		 * We exploit the bfq_finish_request hook to decrement +		 * rq_in_driver, but bfq_finish_request will not be  		 * invoked on this request. So, to avoid unbalance,  		 * just start this request, without incrementing  		 * rq_in_driver. As a negative consequence, @@ -3640,14 +3833,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)  		 * bfq_schedule_dispatch to be invoked uselessly.  		 *  		 * As for implementing an exact solution, the -		 * put_request hook, if defined, is probably invoked -		 * also on this request. So, by exploiting this hook, -		 * we could 1) increment rq_in_driver here, and 2) -		 * decrement it in put_request. Such a solution would -		 * let the value of the counter be always accurate, -		 * but it would entail using an extra interface -		 * function. This cost seems higher than the benefit, -		 * being the frequency of non-elevator-private +		 * bfq_finish_request hook, if defined, is probably +		 * invoked also on this request. So, by exploiting +		 * this hook, we could 1) increment rq_in_driver here, +		 * and 2) decrement it in bfq_finish_request. Such a +		 * solution would let the value of the counter be +		 * always accurate, but it would entail using an extra +		 * interface function. This cost seems higher than the +		 * benefit, being the frequency of non-elevator-private  		 * requests very low.  		 */  		goto start_rq; @@ -3689,35 +3882,16 @@ exit:  	return rq;  } -static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -{ -	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -	struct request *rq;  #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) -	struct bfq_queue *in_serv_queue, *bfqq; -	bool waiting_rq, idle_timer_disabled; -#endif - -	spin_lock_irq(&bfqd->lock); - -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) -	in_serv_queue = bfqd->in_service_queue; -	waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); - -	rq = __bfq_dispatch_request(hctx); - -	idle_timer_disabled = -		waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); - -#else -	rq = __bfq_dispatch_request(hctx); -#endif -	spin_unlock_irq(&bfqd->lock); +static void bfq_update_dispatch_stats(struct request_queue *q, +				      struct request *rq, +				      struct bfq_queue *in_serv_queue, +				      bool idle_timer_disabled) +{ +	struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) -	bfqq = rq ? RQ_BFQQ(rq) : NULL;  	if (!idle_timer_disabled && !bfqq) -		return rq; +		return;  	/*  	 * rq and bfqq are guaranteed to exist until this function @@ -3732,7 +3906,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)  	 * In addition, the following queue lock guarantees that  	 * bfqq_group(bfqq) exists as well.  	 */ -	spin_lock_irq(hctx->queue->queue_lock); +	spin_lock_irq(q->queue_lock);  	if (idle_timer_disabled)  		/*  		 * Since the idle timer has been disabled, @@ -3751,9 +3925,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)  		bfqg_stats_set_start_empty_time(bfqg);  		bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);  	} -	spin_unlock_irq(hctx->queue->queue_lock); +	spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_dispatch_stats(struct request_queue *q, +					     struct request *rq, +					     struct bfq_queue *in_serv_queue, +					     bool idle_timer_disabled) {}  #endif +static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ +	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; +	struct request *rq; +	struct bfq_queue *in_serv_queue; +	bool waiting_rq, idle_timer_disabled; + +	spin_lock_irq(&bfqd->lock); + +	in_serv_queue = bfqd->in_service_queue; +	waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); + +	rq = __bfq_dispatch_request(hctx); + +	idle_timer_disabled = +		waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + +	spin_unlock_irq(&bfqd->lock); + +	bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, +				  idle_timer_disabled); +  	return rq;  } @@ -4002,10 +4204,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,  	bfqq->split_time = bfq_smallest_from_now();  	/* -	 * Set to the value for which bfqq will not be deemed as -	 * soft rt when it becomes backlogged. +	 * To not forget the possibly high bandwidth consumed by a +	 * process/queue in the recent past, +	 * bfq_bfqq_softrt_next_start() returns a value at least equal +	 * to the current value of bfqq->soft_rt_next_start (see +	 * comments on bfq_bfqq_softrt_next_start).  Set +	 * soft_rt_next_start to now, to mean that bfqq has consumed +	 * no bandwidth so far.  	 */ -	bfqq->soft_rt_next_start = bfq_greatest_from_now(); +	bfqq->soft_rt_next_start = jiffies;  	/* first request is almost certainly seeky */  	bfqq->seek_history = 1; @@ -4276,16 +4483,46 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)  	return idle_timer_disabled;  } +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) +static void bfq_update_insert_stats(struct request_queue *q, +				    struct bfq_queue *bfqq, +				    bool idle_timer_disabled, +				    unsigned int cmd_flags) +{ +	if (!bfqq) +		return; + +	/* +	 * bfqq still exists, because it can disappear only after +	 * either it is merged with another queue, or the process it +	 * is associated with exits. But both actions must be taken by +	 * the same process currently executing this flow of +	 * instructions. +	 * +	 * In addition, the following queue lock guarantees that +	 * bfqq_group(bfqq) exists as well. +	 */ +	spin_lock_irq(q->queue_lock); +	bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); +	if (idle_timer_disabled) +		bfqg_stats_update_idle_time(bfqq_group(bfqq)); +	spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_insert_stats(struct request_queue *q, +					   struct bfq_queue *bfqq, +					   bool idle_timer_disabled, +					   unsigned int cmd_flags) {} +#endif +  static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,  			       bool at_head)  {  	struct request_queue *q = hctx->queue;  	struct bfq_data *bfqd = q->elevator->elevator_data; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)  	struct bfq_queue *bfqq = RQ_BFQQ(rq);  	bool idle_timer_disabled = false;  	unsigned int cmd_flags; -#endif  	spin_lock_irq(&bfqd->lock);  	if (blk_mq_sched_try_insert_merge(q, rq)) { @@ -4304,7 +4541,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,  		else  			list_add_tail(&rq->queuelist, &bfqd->dispatch);  	} else { -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)  		idle_timer_disabled = __bfq_insert_request(bfqd, rq);  		/*  		 * Update bfqq, because, if a queue merge has occurred @@ -4312,9 +4548,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,  		 * redirected into a new queue.  		 */  		bfqq = RQ_BFQQ(rq); -#else -		__bfq_insert_request(bfqd, rq); -#endif  		if (rq_mergeable(rq)) {  			elv_rqhash_add(q, rq); @@ -4323,35 +4556,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,  		}  	} -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)  	/*  	 * Cache cmd_flags before releasing scheduler lock, because rq  	 * may disappear afterwards (for example, because of a request  	 * merge).  	 */  	cmd_flags = rq->cmd_flags; -#endif +  	spin_unlock_irq(&bfqd->lock); -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) -	if (!bfqq) -		return; -	/* -	 * bfqq still exists, because it can disappear only after -	 * either it is merged with another queue, or the process it -	 * is associated with exits. But both actions must be taken by -	 * the same process currently executing this flow of -	 * instruction. -	 * -	 * In addition, the following queue lock guarantees that -	 * bfqq_group(bfqq) exists as well. -	 */ -	spin_lock_irq(q->queue_lock); -	bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -	if (idle_timer_disabled) -		bfqg_stats_update_idle_time(bfqq_group(bfqq)); -	spin_unlock_irq(q->queue_lock); -#endif +	bfq_update_insert_stats(q, bfqq, idle_timer_disabled, +				cmd_flags);  }  static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, @@ -4482,7 +4697,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)  		bfq_schedule_dispatch(bfqd);  } -static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) +static void bfq_finish_request_body(struct bfq_queue *bfqq)  {  	bfqq->allocated--; @@ -4512,7 +4727,7 @@ static void bfq_finish_request(struct request *rq)  		spin_lock_irqsave(&bfqd->lock, flags);  		bfq_completed_request(bfqq, bfqd); -		bfq_put_rq_priv_body(bfqq); +		bfq_finish_request_body(bfqq);  		spin_unlock_irqrestore(&bfqd->lock, flags);  	} else { @@ -4533,7 +4748,7 @@ static void bfq_finish_request(struct request *rq)  			bfqg_stats_update_io_remove(bfqq_group(bfqq),  						    rq->cmd_flags);  		} -		bfq_put_rq_priv_body(bfqq); +		bfq_finish_request_body(bfqq);  	}  	rq->elv.priv[0] = NULL; @@ -4818,6 +5033,9 @@ static void bfq_exit_queue(struct elevator_queue *e)  	hrtimer_cancel(&bfqd->idle_slice_timer);  #ifdef CONFIG_BFQ_GROUP_IOSCHED +	/* release oom-queue reference to root group */ +	bfqg_and_blkg_put(bfqd->root_group); +  	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);  #else  	spin_lock_irq(&bfqd->lock); @@ -5206,6 +5424,7 @@ static struct elv_fs_entry bfq_attrs[] = {  static struct elevator_type iosched_bfq_mq = {  	.ops.mq = { +		.limit_depth		= bfq_limit_depth,  		.prepare_request	= bfq_prepare_request,  		.finish_request		= bfq_finish_request,  		.exit_icq		= bfq_exit_icq, | 
