kernel: backport GRO improvements
Improves network performance. Signed-off-by: Felix Fietkau <nbd@nbd.name>
This commit is contained in:
		| @@ -0,0 +1,78 @@ | ||||
| From: Alexander Lobakin <alobakin@dlink.ru> | ||||
| Date: Fri, 15 Nov 2019 12:11:35 +0300 | ||||
| Subject: [PATCH] net: core: allow fast GRO for skbs with Ethernet header in | ||||
|  head | ||||
|  | ||||
| Commit 78d3fd0b7de8 ("gro: Only use skb_gro_header for completely | ||||
| non-linear packets") back in May'09 (v2.6.31-rc1) has changed the | ||||
| original condition '!skb_headlen(skb)' to | ||||
| 'skb->mac_header == skb->tail' in gro_reset_offset() saying: "Since | ||||
| the drivers that need this optimisation all provide completely | ||||
| non-linear packets" (note that this condition has become the current | ||||
| 'skb_mac_header(skb) == skb_tail_pointer(skb)' later with commit | ||||
| ced14f6804a9 ("net: Correct comparisons and calculations using | ||||
| skb->tail and skb-transport_header") without any functional changes). | ||||
|  | ||||
| For now, we have the following rough statistics for v5.4-rc7: | ||||
| 1) napi_gro_frags: 14 | ||||
| 2) napi_gro_receive with skb->head containing (most of) payload: 83 | ||||
| 3) napi_gro_receive with skb->head containing all the headers: 20 | ||||
| 4) napi_gro_receive with skb->head containing only Ethernet header: 2 | ||||
|  | ||||
| With the current condition, fast GRO with the usage of | ||||
| NAPI_GRO_CB(skb)->frag0 is available only in the [1] case. | ||||
| Packets pushed by [2] and [3] go through the 'slow' path, but | ||||
| it's not a problem for them as they already contain all the needed | ||||
| headers in skb->head, so pskb_may_pull() only moves skb->data. | ||||
|  | ||||
| The layout of skbs in the fourth [4] case at the moment of | ||||
| dev_gro_receive() is identical to skbs that have come through [1], | ||||
| as napi_frags_skb() pulls Ethernet header to skb->head. The only | ||||
| difference is that the mentioned condition is always false for them, | ||||
| because skb_put() and friends irreversibly alter the tail pointer. | ||||
| They also go through the 'slow' path, but now every single | ||||
| pskb_may_pull() in every single .gro_receive() will call the *really* | ||||
| slow __pskb_pull_tail() to pull headers to head. This significantly | ||||
| decreases the overall performance for no visible reasons. | ||||
|  | ||||
| The only two users of method [4] are: | ||||
| * drivers/staging/qlge | ||||
| * drivers/net/wireless/iwlwifi (all three variants: dvm, mvm, mvm-mq) | ||||
|  | ||||
| Note that in case with wireless drivers we can't use [1] | ||||
| (napi_gro_frags()) at least for now and mac80211 stack always | ||||
| performs pushes and pulls anyways, so performance hit is unavoidable. | ||||
|  | ||||
| At the moment of v2.6.31 the mentioned change was necessary (that's | ||||
| why I don't add the "Fixes:" tag), but it became obsolete since | ||||
| skb_gro_mac_header() has gone in commit a50e233c50db ("net-gro: | ||||
| restore frag0 optimization"), so we can simply revert the condition | ||||
| in gro_reset_offset() to allow skbs from [4] go through the 'fast' | ||||
| path just like in case [1]. | ||||
|  | ||||
| This was tested on a 600 MHz MIPS CPU and a custom driver and this | ||||
| patch gave boosts up to 40 Mbps to method [4] in both directions | ||||
| compared to net-next, which made overall performance relatively | ||||
| close to [1] (without it, [4] is the slowest). | ||||
|  | ||||
| v2: | ||||
| - Add more references and explanations to commit message | ||||
| - Fix some typos ibid | ||||
| - No functional changes | ||||
|  | ||||
| Signed-off-by: Alexander Lobakin <alobakin@dlink.ru> | ||||
| Signed-off-by: David S. Miller <davem@davemloft.net> | ||||
| --- | ||||
|  | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -5403,8 +5403,7 @@ static void skb_gro_reset_offset(struct | ||||
|  	NAPI_GRO_CB(skb)->frag0 = NULL; | ||||
|  	NAPI_GRO_CB(skb)->frag0_len = 0; | ||||
|   | ||||
| -	if (skb_mac_header(skb) == skb_tail_pointer(skb) && | ||||
| -	    pinfo->nr_frags && | ||||
| +	if (!skb_headlen(skb) && pinfo->nr_frags && | ||||
|  	    !PageHighMem(skb_frag_page(frag0))) { | ||||
|  		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); | ||||
|  		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, | ||||
| @@ -0,0 +1,51 @@ | ||||
| From: Alexander Lobakin <alobakin@dlink.ru> | ||||
| Date: Mon, 14 Oct 2019 11:00:33 +0300 | ||||
| Subject: [PATCH] net: core: use listified Rx for GRO_NORMAL in | ||||
|  napi_gro_receive() | ||||
|  | ||||
| Commit 323ebb61e32b4 ("net: use listified RX for handling GRO_NORMAL | ||||
| skbs") made use of listified skb processing for the users of | ||||
| napi_gro_frags(). | ||||
| The same technique can be used in a way more common napi_gro_receive() | ||||
| to speed up non-merged (GRO_NORMAL) skbs for a wide range of drivers | ||||
| including gro_cells and mac80211 users. | ||||
| This slightly changes the return value in cases where skb is being | ||||
| dropped by the core stack, but it seems to have no impact on related | ||||
| drivers' functionality. | ||||
| gro_normal_batch is left untouched as it's very individual for every | ||||
| single system configuration and might be tuned in manual order to | ||||
| achieve an optimal performance. | ||||
|  | ||||
| Signed-off-by: Alexander Lobakin <alobakin@dlink.ru> | ||||
| Acked-by: Edward Cree <ecree@solarflare.com> | ||||
| Signed-off-by: David S. Miller <davem@davemloft.net> | ||||
| --- | ||||
|  | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -5601,12 +5601,13 @@ static void napi_skb_free_stolen_head(st | ||||
|  	kmem_cache_free(skbuff_head_cache, skb); | ||||
|  } | ||||
|   | ||||
| -static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) | ||||
| +static gro_result_t napi_skb_finish(struct napi_struct *napi, | ||||
| +				    struct sk_buff *skb, | ||||
| +				    gro_result_t ret) | ||||
|  { | ||||
|  	switch (ret) { | ||||
|  	case GRO_NORMAL: | ||||
| -		if (netif_receive_skb_internal(skb)) | ||||
| -			ret = GRO_DROP; | ||||
| +		gro_normal_one(napi, skb); | ||||
|  		break; | ||||
|   | ||||
|  	case GRO_DROP: | ||||
| @@ -5638,7 +5639,7 @@ gro_result_t napi_gro_receive(struct nap | ||||
|   | ||||
|  	skb_gro_reset_offset(skb); | ||||
|   | ||||
| -	ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); | ||||
| +	ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); | ||||
|  	trace_napi_gro_receive_exit(ret); | ||||
|   | ||||
|  	return ret; | ||||
| @@ -32,7 +32,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  	__u16			tc_index;	/* traffic control index */ | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -5469,6 +5469,9 @@ static enum gro_result dev_gro_receive(s | ||||
| @@ -5468,6 +5468,9 @@ static enum gro_result dev_gro_receive(s | ||||
|  	int same_flow; | ||||
|  	int grow; | ||||
|   | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Felix Fietkau
					Felix Fietkau