qcacmn: Prefetch Tx HW desc, SW desc and SKB in pipeline fashion

Prefetch the Tx HW desc, SW desc and SKB in pipeline fashion in the Tx
completion path.
This improves the UDP DL CPU idle% by ~4.5%.

Change-Id: I48096e996cd835321ce2681d3981fa94c7189f54
This commit is contained in:
Neha Bisht
2021-10-28 12:22:36 +05:30
committed by Madan Koyyalamudi
parent d927fa2e18
commit 15b88ae15e
5 changed files with 143 additions and 10 deletions

View File

@@ -2193,7 +2193,9 @@ static inline void dp_srng_dst_inv_cached_descs(struct dp_soc *dp_soc,
}
#endif /* QCA_CACHED_RING_DESC */
#if defined(QCA_CACHED_RING_DESC) && defined(QCA_DP_RX_HW_SW_NBUF_DESC_PREFETCH)
#if defined(QCA_CACHED_RING_DESC) && \
(defined(QCA_DP_RX_HW_SW_NBUF_DESC_PREFETCH) || \
defined(QCA_DP_TX_HW_SW_NBUF_DESC_PREFETCH))
/**
* dp_srng_dst_prefetch() - Wrapper function to prefetch descs from dest ring
* @hal_soc_hdl: HAL SOC handle

View File

@@ -1,6 +1,6 @@
/*
* Copyright (c) 2016-2021 The Linux Foundation. All rights reserved.
* Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved.
* Copyright (c) 2021,2022 Qualcomm Innovation Center, Inc. All rights reserved.
*
* Permission to use, copy, modify, and/or distribute this software for
* any purpose with or without fee is hereby granted, provided that the
@@ -4360,6 +4360,35 @@ void dp_tx_update_peer_basic_stats(struct dp_peer *peer, uint32_t length,
}
#endif
/**
 * dp_tx_prefetch_next_nbuf_data() - Prefetch the nbuf of the next Tx desc
 * @next: SW Tx descriptor that follows the one being processed; may be NULL
 *
 * Issues prefetches for the nbuf attached to @next. Nothing is done when
 * @next is NULL, when @next has no nbuf attached, or when
 * QCA_DP_RX_NBUF_AND_NBUF_DATA_PREFETCH is not defined (empty stub).
 *
 * NOTE(review): this Tx helper is gated by an RX-named feature flag;
 * confirm that this is intentional.
 *
 * Return: none
 */
#ifdef QCA_DP_RX_NBUF_AND_NBUF_DATA_PREFETCH
static inline
void dp_tx_prefetch_next_nbuf_data(struct dp_tx_desc_s *next)
{
	qdf_nbuf_t skb;

	if (!next)
		return;

	skb = next->nbuf;
	if (!skb)
		return;

	/* start of the nbuf: skb->next and the first few bytes of skb->cb */
	qdf_prefetch(skb);

	/* len and users are fields located in other cachelines of the nbuf */
	qdf_prefetch(&skb->len);
	qdf_prefetch(&skb->users);
}
#else
static inline
void dp_tx_prefetch_next_nbuf_data(struct dp_tx_desc_s *next)
{
}
#endif
/**
* dp_tx_comp_process_desc_list() - Tx complete software descriptor handler
* @soc: core txrx main context
@@ -4385,6 +4414,9 @@ dp_tx_comp_process_desc_list(struct dp_soc *soc,
desc = comp_head;
while (desc) {
next = desc->next;
dp_tx_prefetch_next_nbuf_data(next);
if (peer_id != desc->peer_id) {
if (peer)
dp_peer_unref_delete(peer,
@@ -4409,7 +4441,6 @@ dp_tx_comp_process_desc_list(struct dp_soc *soc,
* Calling a QDF WRAPPER here is creating significant
* performance impact so avoided the wrapper call here
*/
next = desc->next;
dp_tx_desc_history_add(soc, desc->dma_addr, desc->nbuf,
desc->id, DP_TX_COMP_UNMAP);
qdf_nbuf_unmap_nbytes_single_paddr(soc->osdev,
@@ -4434,8 +4465,6 @@ dp_tx_comp_process_desc_list(struct dp_soc *soc,
dp_tx_comp_process_desc(soc, desc, &ts, peer);
next = desc->next;
dp_tx_desc_release(desc, desc->pool_id);
desc = next;
}
@@ -4511,6 +4540,9 @@ uint32_t dp_tx_comp_handler(struct dp_intr *int_ctx, struct dp_soc *soc,
uint32_t quota)
{
void *tx_comp_hal_desc;
void *last_prefetched_hw_desc = NULL;
struct dp_tx_desc_s *last_prefetched_sw_desc = NULL;
hal_soc_handle_t hal_soc;
uint8_t buffer_src;
struct dp_tx_desc_s *tx_desc = NULL;
struct dp_tx_desc_s *head_desc = NULL;
@@ -4525,6 +4557,8 @@ uint32_t dp_tx_comp_handler(struct dp_intr *int_ctx, struct dp_soc *soc,
DP_HIST_INIT();
more_data:
hal_soc = soc->hal_soc;
/* Re-initialize local variables to be re-used */
head_desc = NULL;
tail_desc = NULL;
@@ -4539,12 +4573,14 @@ more_data:
return 0;
}
num_avail_for_reap = hal_srng_dst_num_valid(soc->hal_soc, hal_ring_hdl, 0);
num_avail_for_reap = hal_srng_dst_num_valid(hal_soc, hal_ring_hdl, 0);
if (num_avail_for_reap >= quota)
num_avail_for_reap = quota;
dp_srng_dst_inv_cached_descs(soc, hal_ring_hdl, num_avail_for_reap);
last_prefetched_hw_desc = dp_srng_dst_prefetch(hal_soc, hal_ring_hdl,
num_avail_for_reap);
/* Find head descriptor from completion ring */
while (qdf_likely(num_avail_for_reap--)) {
@@ -4552,7 +4588,7 @@ more_data:
tx_comp_hal_desc = dp_srng_dst_get_next(soc, hal_ring_hdl);
if (qdf_unlikely(!tx_comp_hal_desc))
break;
buffer_src = hal_tx_comp_get_buffer_source(soc->hal_soc,
buffer_src = hal_tx_comp_get_buffer_source(hal_soc,
tx_comp_hal_desc);
/* If this buffer was not released by TQM or FW, then it is not
@@ -4578,7 +4614,7 @@ more_data:
* Tx completions, and should just be ignored
*/
wbm_internal_error = hal_get_wbm_internal_error(
soc->hal_soc,
hal_soc,
tx_comp_hal_desc);
if (wbm_internal_error) {
@@ -4689,6 +4725,12 @@ next_desc:
count++;
dp_tx_prefetch_hw_sw_nbuf_desc(soc, hal_soc,
num_avail_for_reap,
hal_ring_hdl,
&last_prefetched_hw_desc,
&last_prefetched_sw_desc);
if (dp_tx_comp_loop_pkt_limit_hit(soc, count, max_reap_limit))
break;
}

View File

@@ -1,6 +1,6 @@
/*
* Copyright (c) 2016-2021 The Linux Foundation. All rights reserved.
* Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved.
* Copyright (c) 2021,2022 Qualcomm Innovation Center, Inc. All rights reserved.
*
* Permission to use, copy, modify, and/or distribute this software for
* any purpose with or without fee is hereby granted, provided that the
@@ -388,6 +388,56 @@ static inline QDF_STATUS dp_tx_pdev_init(struct dp_pdev *pdev)
return QDF_STATUS_SUCCESS;
}
/**
 * dp_tx_prefetch_hw_sw_nbuf_desc() - Advance the Tx completion prefetch
 *				      pipeline by one step
 * @soc: DP soc handle
 * @hal_soc: HAL SOC handle
 * @num_avail_for_reap: descriptors still available for reap
 * @hal_ring_hdl: ring pointer
 * @last_prefetched_hw_desc: in/out, last prefetched HW descriptor
 * @last_prefetched_sw_desc: in/out, last prefetched SW descriptor
 *
 * Each call does two things:
 *  1. If a SW descriptor was recorded by an earlier call, prefetch the
 *     nbuf it points to (at offsets 0 and 64).
 *  2. If descriptors remain to be reaped and a HW descriptor has been
 *     prefetched, hand that HW descriptor to
 *     dp_tx_comp_get_prefetched_params_from_hal_desc() to update
 *     *@last_prefetched_sw_desc, then store the result of
 *     hal_srng_dst_prefetch_next_cached_desc() back into
 *     *@last_prefetched_hw_desc.
 *
 * Compiles to an empty stub when QCA_DP_TX_HW_SW_NBUF_DESC_PREFETCH is not
 * defined.
 *
 * Return: None
 */
#ifdef QCA_DP_TX_HW_SW_NBUF_DESC_PREFETCH
static inline
void dp_tx_prefetch_hw_sw_nbuf_desc(struct dp_soc *soc,
				    hal_soc_handle_t hal_soc,
				    uint32_t num_avail_for_reap,
				    hal_ring_handle_t hal_ring_hdl,
				    void **last_prefetched_hw_desc,
				    struct dp_tx_desc_s
				    **last_prefetched_sw_desc)
{
	struct dp_tx_desc_s *prev_sw_desc = *last_prefetched_sw_desc;
	uint8_t *nbuf_base;

	if (prev_sw_desc) {
		nbuf_base = (uint8_t *)prev_sw_desc->nbuf;
		qdf_prefetch(nbuf_base);
		/*
		 * 64 bytes further in - presumably the following cacheline;
		 * TODO confirm against the target's cacheline size
		 */
		qdf_prefetch(nbuf_base + 64);
	}

	/* Nothing left to reap or no HW descriptor to resolve */
	if (!num_avail_for_reap || !*last_prefetched_hw_desc)
		return;

	dp_tx_comp_get_prefetched_params_from_hal_desc(soc,
						       *last_prefetched_hw_desc,
						       last_prefetched_sw_desc);

	*last_prefetched_hw_desc =
		hal_srng_dst_prefetch_next_cached_desc(
					hal_soc,
					hal_ring_hdl,
					(uint8_t *)*last_prefetched_hw_desc);
}
#else
static inline
void dp_tx_prefetch_hw_sw_nbuf_desc(struct dp_soc *soc,
				    hal_soc_handle_t hal_soc,
				    uint32_t num_avail_for_reap,
				    hal_ring_handle_t hal_ring_hdl,
				    void **last_prefetched_hw_desc,
				    struct dp_tx_desc_s
				    **last_prefetched_sw_desc)
{
}
#endif
#ifndef FEATURE_WDS
static inline void dp_tx_mec_handler(struct dp_vdev *vdev, uint8_t *status)

View File

@@ -1677,6 +1677,7 @@ struct dp_arch_ops {
struct dp_tx_desc_s *tx_desc,
uint8_t *status,
uint8_t ring_id);
uint32_t (*dp_rx_process)(struct dp_intr *int_ctx,
hal_ring_handle_t hal_ring_hdl,
uint8_t reo_ring_num, uint32_t quota);
@@ -3751,4 +3752,19 @@ void dp_vdev_get_default_reo_hash(struct dp_vdev *vdev,
bool dp_reo_remap_config(struct dp_soc *soc, uint32_t *remap0,
uint32_t *remap1, uint32_t *remap2);
#ifdef QCA_DP_TX_HW_SW_NBUF_DESC_PREFETCH
/**
* dp_tx_comp_get_prefetched_params_from_hal_desc() - Get prefetched TX desc
* @soc: DP soc handle
* @tx_comp_hal_desc: HAL TX Comp Descriptor
* @r_tx_desc: output; set to the SW Tx Descriptor retrieved from the HAL desc
*
* Only declared when QCA_DP_TX_HW_SW_NBUF_DESC_PREFETCH is defined. The
* result is returned through @r_tx_desc rather than as a return value.
*
* Return: None
*/
void dp_tx_comp_get_prefetched_params_from_hal_desc(
struct dp_soc *soc,
void *tx_comp_hal_desc,
struct dp_tx_desc_s **r_tx_desc);
#endif
#endif /* _DP_TYPES_H_ */

View File

@@ -1,6 +1,6 @@
/*
* Copyright (c) 2021 The Linux Foundation. All rights reserved.
* Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved.
* Copyright (c) 2021,2022 Qualcomm Innovation Center, Inc. All rights reserved.
*
* Permission to use, copy, modify, and/or distribute this software for
* any purpose with or without fee is hereby granted, provided that the
@@ -22,6 +22,7 @@
#include <dp_htt.h>
#include "dp_li.h"
#include "dp_li_tx.h"
#include "dp_tx_desc.h"
#include "dp_li_rx.h"
#include "dp_peer.h"
@@ -429,3 +430,25 @@ void dp_initialize_arch_ops_li(struct dp_arch_ops *arch_ops)
arch_ops->txrx_print_peer_stats = dp_print_peer_txrx_stats_li;
}
#ifdef QCA_DP_TX_HW_SW_NBUF_DESC_PREFETCH
/**
 * dp_tx_comp_get_prefetched_params_from_hal_desc() - Resolve a HAL Tx
 *	completion descriptor to its SW Tx descriptor and prefetch it
 * @soc: DP soc handle
 * @tx_comp_hal_desc: HAL TX Comp Descriptor
 * @r_tx_desc: output; receives the SW Tx descriptor found for the HAL desc
 *
 * Reads the descriptor ID from @tx_comp_hal_desc, splits it into pool,
 * page and offset fields, looks the SW descriptor up with
 * dp_tx_desc_find(), stores it in *@r_tx_desc and issues a prefetch on it.
 *
 * Return: None
 */
void dp_tx_comp_get_prefetched_params_from_hal_desc(
					struct dp_soc *soc,
					void *tx_comp_hal_desc,
					struct dp_tx_desc_s **r_tx_desc)
{
	uint32_t desc_id = hal_tx_comp_get_desc_id(tx_comp_hal_desc);
	uint8_t pool = (desc_id & DP_TX_DESC_ID_POOL_MASK) >>
			DP_TX_DESC_ID_POOL_OS;
	uint32_t page = (desc_id & DP_TX_DESC_ID_PAGE_MASK) >>
			DP_TX_DESC_ID_PAGE_OS;
	uint32_t offset = (desc_id & DP_TX_DESC_ID_OFFSET_MASK) >>
			DP_TX_DESC_ID_OFFSET_OS;
	struct dp_tx_desc_s *sw_desc;

	/* Find Tx descriptor */
	sw_desc = dp_tx_desc_find(soc, pool, page, offset);
	*r_tx_desc = sw_desc;

	/* Issue a prefetch on the SW descriptor for the later completion pass */
	qdf_prefetch((uint8_t *)sw_desc);
}
#endif