Explorar o código

qcacmn: Prefetch RX HW desc, SW desc and SKB in pipeline fashion

Prefetch RX HW desc, SW desc and SKB in pipeline
fashion in the first loop of RX processing.

This has improved TPUT by 200Mbps and provided a
10% gain in CPU (single core)

PINE with other optimizations: 3960Mbps @ 100% core-3
PINE + pipeline prefetch: 4130Mbps @ 90%  core-3

Change-Id: I47f351601b264eb3a2b50e4154229d55da738724
Tallapragada Kalyan hai 3 anos
pai
achega
4e7ceff561
Modificáronse 4 ficheiros con 175 adicións e 0 borrados
  1. 24 0
      dp/wifi3.0/dp_internal.h
  2. 11 0
      dp/wifi3.0/li/dp_li_rx.c
  3. 71 0
      dp/wifi3.0/li/dp_li_rx.h
  4. 69 0
      hal/wifi3.0/hal_api.h

+ 24 - 0
dp/wifi3.0/dp_internal.h

@@ -2435,6 +2435,30 @@ static inline void dp_srng_dst_inv_cached_descs(struct dp_soc *dp_soc,
 }
 #endif /* QCA_CACHED_RING_DESC */
 
+#if defined(QCA_CACHED_RING_DESC) && defined(QCA_DP_RX_HW_SW_NBUF_DESC_PREFETCH)
+/**
+ * dp_srng_dst_prefetch() - Wrapper function to prefetch descs from dest ring
+ * @hal_soc: HAL SOC handle
+ * @hal_ring_hdl: opaque pointer to the HAL Rx Destination ring
+ * @num_entries: Entry count
+ *
+ * Thin pass-through to hal_srng_dst_prefetch(). When either
+ * QCA_CACHED_RING_DESC or QCA_DP_RX_HW_SW_NBUF_DESC_PREFETCH is not defined,
+ * the stub in the #else branch is built instead; it prefetches nothing and
+ * always returns NULL.
+ *
+ * Return: the pointer returned by hal_srng_dst_prefetch(), i.e. the last
+ *	   prefetched destination ring descriptor, or NULL
+ */
+static inline void *dp_srng_dst_prefetch(hal_soc_handle_t hal_soc,
+					 hal_ring_handle_t hal_ring_hdl,
+					 uint32_t num_entries)
+{
+	return hal_srng_dst_prefetch(hal_soc, hal_ring_hdl, num_entries);
+}
+#else /* descriptor prefetch support compiled out */
+static inline void *dp_srng_dst_prefetch(hal_soc_handle_t hal_soc,
+					 hal_ring_handle_t hal_ring_hdl,
+					 uint32_t num_entries)
+{
+	return NULL;
+}
+#endif /* QCA_CACHED_RING_DESC && QCA_DP_RX_HW_SW_NBUF_DESC_PREFETCH */
+
 #ifdef QCA_ENH_V3_STATS_SUPPORT
 /**
  * dp_pdev_print_delay_stats(): Print pdev level delay stats

+ 11 - 0
dp/wifi3.0/li/dp_li_rx.c

@@ -203,8 +203,10 @@ uint32_t dp_rx_process_li(struct dp_intr *int_ctx,
 			  uint32_t quota)
 {
 	hal_ring_desc_t ring_desc;
+	hal_ring_desc_t last_prefetched_hw_desc;
 	hal_soc_handle_t hal_soc;
 	struct dp_rx_desc *rx_desc = NULL;
+	struct dp_rx_desc *last_prefetched_sw_desc = NULL;
 	qdf_nbuf_t nbuf, next;
 	bool near_full;
 	union dp_rx_desc_list_elem_t *head[MAX_PDEV_CNT];
@@ -296,6 +298,9 @@ more_data:
 	if (num_pending > quota)
 		num_pending = quota;
 
+	last_prefetched_hw_desc = dp_srng_dst_prefetch(hal_soc, hal_ring_hdl,
+						       num_pending);
+
 	/*
 	 * start reaping the buffers from reo ring and queue
 	 * them in per vdev queue.
@@ -516,6 +521,12 @@ more_data:
 		dp_rx_add_to_free_desc_list(&head[rx_desc->pool_id],
 					    &tail[rx_desc->pool_id], rx_desc);
 		num_rx_bufs_reaped++;
+
+		dp_rx_prefetch_hw_sw_nbuf_desc(soc, hal_soc, num_pending,
+					       hal_ring_hdl,
+					       &last_prefetched_hw_desc,
+					       &last_prefetched_sw_desc);
+
 		/*
 		 * only if complete msdu is received for scatter case,
 		 * then allow break.

+ 71 - 0
dp/wifi3.0/li/dp_li_rx.h

@@ -130,4 +130,75 @@ void dp_rx_prefetch_nbuf_data(qdf_nbuf_t nbuf, qdf_nbuf_t next)
 {
 }
 #endif
+
+#ifdef QCA_DP_RX_HW_SW_NBUF_DESC_PREFETCH
+/**
+ * dp_rx_cookie_2_va_rxdma_buf_prefetch() - look up and prefetch the SW Rx desc
+ * @soc: Handle to DP Soc structure
+ * @cookie: cookie used to lookup virtual address
+ *
+ * The pool id and the index inside that pool are both decoded from @cookie.
+ * The descriptor is prefetched only when both values are within range.
+ *
+ * Return: prefetched Rx descriptor virtual address, or NULL when the pool id
+ *	   or the index decoded from @cookie is out of range
+ */
+static inline
+void *dp_rx_cookie_2_va_rxdma_buf_prefetch(struct dp_soc *soc, uint32_t cookie)
+{
+	uint8_t pool = DP_RX_DESC_COOKIE_POOL_ID_GET(cookie);
+	uint16_t idx = DP_RX_DESC_COOKIE_INDEX_GET(cookie);
+	struct rx_desc_pool *desc_pool;
+	void *sw_desc;
+
+	/* Reject cookies that name a pool outside the pool table */
+	if (qdf_unlikely(pool >= MAX_RXDESC_POOLS))
+		return NULL;
+
+	desc_pool = &soc->rx_desc_buf[pool];
+
+	/* Reject cookies whose index lies beyond the end of the pool */
+	if (qdf_unlikely(idx >= desc_pool->pool_size))
+		return NULL;
+
+	sw_desc = &desc_pool->array[idx].rx_desc;
+	qdf_prefetch(sw_desc);
+
+	return sw_desc;
+}
+
+/**
+ * dp_rx_prefetch_hw_sw_nbuf_desc() - function to prefetch HW and SW desc
+ * @soc: Handle to DP Soc structure
+ * @hal_soc: HAL SOC handle
+ * @num_entries: valid number of HW descriptors
+ * @hal_ring_hdl: Destination ring pointer
+ * @last_prefetched_hw_desc: input & output param of last prefetched HW desc
+ * @last_prefetched_sw_desc: input & output param of last prefetch SW desc
+ *
+ * Each call moves the prefetch pipeline forward by one step:
+ *  1. the nbuf of the SW descriptor found by the previous call is prefetched
+ *     (two prefetches, 64 bytes apart),
+ *  2. the SW descriptor referenced by the cookie of the last prefetched HW
+ *     descriptor is looked up and prefetched,
+ *  3. the HW descriptor following the last prefetched one is prefetched.
+ *
+ * Steps 2 and 3 are skipped when @num_entries is 0 or when there is no last
+ * prefetched HW descriptor (NULL). The latter happens when
+ * dp_srng_dst_prefetch() or hal_srng_dst_prefetch_next_cached_desc() returned
+ * NULL; in that case *@last_prefetched_sw_desc is cleared so that a later
+ * call does not prefetch the same nbuf again.
+ *
+ * Return: None
+ */
+static inline
+void dp_rx_prefetch_hw_sw_nbuf_desc(struct dp_soc *soc,
+				    hal_soc_handle_t hal_soc,
+				    uint32_t num_entries,
+				    hal_ring_handle_t hal_ring_hdl,
+				    hal_ring_desc_t *last_prefetched_hw_desc,
+				    struct dp_rx_desc **last_prefetched_sw_desc)
+{
+	struct dp_rx_desc *sw_desc = *last_prefetched_sw_desc;
+	hal_ring_desc_t hw_desc = *last_prefetched_hw_desc;
+
+	if (sw_desc) {
+		qdf_prefetch((uint8_t *)sw_desc->nbuf);
+		qdf_prefetch((uint8_t *)sw_desc->nbuf + 64);
+	}
+
+	if (!num_entries)
+		return;
+
+	/*
+	 * Without a valid HW descriptor there is no cookie to read and no
+	 * position to advance from; reading the cookie through a NULL
+	 * descriptor pointer must be avoided.
+	 */
+	if (qdf_unlikely(!hw_desc)) {
+		*last_prefetched_sw_desc = NULL;
+		return;
+	}
+
+	*last_prefetched_sw_desc =
+		dp_rx_cookie_2_va_rxdma_buf_prefetch(soc,
+				HAL_RX_REO_BUF_COOKIE_GET(hw_desc));
+	*last_prefetched_hw_desc =
+		hal_srng_dst_prefetch_next_cached_desc(hal_soc, hal_ring_hdl,
+						       (uint8_t *)hw_desc);
+}
+#else
+/**
+ * dp_rx_prefetch_hw_sw_nbuf_desc() - no-op variant used when
+ *				      QCA_DP_RX_HW_SW_NBUF_DESC_PREFETCH is
+ *				      not defined
+ * @soc: Handle to DP Soc structure
+ * @hal_soc: HAL SOC handle
+ * @num_entries: valid number of HW descriptors
+ * @hal_ring_hdl: Destination ring pointer
+ * @last_prefetched_hw_desc: pointer to the last prefetched HW descriptor
+ * @last_prefetched_sw_desc: pointer to the last prefetched SW descriptor
+ *
+ * None of the arguments is read or written.
+ *
+ * Return: None
+ */
+static inline
+void dp_rx_prefetch_hw_sw_nbuf_desc(struct dp_soc *soc,
+				    hal_soc_handle_t hal_soc,
+				    uint32_t num_entries,
+				    hal_ring_handle_t hal_ring_hdl,
+				    hal_ring_desc_t *last_prefetched_hw_desc,
+				    struct dp_rx_desc **last_prefetched_sw_desc)
+{
+}
+#endif
 #endif

+ 69 - 0
hal/wifi3.0/hal_api.h

@@ -2969,4 +2969,73 @@ hal_dmac_cmn_src_rxbuf_ring_get(hal_soc_handle_t hal_soc_hdl)
 
 	return hal_soc->dmac_cmn_src_rxbuf_ring;
 }
+
+/**
+ * hal_srng_dst_prefetch() - function to prefetch 4 destination ring descs
+ * @hal_soc_hdl: HAL SOC handle (not referenced by this function)
+ * @hal_ring_hdl: Destination ring pointer
+ * @num_valid: valid entries in the ring
+ *
+ * Prefetching starts at the entry that follows the current tail pointer and
+ * covers at most 4 entries (fewer when @num_valid is smaller), wrapping to
+ * the ring base when the end of the ring is reached.
+ *
+ * @num_valid is 32 bits wide so that the 32-bit count passed in by
+ * dp_srng_dst_prefetch() is not truncated.
+ *
+ * return: last prefetched destination ring descriptor; the descriptor at the
+ *	   tail pointer when @num_valid is 0; NULL when the tail pointer
+ *	   equals the cached head pointer
+ */
+static inline
+void *hal_srng_dst_prefetch(hal_soc_handle_t hal_soc_hdl,
+			    hal_ring_handle_t hal_ring_hdl,
+			    uint32_t num_valid)
+{
+	struct hal_srng *srng = (struct hal_srng *)hal_ring_hdl;
+	uint8_t *desc;
+	uint32_t cnt;
+	/*
+	 * prefetching 4 HW descriptors will ensure that at least by the time
+	 * the 5th HW descriptor is being processed it is guaranteed that the
+	 * 5th HW descriptor, its SW Desc, its nbuf and its nbuf's data
+	 * are in cache line. basically ensuring all the 4 (HW, SW, nbuf
+	 * & nbuf->data) are prefetched.
+	 */
+	uint32_t max_prefetch = 4;
+
+	if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
+		return NULL;
+
+	desc = (uint8_t *)&srng->ring_base_vaddr[srng->u.dst_ring.tp];
+
+	if (num_valid < max_prefetch)
+		max_prefetch = num_valid;
+
+	for (cnt = 0; cnt < max_prefetch; cnt++) {
+		/* step one ring entry forward, wrapping at the ring end */
+		desc += srng->entry_size * sizeof(uint32_t);
+		if (desc == ((uint8_t *)srng->ring_vaddr_end))
+			desc = (uint8_t *)&srng->ring_base_vaddr[0];
+
+		qdf_prefetch(desc);
+	}
+	return (void *)desc;
+}
+
+/**
+ * hal_srng_dst_prefetch_next_cached_desc() - function to prefetch next desc
+ * @hal_soc_hdl: HAL SOC handle (not referenced by this function)
+ * @hal_ring_hdl: Destination ring pointer
+ * @last_prefetched_hw_desc: last prefetched HW descriptor
+ *
+ * Steps one ring entry past @last_prefetched_hw_desc, wrapping to the ring
+ * base when the end of the ring is reached, and prefetches that entry.
+ *
+ * return: next prefetched destination descriptor, or NULL when the tail
+ *	   pointer equals the cached head pointer
+ */
+static inline
+void *hal_srng_dst_prefetch_next_cached_desc(hal_soc_handle_t hal_soc_hdl,
+					     hal_ring_handle_t hal_ring_hdl,
+					     uint8_t *last_prefetched_hw_desc)
+{
+	struct hal_srng *ring = (struct hal_srng *)hal_ring_hdl;
+	uint8_t *next_desc;
+
+	/* tail has caught up with the cached head: nothing to prefetch */
+	if (ring->u.dst_ring.tp == ring->u.dst_ring.cached_hp)
+		return NULL;
+
+	next_desc = last_prefetched_hw_desc +
+		    ring->entry_size * sizeof(uint32_t);
+
+	/* wrap around once the end of the ring is reached */
+	if (next_desc == (uint8_t *)ring->ring_vaddr_end)
+		next_desc = (uint8_t *)&ring->ring_base_vaddr[0];
+
+	qdf_prefetch(next_desc);
+
+	return (void *)next_desc;
+}
 #endif /* _HAL_APIH_ */