
qcacmn: Do Batched invalidate of tx completion descriptor

Do a batched invalidate of the tx completion descriptors to avoid
unnecessary D-cache misses on the 32-byte descriptors.

Change-Id: Ia580fe78dcef5b36f117aaad171a2df6d0e34966
Neha Bisht, 5 years ago
parent
commit 9aa9221c98
3 changed files with 194 additions and 38 deletions
  1. dp/wifi3.0/dp_internal.h  +49 -0
  2. dp/wifi3.0/dp_tx.c  +16 -6
  3. hal/wifi3.0/hal_api.h  +129 -32
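
To see why batching matters here: with 32-byte completion descriptors and a 64-byte D-cache line (the line size is an assumption for illustration; it is platform dependent), two descriptors share every cache line, so invalidating per descriptor as it is reaped roughly doubles the maintenance work of a single ranged invalidate over the whole batch. A minimal standalone sketch of that arithmetic:

#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE_BYTES 64	/* assumed D-cache line size */
#define DESC_BYTES	 32	/* tx completion descriptor size */

/* Maintenance ops when each descriptor is invalidated as it is reaped. */
static uint32_t inv_ops_per_desc(uint32_t ndesc)
{
	return ndesc;
}

/* Maintenance ops for one ranged invalidate over the whole batch:
 * each 64-byte cache line covers two 32-byte descriptors.
 */
static uint32_t inv_ops_batched(uint32_t ndesc)
{
	return (ndesc * DESC_BYTES + CACHE_LINE_BYTES - 1) / CACHE_LINE_BYTES;
}

int main(void)
{
	uint32_t quota = 64;	/* a typical per-pass reap budget */

	printf("per-desc: %u ops, batched: %u ops\n",
	       (unsigned)inv_ops_per_desc(quota),
	       (unsigned)inv_ops_batched(quota));
	return 0;
}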

+ 49 - 0
dp/wifi3.0/dp_internal.h

@@ -1761,6 +1761,55 @@ static inline void dp_srng_access_end(struct dp_intr *int_ctx,
 }
 #endif /* WLAN_FEATURE_DP_EVENT_HISTORY */
 
+#ifdef QCA_CACHED_RING_DESC
+/**
+ * dp_srng_dst_get_next() - Wrapper function to get next ring desc
+ * @dp_soc: DP SoC handle
+ * @hal_ring_hdl: opaque pointer to the HAL Destination Ring
+ *
+ * Return: HAL ring descriptor
+ */
+static inline void *dp_srng_dst_get_next(struct dp_soc *dp_soc,
+					 hal_ring_handle_t hal_ring_hdl)
+{
+	hal_soc_handle_t hal_soc = dp_soc->hal_soc;
+
+	return hal_srng_dst_get_next_cached(hal_soc, hal_ring_hdl);
+}
+
+/**
+ * dp_srng_dst_inv_cached_descs() - Wrapper function to invalidate cached
+ * descriptors
+ * @dp_soc: DP SoC handle
+ * @hal_ring_hdl: opaque pointer to the HAL Destination ring
+ * @num_entries: Entry count
+ *
+ * Return: None
+ */
+static inline void dp_srng_dst_inv_cached_descs(struct dp_soc *dp_soc,
+						hal_ring_handle_t hal_ring_hdl,
+						uint32_t num_entries)
+{
+	hal_soc_handle_t hal_soc = dp_soc->hal_soc;
+
+	hal_srng_dst_inv_cached_descs(hal_soc, hal_ring_hdl, num_entries);
+}
+#else
+static inline void *dp_srng_dst_get_next(struct dp_soc *dp_soc,
+					 hal_ring_handle_t hal_ring_hdl)
+{
+	hal_soc_handle_t hal_soc = dp_soc->hal_soc;
+
+	return hal_srng_dst_get_next(hal_soc, hal_ring_hdl);
+}
+
+static inline void dp_srng_dst_inv_cached_descs(struct dp_soc *dp_soc,
+						hal_ring_handle_t hal_ring_hdl,
+						uint32_t num_entries)
+{
+}
+#endif /* QCA_CACHED_RING_DESC */
+
 #ifdef QCA_ENH_V3_STATS_SUPPORT
 /**
  * dp_pdev_print_delay_stats(): Print pdev level delay stats
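
For orientation, here is a compressed sketch of how a caller is meant to combine these wrappers; the real caller is dp_tx_comp_handler() in dp_tx.c below, and this skeleton (dp_reap_sketch() and its body) is illustrative rather than driver code:

/* Hypothetical reap skeleton. With QCA_CACHED_RING_DESC defined,
 * dp_srng_dst_inv_cached_descs() issues one batched invalidate and
 * dp_srng_dst_get_next() skips per-entry cache syncs; without it,
 * the invalidate is a no-op and get_next() syncs every entry.
 */
static uint32_t dp_reap_sketch(struct dp_soc *soc,
			       hal_ring_handle_t hal_ring_hdl,
			       uint32_t quota)
{
	uint32_t num_avail, reaped = 0;
	void *desc;

	num_avail = hal_srng_dst_num_valid(soc->hal_soc, hal_ring_hdl, 0);
	if (num_avail > quota)
		num_avail = quota;

	/* one ranged invalidate for everything this pass will read */
	dp_srng_dst_inv_cached_descs(soc, hal_ring_hdl, num_avail);

	while (reaped < num_avail) {
		desc = dp_srng_dst_get_next(soc, hal_ring_hdl);
		if (!desc)
			break;
		/* ... process one tx completion descriptor ... */
		reaped++;
	}

	return reaped;
}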

+ 16 - 6
dp/wifi3.0/dp_tx.c

@@ -3599,6 +3599,7 @@ uint32_t dp_tx_comp_handler(struct dp_intr *int_ctx, struct dp_soc *soc,
 	struct dp_tx_desc_s *tail_desc = NULL;
 	uint32_t num_processed = 0;
 	uint32_t count = 0;
+	uint32_t num_avail_for_reap = 0;
 	bool force_break = false;
 
 	DP_HIST_INIT();
@@ -3612,9 +3613,18 @@ more_data:
 		return 0;
 	}
 
+	num_avail_for_reap = hal_srng_dst_num_valid(soc->hal_soc, hal_ring_hdl, 0);
+
+	if (num_avail_for_reap >= quota)
+		num_avail_for_reap = quota;
+
+	dp_srng_dst_inv_cached_descs(soc, hal_ring_hdl, num_avail_for_reap);
+
 	/* Find head descriptor from completion ring */
-	while (qdf_likely(tx_comp_hal_desc =
-			hal_srng_dst_get_next(soc->hal_soc, hal_ring_hdl))) {
+	while (qdf_likely(num_avail_for_reap--)) {
+		tx_comp_hal_desc = dp_srng_dst_get_next(soc, hal_ring_hdl);
+		if (qdf_unlikely(!tx_comp_hal_desc))
+			break;
 
 		buffer_src = hal_tx_comp_get_buffer_source(tx_comp_hal_desc);
 
@@ -3756,10 +3766,6 @@ more_data:
 		 * Processed packet count is more than given quota
 		 * stop to processing
 		 */
-		if (num_processed >= quota) {
-			force_break = true;
-			break;
-		}
 
 		count++;
 
@@ -3774,6 +3780,10 @@ more_data:
 		dp_tx_comp_process_desc_list(soc, head_desc, ring_id);
 
 	if (dp_tx_comp_enable_eol_data_check(soc)) {
+
+		if (num_processed >= quota)
+			force_break = true;
+
 		if (!force_break &&
 		    hal_srng_dst_peek_sync_locked(soc->hal_soc,
 						  hal_ring_hdl)) {
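
The reap budget above comes from hal_srng_dst_num_valid(). A standalone worked example of its wrap-around arithmetic (re-implemented here for illustration, with made-up values; offsets and sizes are in 32-bit words, as in struct hal_srng):

#include <assert.h>
#include <stdint.h>

/* Valid (unreaped) entries between the tail and head pointers. */
static uint32_t num_valid(uint32_t hp, uint32_t tp,
			  uint32_t ring_size, uint32_t entry_size)
{
	if (hp >= tp)
		return (hp - tp) / entry_size;

	return (ring_size - tp + hp) / entry_size;
}

int main(void)
{
	/* 8-word (32-byte) entries in a 2048-word ring (256 entries) */
	assert(num_valid(800, 640, 2048, 8) == 20);	/* no wrap */
	assert(num_valid(64, 1984, 2048, 8) == 16);	/* wrapped: 8 + 8 */
	return 0;
}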

+ 129 - 32
hal/wifi3.0/hal_api.h

@@ -986,50 +986,85 @@ static inline int hal_srng_access_start(hal_soc_handle_t hal_soc_hdl,
 }
 
 /**
- * hal_srng_dst_get_next - Get next entry from a destination ring and move
- * cached tail pointer
- *
+ * hal_srng_dst_get_next - Get next entry from a destination ring
  * @hal_soc: Opaque HAL SOC handle
  * @hal_ring_hdl: Destination ring pointer
  *
- * Return: Opaque pointer for next ring entry; NULL on failire
+ * Return: Opaque pointer for next ring entry; NULL on failure
  */
 static inline
 void *hal_srng_dst_get_next(void *hal_soc,
 			    hal_ring_handle_t hal_ring_hdl)
 {
 	struct hal_srng *srng = (struct hal_srng *)hal_ring_hdl;
-	struct hal_soc *soc = (struct hal_soc *)hal_soc;
 	uint32_t *desc;
-	uint32_t *desc_next;
-	uint32_t tp;
 
-	if (srng->u.dst_ring.tp != srng->u.dst_ring.cached_hp) {
-		desc = &(srng->ring_base_vaddr[srng->u.dst_ring.tp]);
-		/* TODO: Using % is expensive, but we have to do this since
-		 * size of some SRNG rings is not power of 2 (due to descriptor
-		 * sizes). Need to create separate API for rings used
-		 * per-packet, with sizes power of 2 (TCL2SW, REO2SW,
-		 * SW2RXDMA and CE rings)
-		 */
-		srng->u.dst_ring.tp = (srng->u.dst_ring.tp + srng->entry_size) %
-			srng->ring_size;
-
-		if (srng->flags & HAL_SRNG_CACHED_DESC) {
-			tp = srng->u.dst_ring.tp;
-			desc_next = &srng->ring_base_vaddr[tp];
-			qdf_mem_dma_cache_sync(soc->qdf_dev,
-					       qdf_mem_virt_to_phys(desc_next),
-					       QDF_DMA_FROM_DEVICE,
-					       (srng->entry_size *
-						sizeof(uint32_t)));
-			qdf_prefetch(desc_next);
-		}
+	if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
+		return NULL;
 
-		return (void *)desc;
+	desc = &srng->ring_base_vaddr[srng->u.dst_ring.tp];
+	/* The size of some SRNG rings is not a power of 2 (due to
+	 * descriptor sizes), so wrap tp with a compare instead of a
+	 * more expensive % operation. TODO: create a separate API for
+	 * rings used per-packet, with sizes power of 2 (TCL2SW, REO2SW,
+	 * SW2RXDMA and CE rings)
+	 */
+	srng->u.dst_ring.tp += srng->entry_size;
+	if (srng->u.dst_ring.tp == srng->ring_size)
+		srng->u.dst_ring.tp = 0;
+
+	if (srng->flags & HAL_SRNG_CACHED_DESC) {
+		struct hal_soc *soc = (struct hal_soc *)hal_soc;
+		uint32_t *desc_next;
+		uint32_t tp;
+
+		tp = srng->u.dst_ring.tp;
+		desc_next = &srng->ring_base_vaddr[tp];
+		qdf_mem_dma_cache_sync(soc->qdf_dev,
+				       qdf_mem_virt_to_phys(desc_next),
+				       QDF_DMA_FROM_DEVICE,
+				       (srng->entry_size *
+					sizeof(uint32_t)));
+		qdf_prefetch(desc_next);
 	}
 
-	return NULL;
+	return (void *)desc;
+}
+
+/**
+ * hal_srng_dst_get_next_cached - Get cached next entry
+ * @hal_soc: Opaque HAL SOC handle
+ * @hal_ring_hdl: Destination ring pointer
+ *
+ * Get next entry from a destination ring and move the cached tail pointer,
+ * without per-descriptor cache maintenance (the caller is expected to have
+ * invalidated the entries, e.g. via hal_srng_dst_inv_cached_descs())
+ *
+ * Return: Opaque pointer for next ring entry; NULL on failure
+ */
+static inline
+void *hal_srng_dst_get_next_cached(void *hal_soc,
+				   hal_ring_handle_t hal_ring_hdl)
+{
+	struct hal_srng *srng = (struct hal_srng *)hal_ring_hdl;
+	uint32_t *desc;
+	uint32_t *desc_next;
+
+	if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
+		return NULL;
+
+	desc = &srng->ring_base_vaddr[srng->u.dst_ring.tp];
+	/* The size of some SRNG rings is not a power of 2 (due to
+	 * descriptor sizes), so wrap tp with a compare instead of a
+	 * more expensive % operation. TODO: create a separate API for
+	 * rings used per-packet, with sizes power of 2 (TCL2SW, REO2SW,
+	 * SW2RXDMA and CE rings)
+	 */
+	srng->u.dst_ring.tp += srng->entry_size;
+	if (srng->u.dst_ring.tp == srng->ring_size)
+		srng->u.dst_ring.tp = 0;
+
+	desc_next = &srng->ring_base_vaddr[srng->u.dst_ring.tp];
+	qdf_prefetch(desc_next);
+	return (void *)desc;
 }
 
 /**
@@ -1148,8 +1183,70 @@ uint32_t hal_srng_dst_num_valid(void *hal_soc,
 
 	if (hp >= tp)
 		return (hp - tp) / srng->entry_size;
-	else
-		return (srng->ring_size - tp + hp) / srng->entry_size;
+
+	return (srng->ring_size - tp + hp) / srng->entry_size;
+}
+
+/**
+ * hal_srng_dst_inv_cached_descs - API to invalidate descriptors in batch mode
+ * @hal_soc: Opaque HAL SOC handle
+ * @hal_ring_hdl: Destination ring pointer
+ * @entry_count: Number of descriptors to be invalidated
+ *
+ * Invalidates a set of cached descriptors, starting from the current
+ * tail pointer and covering entry_count entries
+ *
+ * Return: None
+ */
+static inline void hal_srng_dst_inv_cached_descs(void *hal_soc,
+						 hal_ring_handle_t hal_ring_hdl,
+						 uint32_t entry_count)
+{
+	struct hal_srng *srng = (struct hal_srng *)hal_ring_hdl;
+	uint32_t hp = srng->u.dst_ring.cached_hp;
+	uint32_t tp = srng->u.dst_ring.tp;
+	uint32_t sync_p = 0;
+
+	/*
+	 * If SRNG does not have cached descriptors this
+	 * API call should be a no op
+	 */
+	if (!(srng->flags & HAL_SRNG_CACHED_DESC))
+		return;
+
+	if (qdf_unlikely(entry_count == 0))
+		return;
+
+	sync_p = (entry_count - 1) * srng->entry_size;
+
+	if (hp > tp) {
+		/* end pointer is exclusive: one entry past the last desc */
+		qdf_nbuf_dma_inv_range(&srng->ring_base_vaddr[tp],
+				       &srng->ring_base_vaddr[tp + sync_p +
+							      srng->entry_size]);
+	} else {
+		/*
+		 * We have wrapped around
+		 */
+		uint32_t wrap_cnt = ((srng->ring_size - tp) / srng->entry_size);
+
+		if (entry_count <= wrap_cnt) {
+			qdf_nbuf_dma_inv_range(&srng->ring_base_vaddr[tp],
+					       &srng->ring_base_vaddr[tp + sync_p +
+								      srng->entry_size]);
+			return;
+		}
+
+		entry_count -= wrap_cnt;
+		sync_p = (entry_count - 1) * srng->entry_size;
+
+		qdf_nbuf_dma_inv_range(&srng->ring_base_vaddr[tp],
+				       &srng->ring_base_vaddr[srng->ring_size]);
+
+		qdf_nbuf_dma_inv_range(&srng->ring_base_vaddr[0],
+				       &srng->ring_base_vaddr[sync_p +
+							      srng->entry_size]);
+	}
 }
 
 /**
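
The subtle part of hal_srng_dst_inv_cached_descs() is the wrapped case, where the window has to be split at the end of the ring. A standalone sketch of the range computation with concrete numbers (an illustrative re-implementation; inv_words() stands in for qdf_nbuf_dma_inv_range(), and offsets are 32-bit word indices):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for qdf_nbuf_dma_inv_range(): [start, end) word offsets. */
static void inv_words(uint32_t start, uint32_t end)
{
	printf("invalidate words [%u, %u)\n", (unsigned)start, (unsigned)end);
}

static void inv_cached_descs(uint32_t tp, uint32_t hp, uint32_t ring_size,
			     uint32_t entry_size, uint32_t entry_count)
{
	if (!entry_count)
		return;

	if (hp > tp) {
		/* contiguous window: entry_count entries from tp */
		inv_words(tp, tp + entry_count * entry_size);
		return;
	}

	/* wrapped window: split at the end of the ring */
	uint32_t tail_entries = (ring_size - tp) / entry_size;

	if (entry_count <= tail_entries) {
		inv_words(tp, tp + entry_count * entry_size);
		return;
	}

	inv_words(tp, ring_size);
	inv_words(0, (entry_count - tail_entries) * entry_size);
}

int main(void)
{
	/* 8-word entries in a 2048-word ring; reap 16 entries from
	 * tp=1984 with hp=64: 8 entries up to the ring end, then 8
	 * more from the ring base.
	 */
	inv_cached_descs(1984, 64, 2048, 8, 16);
	return 0;
}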