From fafa3f57e3404c3e45c5a3c94fd50226896de470 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 25 Mar 2015 14:30:46 +0000 Subject: [PATCH] kernel: merge upstream bgmac driver improvements Signed-off-by: Felix Fietkau SVN-Revision: 44978 --- ...scriptor-frame-start-end-definitions.patch | 24 ++ ...gmac-implement-GRO-and-use-build_skb.patch | 189 +++++++++++++ ...mac-implement-scatter-gather-support.patch | 267 ++++++++++++++++++ 3 files changed, 480 insertions(+) create mode 100644 target/linux/generic/patches-3.18/077-01-bgmac-fix-descriptor-frame-start-end-definitions.patch create mode 100644 target/linux/generic/patches-3.18/077-02-bgmac-implement-GRO-and-use-build_skb.patch create mode 100644 target/linux/generic/patches-3.18/077-03-bgmac-implement-scatter-gather-support.patch diff --git a/target/linux/generic/patches-3.18/077-01-bgmac-fix-descriptor-frame-start-end-definitions.patch b/target/linux/generic/patches-3.18/077-01-bgmac-fix-descriptor-frame-start-end-definitions.patch new file mode 100644 index 0000000000..fdfae3aeff --- /dev/null +++ b/target/linux/generic/patches-3.18/077-01-bgmac-fix-descriptor-frame-start-end-definitions.patch @@ -0,0 +1,24 @@ +From: Felix Fietkau +Date: Mon, 23 Mar 2015 02:40:06 +0100 +Subject: [PATCH] bgmac: fix descriptor frame start/end definitions + +The start-of-frame and end-of-frame bits were accidentally swapped. +In the current code it does not make any difference, since they are +always used together. 
+ +Signed-off-by: Felix Fietkau +--- + +--- a/drivers/net/ethernet/broadcom/bgmac.h ++++ b/drivers/net/ethernet/broadcom/bgmac.h +@@ -345,8 +345,8 @@ + + #define BGMAC_DESC_CTL0_EOT 0x10000000 /* End of ring */ + #define BGMAC_DESC_CTL0_IOC 0x20000000 /* IRQ on complete */ +-#define BGMAC_DESC_CTL0_SOF 0x40000000 /* Start of frame */ +-#define BGMAC_DESC_CTL0_EOF 0x80000000 /* End of frame */ ++#define BGMAC_DESC_CTL0_EOF 0x40000000 /* End of frame */ ++#define BGMAC_DESC_CTL0_SOF 0x80000000 /* Start of frame */ + #define BGMAC_DESC_CTL1_LEN 0x00001FFF + + #define BGMAC_PHY_NOREGS 0x1E diff --git a/target/linux/generic/patches-3.18/077-02-bgmac-implement-GRO-and-use-build_skb.patch b/target/linux/generic/patches-3.18/077-02-bgmac-implement-GRO-and-use-build_skb.patch new file mode 100644 index 0000000000..3636fb619a --- /dev/null +++ b/target/linux/generic/patches-3.18/077-02-bgmac-implement-GRO-and-use-build_skb.patch @@ -0,0 +1,189 @@ +From: Felix Fietkau +Date: Mon, 23 Mar 2015 02:41:25 +0100 +Subject: [PATCH] bgmac: implement GRO and use build_skb + +This improves performance for routing and local rx + +Signed-off-by: Felix Fietkau +--- + +--- a/drivers/net/ethernet/broadcom/bgmac.c ++++ b/drivers/net/ethernet/broadcom/bgmac.c +@@ -276,31 +276,31 @@ static int bgmac_dma_rx_skb_for_slot(str + struct bgmac_slot_info *slot) + { + struct device *dma_dev = bgmac->core->dma_dev; +- struct sk_buff *skb; + dma_addr_t dma_addr; + struct bgmac_rx_header *rx; ++ void *buf; + + /* Alloc skb */ +- skb = netdev_alloc_skb(bgmac->net_dev, BGMAC_RX_BUF_SIZE); +- if (!skb) ++ buf = netdev_alloc_frag(BGMAC_RX_ALLOC_SIZE); ++ if (!buf) + return -ENOMEM; + + /* Poison - if everything goes fine, hardware will overwrite it */ +- rx = (struct bgmac_rx_header *)skb->data; ++ rx = buf; + rx->len = cpu_to_le16(0xdead); + rx->flags = cpu_to_le16(0xbeef); + + /* Map skb for the DMA */ +- dma_addr = dma_map_single(dma_dev, skb->data, +- BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE); ++ dma_addr = 
dma_map_single(dma_dev, buf, BGMAC_RX_BUF_SIZE, ++ DMA_FROM_DEVICE); + if (dma_mapping_error(dma_dev, dma_addr)) { + bgmac_err(bgmac, "DMA mapping error\n"); +- dev_kfree_skb(skb); ++ put_page(virt_to_head_page(buf)); + return -ENOMEM; + } + + /* Update the slot */ +- slot->skb = skb; ++ slot->buf = buf; + slot->dma_addr = dma_addr; + + return 0; +@@ -343,8 +343,9 @@ static int bgmac_dma_rx_read(struct bgma + while (ring->start != ring->end) { + struct device *dma_dev = bgmac->core->dma_dev; + struct bgmac_slot_info *slot = &ring->slots[ring->start]; +- struct sk_buff *skb = slot->skb; +- struct bgmac_rx_header *rx; ++ struct bgmac_rx_header *rx = slot->buf; ++ struct sk_buff *skb; ++ void *buf = slot->buf; + u16 len, flags; + + /* Unmap buffer to make it accessible to the CPU */ +@@ -352,7 +353,6 @@ static int bgmac_dma_rx_read(struct bgma + BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE); + + /* Get info from the header */ +- rx = (struct bgmac_rx_header *)skb->data; + len = le16_to_cpu(rx->len); + flags = le16_to_cpu(rx->flags); + +@@ -393,12 +393,13 @@ static int bgmac_dma_rx_read(struct bgma + dma_unmap_single(dma_dev, old_dma_addr, + BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE); + ++ skb = build_skb(buf, BGMAC_RX_ALLOC_SIZE); + skb_put(skb, BGMAC_RX_FRAME_OFFSET + len); + skb_pull(skb, BGMAC_RX_FRAME_OFFSET); + + skb_checksum_none_assert(skb); + skb->protocol = eth_type_trans(skb, bgmac->net_dev); +- netif_receive_skb(skb); ++ napi_gro_receive(&bgmac->napi, skb); + handled++; + } while (0); + +@@ -434,12 +435,11 @@ static bool bgmac_dma_unaligned(struct b + return false; + } + +-static void bgmac_dma_ring_free(struct bgmac *bgmac, +- struct bgmac_dma_ring *ring) ++static void bgmac_dma_tx_ring_free(struct bgmac *bgmac, ++ struct bgmac_dma_ring *ring) + { + struct device *dma_dev = bgmac->core->dma_dev; + struct bgmac_slot_info *slot; +- int size; + int i; + + for (i = 0; i < ring->num_slots; i++) { +@@ -451,23 +451,55 @@ static void bgmac_dma_ring_free(struct b + 
dev_kfree_skb(slot->skb); + } + } ++} + +- if (ring->cpu_base) { +- /* Free ring of descriptors */ +- size = ring->num_slots * sizeof(struct bgmac_dma_desc); +- dma_free_coherent(dma_dev, size, ring->cpu_base, +- ring->dma_base); ++static void bgmac_dma_rx_ring_free(struct bgmac *bgmac, ++ struct bgmac_dma_ring *ring) ++{ ++ struct device *dma_dev = bgmac->core->dma_dev; ++ struct bgmac_slot_info *slot; ++ int i; ++ ++ for (i = 0; i < ring->num_slots; i++) { ++ slot = &ring->slots[i]; ++ if (!slot->buf) ++ continue; ++ ++ if (slot->dma_addr) ++ dma_unmap_single(dma_dev, slot->dma_addr, ++ BGMAC_RX_BUF_SIZE, ++ DMA_FROM_DEVICE); ++ put_page(virt_to_head_page(slot->buf)); + } + } + ++static void bgmac_dma_ring_desc_free(struct bgmac *bgmac, ++ struct bgmac_dma_ring *ring) ++{ ++ struct device *dma_dev = bgmac->core->dma_dev; ++ int size; ++ ++ if (!ring->cpu_base) ++ return; ++ ++ /* Free ring of descriptors */ ++ size = ring->num_slots * sizeof(struct bgmac_dma_desc); ++ dma_free_coherent(dma_dev, size, ring->cpu_base, ++ ring->dma_base); ++} ++ + static void bgmac_dma_free(struct bgmac *bgmac) + { + int i; + +- for (i = 0; i < BGMAC_MAX_TX_RINGS; i++) +- bgmac_dma_ring_free(bgmac, &bgmac->tx_ring[i]); +- for (i = 0; i < BGMAC_MAX_RX_RINGS; i++) +- bgmac_dma_ring_free(bgmac, &bgmac->rx_ring[i]); ++ for (i = 0; i < BGMAC_MAX_TX_RINGS; i++) { ++ bgmac_dma_tx_ring_free(bgmac, &bgmac->tx_ring[i]); ++ bgmac_dma_ring_desc_free(bgmac, &bgmac->tx_ring[i]); ++ } ++ for (i = 0; i < BGMAC_MAX_RX_RINGS; i++) { ++ bgmac_dma_rx_ring_free(bgmac, &bgmac->rx_ring[i]); ++ bgmac_dma_ring_desc_free(bgmac, &bgmac->rx_ring[i]); ++ } + } + + static int bgmac_dma_alloc(struct bgmac *bgmac) +--- a/drivers/net/ethernet/broadcom/bgmac.h ++++ b/drivers/net/ethernet/broadcom/bgmac.h +@@ -362,6 +362,8 @@ + #define BGMAC_RX_FRAME_OFFSET 30 /* There are 2 unused bytes between header and real data */ + #define BGMAC_RX_MAX_FRAME_SIZE 1536 /* Copied from b44/tg3 */ + #define BGMAC_RX_BUF_SIZE 
(BGMAC_RX_FRAME_OFFSET + BGMAC_RX_MAX_FRAME_SIZE) ++#define BGMAC_RX_ALLOC_SIZE (SKB_DATA_ALIGN(BGMAC_RX_BUF_SIZE) + \ ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + + #define BGMAC_BFL_ENETROBO 0x0010 /* has ephy roboswitch spi */ + #define BGMAC_BFL_ENETADM 0x0080 /* has ADMtek switch */ +@@ -383,7 +385,10 @@ + #define ETHER_MAX_LEN 1518 + + struct bgmac_slot_info { +- struct sk_buff *skb; ++ union { ++ struct sk_buff *skb; ++ void *buf; ++ }; + dma_addr_t dma_addr; + }; + diff --git a/target/linux/generic/patches-3.18/077-03-bgmac-implement-scatter-gather-support.patch b/target/linux/generic/patches-3.18/077-03-bgmac-implement-scatter-gather-support.patch new file mode 100644 index 0000000000..5cb21a565a --- /dev/null +++ b/target/linux/generic/patches-3.18/077-03-bgmac-implement-scatter-gather-support.patch @@ -0,0 +1,267 @@ +From: Felix Fietkau +Date: Mon, 23 Mar 2015 02:42:26 +0100 +Subject: [PATCH] bgmac: implement scatter/gather support + +Always use software checksumming, since the hardware does not have any +checksum offload support. +This significantly improves local TCP tx performance. 
+ +Signed-off-by: Felix Fietkau +--- + +--- a/drivers/net/ethernet/broadcom/bgmac.c ++++ b/drivers/net/ethernet/broadcom/bgmac.c +@@ -115,53 +115,91 @@ static void bgmac_dma_tx_enable(struct b + bgmac_write(bgmac, ring->mmio_base + BGMAC_DMA_TX_CTL, ctl); + } + ++static void ++bgmac_dma_tx_add_buf(struct bgmac *bgmac, struct bgmac_dma_ring *ring, ++ int i, int len, u32 ctl0) ++{ ++ struct bgmac_slot_info *slot; ++ struct bgmac_dma_desc *dma_desc; ++ u32 ctl1; ++ ++ if (i == ring->num_slots - 1) ++ ctl0 |= BGMAC_DESC_CTL0_EOT; ++ ++ ctl1 = len & BGMAC_DESC_CTL1_LEN; ++ ++ slot = &ring->slots[i]; ++ dma_desc = &ring->cpu_base[i]; ++ dma_desc->addr_low = cpu_to_le32(lower_32_bits(slot->dma_addr)); ++ dma_desc->addr_high = cpu_to_le32(upper_32_bits(slot->dma_addr)); ++ dma_desc->ctl0 = cpu_to_le32(ctl0); ++ dma_desc->ctl1 = cpu_to_le32(ctl1); ++} ++ + static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac, + struct bgmac_dma_ring *ring, + struct sk_buff *skb) + { + struct device *dma_dev = bgmac->core->dma_dev; + struct net_device *net_dev = bgmac->net_dev; +- struct bgmac_dma_desc *dma_desc; +- struct bgmac_slot_info *slot; +- u32 ctl0, ctl1; ++ struct bgmac_slot_info *slot = &ring->slots[ring->end]; + int free_slots; ++ int nr_frags; ++ u32 flags; ++ int index = ring->end; ++ int i; + + if (skb->len > BGMAC_DESC_CTL1_LEN) { + bgmac_err(bgmac, "Too long skb (%d)\n", skb->len); +- goto err_stop_drop; ++ goto err_drop; + } + ++ if (skb->ip_summed == CHECKSUM_PARTIAL) ++ skb_checksum_help(skb); ++ ++ nr_frags = skb_shinfo(skb)->nr_frags; ++ + if (ring->start <= ring->end) + free_slots = ring->start - ring->end + BGMAC_TX_RING_SLOTS; + else + free_slots = ring->start - ring->end; +- if (free_slots == 1) { ++ ++ if (free_slots <= nr_frags + 1) { + bgmac_err(bgmac, "TX ring is full, queue should be stopped!\n"); + netif_stop_queue(net_dev); + return NETDEV_TX_BUSY; + } + +- slot = &ring->slots[ring->end]; +- slot->skb = skb; +- slot->dma_addr = dma_map_single(dma_dev, 
skb->data, skb->len, ++ slot->dma_addr = dma_map_single(dma_dev, skb->data, skb_headlen(skb), + DMA_TO_DEVICE); +- if (dma_mapping_error(dma_dev, slot->dma_addr)) { +- bgmac_err(bgmac, "Mapping error of skb on ring 0x%X\n", +- ring->mmio_base); +- goto err_stop_drop; +- } ++ if (unlikely(dma_mapping_error(dma_dev, slot->dma_addr))) ++ goto err_dma_head; + +- ctl0 = BGMAC_DESC_CTL0_IOC | BGMAC_DESC_CTL0_SOF | BGMAC_DESC_CTL0_EOF; +- if (ring->end == ring->num_slots - 1) +- ctl0 |= BGMAC_DESC_CTL0_EOT; +- ctl1 = skb->len & BGMAC_DESC_CTL1_LEN; ++ flags = BGMAC_DESC_CTL0_SOF; ++ if (!nr_frags) ++ flags |= BGMAC_DESC_CTL0_EOF | BGMAC_DESC_CTL0_IOC; ++ ++ bgmac_dma_tx_add_buf(bgmac, ring, index, skb_headlen(skb), flags); ++ flags = 0; ++ ++ for (i = 0; i < nr_frags; i++) { ++ struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i]; ++ int len = skb_frag_size(frag); ++ ++ index = (index + 1) % BGMAC_TX_RING_SLOTS; ++ slot = &ring->slots[index]; ++ slot->dma_addr = skb_frag_dma_map(dma_dev, frag, 0, ++ len, DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(dma_dev, slot->dma_addr))) ++ goto err_dma; + +- dma_desc = ring->cpu_base; +- dma_desc += ring->end; +- dma_desc->addr_low = cpu_to_le32(lower_32_bits(slot->dma_addr)); +- dma_desc->addr_high = cpu_to_le32(upper_32_bits(slot->dma_addr)); +- dma_desc->ctl0 = cpu_to_le32(ctl0); +- dma_desc->ctl1 = cpu_to_le32(ctl1); ++ if (i == nr_frags - 1) ++ flags |= BGMAC_DESC_CTL0_EOF | BGMAC_DESC_CTL0_IOC; ++ ++ bgmac_dma_tx_add_buf(bgmac, ring, index, len, flags); ++ } ++ ++ slot->skb = skb; + + netdev_sent_queue(net_dev, skb->len); + +@@ -170,20 +208,35 @@ static netdev_tx_t bgmac_dma_tx_add(stru + /* Increase ring->end to point empty slot. We tell hardware the first + * slot it should *not* read. 
+ */ +- if (++ring->end >= BGMAC_TX_RING_SLOTS) +- ring->end = 0; ++ ring->end = (index + 1) % BGMAC_TX_RING_SLOTS; + bgmac_write(bgmac, ring->mmio_base + BGMAC_DMA_TX_INDEX, + ring->index_base + + ring->end * sizeof(struct bgmac_dma_desc)); + +- /* Always keep one slot free to allow detecting bugged calls. */ +- if (--free_slots == 1) ++ free_slots -= nr_frags + 1; ++ if (free_slots < 8) + netif_stop_queue(net_dev); + + return NETDEV_TX_OK; + +-err_stop_drop: +- netif_stop_queue(net_dev); ++err_dma: /* undo the head mapping, then every fragment mapped so far */ ++ dma_unmap_single(dma_dev, ring->slots[ring->end].dma_addr, ++ skb_headlen(skb), DMA_TO_DEVICE); ++ ++ for (; i > 0; i--) { /* fragment i - 1 lives i slots after ring->end */ ++ int index = (ring->end + i) % BGMAC_TX_RING_SLOTS; ++ struct bgmac_slot_info *slot = &ring->slots[index]; ++ u32 ctl1 = le32_to_cpu(ring->cpu_base[index].ctl1); ++ int len = ctl1 & BGMAC_DESC_CTL1_LEN; ++ ++ dma_unmap_page(dma_dev, slot->dma_addr, len, DMA_TO_DEVICE); ++ } ++ ++err_dma_head: ++ bgmac_err(bgmac, "Mapping error of skb on ring 0x%X\n", ++ ring->mmio_base); ++ ++err_drop: + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } +@@ -205,32 +258,45 @@ static void bgmac_dma_tx_free(struct bgm + + while (ring->start != empty_slot) { + struct bgmac_slot_info *slot = &ring->slots[ring->start]; ++ struct bgmac_dma_desc *desc = &ring->cpu_base[ring->start]; ++ int len = le32_to_cpu(desc->ctl1) & BGMAC_DESC_CTL1_LEN; + +- if (slot->skb) { ++ if (!slot->dma_addr) { ++ bgmac_err(bgmac, "Hardware reported transmission for empty TX ring slot %d! End of ring: %d\n", ++ ring->start, ring->end); ++ goto next; ++ } ++ ++ if (le32_to_cpu(desc->ctl0) & BGMAC_DESC_CTL0_SOF) + /* Unmap no longer used buffer */ +- dma_unmap_single(dma_dev, slot->dma_addr, +- slot->skb->len, DMA_TO_DEVICE); +- slot->dma_addr = 0; ++ dma_unmap_single(dma_dev, slot->dma_addr, len, ++ DMA_TO_DEVICE); ++ else ++ dma_unmap_page(dma_dev, slot->dma_addr, len, ++ DMA_TO_DEVICE); + ++ if (slot->skb) { + bytes_compl += slot->skb->len; + pkts_compl++; + + /* Free memory!
:) */ + dev_kfree_skb(slot->skb); + slot->skb = NULL; +- } else { +- bgmac_err(bgmac, "Hardware reported transmission for empty TX ring slot %d! End of ring: %d\n", +- ring->start, ring->end); + } + ++next: ++ slot->dma_addr = 0; + if (++ring->start >= BGMAC_TX_RING_SLOTS) + ring->start = 0; + freed = true; + } + ++ if (!pkts_compl) ++ return; ++ + netdev_completed_queue(bgmac->net_dev, pkts_compl, bytes_compl); + +- if (freed && netif_queue_stopped(bgmac->net_dev)) ++ if (netif_queue_stopped(bgmac->net_dev)) + netif_wake_queue(bgmac->net_dev); + } + +@@ -439,17 +505,25 @@ static void bgmac_dma_tx_ring_free(struc + struct bgmac_dma_ring *ring) + { + struct device *dma_dev = bgmac->core->dma_dev; ++ struct bgmac_dma_desc *dma_desc = ring->cpu_base; + struct bgmac_slot_info *slot; + int i; + + for (i = 0; i < ring->num_slots; i++) { ++ int len = le32_to_cpu(dma_desc[i].ctl1) & BGMAC_DESC_CTL1_LEN; ++ + slot = &ring->slots[i]; +- if (slot->skb) { +- if (slot->dma_addr) +- dma_unmap_single(dma_dev, slot->dma_addr, +- slot->skb->len, DMA_TO_DEVICE); +- dev_kfree_skb(slot->skb); +- } ++ dev_kfree_skb(slot->skb); ++ ++ if (!slot->dma_addr) ++ continue; ++ ++ if (slot->skb) ++ dma_unmap_single(dma_dev, slot->dma_addr, ++ len, DMA_TO_DEVICE); ++ else ++ dma_unmap_page(dma_dev, slot->dma_addr, ++ len, DMA_TO_DEVICE); + } + } + +@@ -1583,6 +1657,10 @@ static int bgmac_probe(struct bcma_devic + goto err_dma_free; + } + ++ net_dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; ++ net_dev->hw_features = net_dev->features; ++ net_dev->vlan_features = net_dev->features; ++ + err = register_netdev(bgmac->net_dev); + if (err) { + bgmac_err(bgmac, "Cannot register net device\n");