config root man

Current Path : /sys/amd64/compile/hs32/modules/usr/src/sys/modules/dtrace/prototype/@/dev/sfxge/

FreeBSD hs32.drive.ne.jp 9.1-RELEASE FreeBSD 9.1-RELEASE #1: Wed Jan 14 12:18:08 JST 2015 root@hs32.drive.ne.jp:/sys/amd64/compile/hs32 amd64
Upload File :
Current File : //sys/amd64/compile/hs32/modules/usr/src/sys/modules/dtrace/prototype/@/dev/sfxge/sfxge_rx.c

/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: release/9.1.0/sys/dev/sfxge/sfxge_rx.c 227569 2011-11-16 17:11:13Z philip $");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"


#include "sfxge.h"
#include "sfxge_rx.h"

#define RX_REFILL_THRESHOLD (EFX_RXQ_LIMIT(SFXGE_NDESCS) * 9 / 10)
#define RX_REFILL_THRESHOLD_2 (RX_REFILL_THRESHOLD / 2)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;

/* Maximum time (in ticks) that a connection can be idle before it's LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define SFXGE_LRO_L2_ID_VLAN 0x4000
#define SFXGE_LRO_L2_ID_IPV6 0x8000
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static __inline unsigned long ipv6_addr_cmp(const struct in6_addr *left,
					    const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

static inline struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_DONTWAIT);

	/* Allocate (and attach) packet buffer */
	if (m && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_DONTWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return m;
}

#define	SFXGE_REFILL_BATCH  64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	mtx_assert(&evq->lock, MA_OWNED);

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("rxfill > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));
	ntodo = min(EFX_RXQ_LIMIT(SFXGE_NDESCS) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("ntodo > EFX_RQX_LIMIT(SFXGE_NDESCS)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.header = m->m_data;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

#ifdef SFXGE_HAVE_MQ
	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
						       mtod(m, uint8_t *));
		m->m_flags |= M_FLOWID;
	}
#endif
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

#ifdef SFXGE_HAVE_MQ
	m->m_pkthdr.flowid = c->conn_hash;
	m->m_flags |= M_FLOWID;
#endif
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
		("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return 0;
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return 0;
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return 1;

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return 1;
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_DONTWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
        void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	mtx_assert(&evq->lock, MA_OWNED);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (rxq->init_state != SFXGE_RXQ_STARTED)
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < RX_REFILL_THRESHOLD)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);
}

static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	mtx_lock(&evq->lock);
	
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	mtx_unlock(&evq->lock);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	mtx_lock(&evq->lock);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);	

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));

	mtx_unlock(&evq->lock);
}

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS))) != 0)
		return rc;

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, SFXGE_NDESCS, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	mtx_lock(&evq->lock);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);

	mtx_unlock(&evq->lock);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));
	return rc;
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	/* Stop the receive queue(s) */
	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->intr.n_alloc;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
		("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->intr.n_alloc, ("index >= %d", sc->intr.n_alloc));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(SFXGE_NDESCS), esmp)) != 0)
		return (rc);
	(void)memset(esmp->esm_base, 0, EFX_RXQ_SIZE(SFXGE_NDESCS));

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(SFXGE_NDESCS),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * SFXGE_NDESCS,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->intr.n_alloc; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return SYSCTL_OUT(req, &sum, sizeof(sum));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0;
	     id < sizeof(sfxge_rx_stats) / sizeof(sfxge_rx_stats[0]);
	     id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */

	intr = &sc->intr;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	return (rc);
}

Man Man