/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>

#include "mthca.h"
#include "doorbell.h"
#include "wqe.h"

enum {
	MTHCA_SEND_DOORBELL_FENCE = 1 << 5
};

static const uint8_t mthca_opcode[] = {
	[IBV_WR_SEND]                 = MTHCA_OPCODE_SEND,
	[IBV_WR_SEND_WITH_IMM]        = MTHCA_OPCODE_SEND_IMM,
	[IBV_WR_RDMA_WRITE]           = MTHCA_OPCODE_RDMA_WRITE,
	[IBV_WR_RDMA_WRITE_WITH_IMM]  = MTHCA_OPCODE_RDMA_WRITE_IMM,
	[IBV_WR_RDMA_READ]            = MTHCA_OPCODE_RDMA_READ,
	[IBV_WR_ATOMIC_CMP_AND_SWP]   = MTHCA_OPCODE_ATOMIC_CS,
	[IBV_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
};

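/*
 * Both work queues live in the QP's single WQE buffer: the receive
 * ring starts at offset 0 and the send ring at qp->send_wqe_offset,
 * with each ring's per-WQE stride given by its wqe_shift.
 */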
static void *get_recv_wqe(struct mthca_qp *qp, int n)
{
	return qp->buf.buf + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mthca_qp *qp, int n)
{
	return qp->buf.buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
}

void mthca_init_qp_indices(struct mthca_qp *qp)
{
	qp->sq.next_ind  = 0;
	qp->sq.last_comp = qp->sq.max - 1;
	qp->sq.head    	 = 0;
	qp->sq.tail    	 = 0;
	qp->sq.last      = get_send_wqe(qp, qp->sq.max - 1);

	qp->rq.next_ind	 = 0;
	qp->rq.last_comp = qp->rq.max - 1;
	qp->rq.head    	 = 0;
	qp->rq.tail    	 = 0;
	qp->rq.last      = get_recv_wqe(qp, qp->rq.max - 1);
}

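/*
 * Return nonzero if posting nreq more WQEs would overflow the work
 * queue.  The first head/tail check is a lock-free fast path; if the
 * queue looks full, the count is re-read under the CQ lock, since the
 * completion-processing code advances wq->tail while holding it.
 */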
static inline int wq_overflow(struct mthca_wq *wq, int nreq, struct mthca_cq *cq)
{
	unsigned cur;

	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max)
		return 0;

	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max;
}

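/*
 * Post a chain of send work requests on a Tavor-mode QP.  Each WQE is
 * built in the send ring and linked to its predecessor via the
 * next-segment nda_op/ee_nds words; one MMIO doorbell is rung at the
 * end for the whole chain.
 */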
int mthca_tavor_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
			  struct ibv_send_wr **bad_wr)
{
	struct mthca_qp *qp = to_mqp(ibqp);
	void *wqe, *prev_wqe;
	int ind;
	int nreq;
	int ret = 0;
	int size;
	int size0 = 0;
	int i;
	/*
	 * f0 and op0 cannot be used unless nreq > 0, which means this
	 * function makes it through the loop at least once.  So the
	 * code inside the if (!size0) will be executed, and f0 and
	 * op0 will be initialized.  So any gcc warning about "may be
	 * used uninitialized" is bogus.
	 */
	uint32_t f0;
	uint32_t op0;

	pthread_spin_lock(&qp->sq.lock);

	ind = qp->sq.next_ind;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		wqe = get_send_wqe(qp, ind);
		prev_wqe = qp->sq.last;
		qp->sq.last = wqe;

		((struct mthca_next_seg *) wqe)->nda_op = 0;
		((struct mthca_next_seg *) wqe)->ee_nds = 0;
		((struct mthca_next_seg *) wqe)->flags =
			((wr->send_flags & IBV_SEND_SIGNALED) ?
			 htonl(MTHCA_NEXT_CQ_UPDATE) : 0) |
			((wr->send_flags & IBV_SEND_SOLICITED) ?
			 htonl(MTHCA_NEXT_SOLICIT) : 0)   |
			htonl(1);
		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			((struct mthca_next_seg *) wqe)->imm = wr->imm_data;

		wqe += sizeof (struct mthca_next_seg);
		size = sizeof (struct mthca_next_seg) / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_RC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.atomic.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.atomic.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;

				wqe += sizeof (struct mthca_raddr_seg);

				if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
					((struct mthca_atomic_seg *) wqe)->swap_add =
						htonll(wr->wr.atomic.swap);
					((struct mthca_atomic_seg *) wqe)->compare =
						htonll(wr->wr.atomic.compare_add);
				} else {
					((struct mthca_atomic_seg *) wqe)->swap_add =
						htonll(wr->wr.atomic.compare_add);
					((struct mthca_atomic_seg *) wqe)->compare = 0;
				}

				wqe += sizeof (struct mthca_atomic_seg);
				size += (sizeof (struct mthca_raddr_seg) +
					 sizeof (struct mthca_atomic_seg)) / 16;
				break;

			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
			case IBV_WR_RDMA_READ:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.rdma.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.rdma.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;
				wqe += sizeof (struct mthca_raddr_seg);
				size += sizeof (struct mthca_raddr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
			}

			break;

		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.rdma.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.rdma.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;
				wqe += sizeof (struct mthca_raddr_seg);
				size += sizeof (struct mthca_raddr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
			}

			break;

		case IBV_QPT_UD:
			((struct mthca_tavor_ud_seg *) wqe)->lkey =
				htonl(to_mah(wr->wr.ud.ah)->key);
			((struct mthca_tavor_ud_seg *) wqe)->av_addr =
				htonll((uintptr_t) to_mah(wr->wr.ud.ah)->av);
			((struct mthca_tavor_ud_seg *) wqe)->dqpn =
				htonl(wr->wr.ud.remote_qpn);
			((struct mthca_tavor_ud_seg *) wqe)->qkey =
				htonl(wr->wr.ud.remote_qkey);

			wqe += sizeof (struct mthca_tavor_ud_seg);
			size += sizeof (struct mthca_tavor_ud_seg) / 16;
			break;

		default:
			break;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->send_flags & IBV_SEND_INLINE) {
			if (wr->num_sge) {
				struct mthca_inline_seg *seg = wqe;
				int s = 0;

				wqe += sizeof *seg;
				for (i = 0; i < wr->num_sge; ++i) {
					struct ibv_sge *sge = &wr->sg_list[i];

					s += sge->length;

					if (s > qp->max_inline_data) {
						ret = -1;
						*bad_wr = wr;
						goto out;
					}

					memcpy(wqe, (void *) (uintptr_t) sge->addr,
					       sge->length);
					wqe += sge->length;
				}

				seg->byte_count = htonl(MTHCA_INLINE_SEG | s);
				size += align(s + sizeof *seg, 16) / 16;
			}
		} else {
			struct mthca_data_seg *seg;

			for (i = 0; i < wr->num_sge; ++i) {
				seg = wqe;
				seg->byte_count = htonl(wr->sg_list[i].length);
				seg->lkey = htonl(wr->sg_list[i].lkey);
				seg->addr = htonll(wr->sg_list[i].addr);
				wqe += sizeof *seg;
			}

			size += wr->num_sge * (sizeof *seg / 16);
		}

		qp->wrid[ind + qp->rq.max] = wr->wr_id;

		if (wr->opcode >= sizeof mthca_opcode / sizeof mthca_opcode[0]) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		((struct mthca_next_seg *) prev_wqe)->nda_op =
			htonl(((ind << qp->sq.wqe_shift) +
			       qp->send_wqe_offset) |
			      mthca_opcode[wr->opcode]);
		/*
		 * Make sure that nda_op is written before setting ee_nds.
		 */
		wmb();
		((struct mthca_next_seg *) prev_wqe)->ee_nds =
			htonl((size0 ? 0 : MTHCA_NEXT_DBD) | size |
			((wr->send_flags & IBV_SEND_FENCE) ?
			 MTHCA_NEXT_FENCE : 0));

		if (!size0) {
			size0 = size;
			op0   = mthca_opcode[wr->opcode];
			f0    = wr->send_flags & IBV_SEND_FENCE ?
				MTHCA_SEND_DOORBELL_FENCE : 0;
		}

		++ind;
		if (ind >= qp->sq.max)
			ind -= qp->sq.max;
	}

out:
	if (nreq) {
		uint32_t doorbell[2];

		doorbell[0] = htonl(((qp->sq.next_ind << qp->sq.wqe_shift) +
				     qp->send_wqe_offset) | f0 | op0);
		doorbell[1] = htonl((ibqp->qp_num << 8) | size0);

		mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);
	}

	qp->sq.next_ind = ind;
	qp->sq.head    += nreq;

	pthread_spin_unlock(&qp->sq.lock);
	return ret;
}

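/*
 * Post receive work requests on a Tavor-mode QP.  WQEs are linked
 * into the receive ring, and the receive doorbell is rung every
 * MTHCA_TAVOR_MAX_WQES_PER_RECV_DB WQEs, with one final doorbell for
 * any remainder on the way out.
 */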
int mthca_tavor_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
			  struct ibv_recv_wr **bad_wr)
{
	struct mthca_qp *qp = to_mqp(ibqp);
	uint32_t doorbell[2];
	int ret = 0;
	int nreq;
	int i;
	int size;
	int size0 = 0;
	int ind;
	void *wqe;
	void *prev_wqe;

	pthread_spin_lock(&qp->rq.lock);

	ind = qp->rq.next_ind;

	for (nreq = 0; wr; wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		wqe = get_recv_wqe(qp, ind);
		prev_wqe = qp->rq.last;
		qp->rq.last = wqe;

		((struct mthca_next_seg *) wqe)->ee_nds =
			htonl(MTHCA_NEXT_DBD);
		((struct mthca_next_seg *) wqe)->flags =
			htonl(MTHCA_NEXT_CQ_UPDATE);

		wqe += sizeof (struct mthca_next_seg);
		size = sizeof (struct mthca_next_seg) / 16;

		if (wr->num_sge > qp->rq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		for (i = 0; i < wr->num_sge; ++i) {
			((struct mthca_data_seg *) wqe)->byte_count =
				htonl(wr->sg_list[i].length);
			((struct mthca_data_seg *) wqe)->lkey =
				htonl(wr->sg_list[i].lkey);
			((struct mthca_data_seg *) wqe)->addr =
				htonll(wr->sg_list[i].addr);
			wqe += sizeof (struct mthca_data_seg);
			size += sizeof (struct mthca_data_seg) / 16;
		}

		qp->wrid[ind] = wr->wr_id;

		((struct mthca_next_seg *) prev_wqe)->ee_nds =
			htonl(MTHCA_NEXT_DBD | size);

		if (!size0)
			size0 = size;

		++ind;
		if (ind >= qp->rq.max)
			ind -= qp->rq.max;

		++nreq;
		if (nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB) {
			nreq = 0;

			doorbell[0] = htonl((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
			doorbell[1] = htonl(ibqp->qp_num << 8);

			/*
			 * Make sure that descriptors are written
			 * before doorbell is rung.
			 */
			wmb();

			mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_RECV_DOORBELL);

			qp->rq.next_ind = ind;
			qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;
			size0 = 0;
		}
	}

out:
	if (nreq) {
		doorbell[0] = htonl((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
		doorbell[1] = htonl((ibqp->qp_num << 8) | nreq);

		/*
		 * Make sure that descriptors are written before
		 * doorbell is rung.
		 */
		wmb();

		mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_RECV_DOORBELL);
	}

	qp->rq.next_ind = ind;
	qp->rq.head    += nreq;

	pthread_spin_unlock(&qp->rq.lock);
	return ret;
}

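/*
 * Post send work requests on an Arbel (mem-free) mode QP.  Besides
 * the MMIO doorbell, the in-memory doorbell record (*qp->sq.db) must
 * be updated first; the wmb() pairs below enforce descriptor ->
 * doorbell record -> MMIO ordering.  Long chains are flushed every
 * MTHCA_ARBEL_MAX_WQES_PER_SEND_DB WQEs.
 */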
int mthca_arbel_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
			  struct ibv_send_wr **bad_wr)
{
	struct mthca_qp *qp = to_mqp(ibqp);
	uint32_t doorbell[2];
	void *wqe, *prev_wqe;
	int ind;
	int nreq;
	int ret = 0;
	int size;
	int size0 = 0;
	int i;
	/*
	 * f0 and op0 cannot be used unless nreq > 0, which means this
	 * function makes it through the loop at least once.  So the
	 * code inside the if (!size0) will be executed, and f0 and
	 * op0 will be initialized.  So any gcc warning about "may be
	 * used uninitialized" is bogus.
	 */
	uint32_t f0;
	uint32_t op0;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head & (qp->sq.max - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB) {
			nreq = 0;

			doorbell[0] = htonl((MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
					    ((qp->sq.head & 0xffff) << 8) | f0 | op0);
			doorbell[1] = htonl((ibqp->qp_num << 8) | size0);

			qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;

			/*
			 * Make sure that descriptors are written before
			 * doorbell record.
			 */
			wmb();
			*qp->sq.db = htonl(qp->sq.head & 0xffff);

			/*
			 * Make sure doorbell record is written before we
			 * write MMIO send doorbell.
			 */
			wmb();
			mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);

			size0 = 0;
		}

		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		wqe = get_send_wqe(qp, ind);
		prev_wqe = qp->sq.last;
		qp->sq.last = wqe;

		((struct mthca_next_seg *) wqe)->flags =
			((wr->send_flags & IBV_SEND_SIGNALED) ?
			 htonl(MTHCA_NEXT_CQ_UPDATE) : 0) |
			((wr->send_flags & IBV_SEND_SOLICITED) ?
			 htonl(MTHCA_NEXT_SOLICIT) : 0)   |
			htonl(1);
		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			((struct mthca_next_seg *) wqe)->imm = wr->imm_data;

		wqe += sizeof (struct mthca_next_seg);
		size = sizeof (struct mthca_next_seg) / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_RC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.atomic.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.atomic.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;

				wqe += sizeof (struct mthca_raddr_seg);

				if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
					((struct mthca_atomic_seg *) wqe)->swap_add =
						htonll(wr->wr.atomic.swap);
					((struct mthca_atomic_seg *) wqe)->compare =
						htonll(wr->wr.atomic.compare_add);
				} else {
					((struct mthca_atomic_seg *) wqe)->swap_add =
						htonll(wr->wr.atomic.compare_add);
					((struct mthca_atomic_seg *) wqe)->compare = 0;
				}

				wqe += sizeof (struct mthca_atomic_seg);
				size += (sizeof (struct mthca_raddr_seg) +
					 sizeof (struct mthca_atomic_seg)) / 16;
				break;

			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
			case IBV_WR_RDMA_READ:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.rdma.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.rdma.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;
				wqe += sizeof (struct mthca_raddr_seg);
				size += sizeof (struct mthca_raddr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
			}

			break;

		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.rdma.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.rdma.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;
				wqe += sizeof (struct mthca_raddr_seg);
				size += sizeof (struct mthca_raddr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
			}

			break;

		case IBV_QPT_UD:
			memcpy(((struct mthca_arbel_ud_seg *) wqe)->av,
			       to_mah(wr->wr.ud.ah)->av, sizeof (struct mthca_av));
			((struct mthca_arbel_ud_seg *) wqe)->dqpn =
				htonl(wr->wr.ud.remote_qpn);
			((struct mthca_arbel_ud_seg *) wqe)->qkey =
				htonl(wr->wr.ud.remote_qkey);

			wqe += sizeof (struct mthca_arbel_ud_seg);
			size += sizeof (struct mthca_arbel_ud_seg) / 16;
			break;

		default:
			break;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->send_flags & IBV_SEND_INLINE) {
			if (wr->num_sge) {
				struct mthca_inline_seg *seg = wqe;
				int s = 0;

				wqe += sizeof *seg;
				for (i = 0; i < wr->num_sge; ++i) {
					struct ibv_sge *sge = &wr->sg_list[i];

					s += sge->length;

					if (s > qp->max_inline_data) {
						ret = -1;
						*bad_wr = wr;
						goto out;
					}

					memcpy(wqe, (void *) (uintptr_t) sge->addr,
					       sge->length);
					wqe += sge->length;
				}

				seg->byte_count = htonl(MTHCA_INLINE_SEG | s);
				size += align(s + sizeof *seg, 16) / 16;
			}
		} else {
			struct mthca_data_seg *seg;

			for (i = 0; i < wr->num_sge; ++i) {
				seg = wqe;
				seg->byte_count = htonl(wr->sg_list[i].length);
				seg->lkey = htonl(wr->sg_list[i].lkey);
				seg->addr = htonll(wr->sg_list[i].addr);
				wqe += sizeof *seg;
			}

			size += wr->num_sge * (sizeof *seg / 16);
		}

		qp->wrid[ind + qp->rq.max] = wr->wr_id;

		if (wr->opcode >= sizeof mthca_opcode / sizeof mthca_opcode[0]) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		((struct mthca_next_seg *) prev_wqe)->nda_op =
			htonl(((ind << qp->sq.wqe_shift) +
			       qp->send_wqe_offset) |
			      mthca_opcode[wr->opcode]);
		wmb();
		((struct mthca_next_seg *) prev_wqe)->ee_nds =
			htonl(MTHCA_NEXT_DBD | size |
			      ((wr->send_flags & IBV_SEND_FENCE) ?
			       MTHCA_NEXT_FENCE : 0));

		if (!size0) {
			size0 = size;
			op0   = mthca_opcode[wr->opcode];
			f0    = wr->send_flags & IBV_SEND_FENCE ?
				MTHCA_SEND_DOORBELL_FENCE : 0;
		}

		++ind;
		if (ind >= qp->sq.max)
			ind -= qp->sq.max;
	}

out:
	if (nreq) {
		doorbell[0] = htonl((nreq << 24)                  |
				    ((qp->sq.head & 0xffff) << 8) |
				    f0 | op0);
		doorbell[1] = htonl((ibqp->qp_num << 8) | size0);

		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();
		*qp->sq.db = htonl(qp->sq.head & 0xffff);

		/*
		 * Make sure doorbell record is written before we
		 * write MMIO send doorbell.
		 */
		wmb();
		mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);
	}

	pthread_spin_unlock(&qp->sq.lock);
	return ret;
}

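/*
 * Post receive work requests on an Arbel (mem-free) mode QP.  No MMIO
 * doorbell is used here; only the doorbell record (*qp->rq.db) is
 * updated once the WQEs are written.
 */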
int mthca_arbel_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
			  struct ibv_recv_wr **bad_wr)
{
	struct mthca_qp *qp = to_mqp(ibqp);
	int ret = 0;
	int nreq;
	int ind;
	int i;
	void *wqe;

	pthread_spin_lock(&qp->rq.lock);

	/* XXX check that state is OK to post receive */

	ind = qp->rq.head & (qp->rq.max - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		wqe = get_recv_wqe(qp, ind);

		((struct mthca_next_seg *) wqe)->flags = 0;

		wqe += sizeof (struct mthca_next_seg);

		if (wr->num_sge > qp->rq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		for (i = 0; i < wr->num_sge; ++i) {
			((struct mthca_data_seg *) wqe)->byte_count =
				htonl(wr->sg_list[i].length);
			((struct mthca_data_seg *) wqe)->lkey =
				htonl(wr->sg_list[i].lkey);
			((struct mthca_data_seg *) wqe)->addr =
				htonll(wr->sg_list[i].addr);
			wqe += sizeof (struct mthca_data_seg);
		}

		if (i < qp->rq.max_gs) {
			((struct mthca_data_seg *) wqe)->byte_count = 0;
			((struct mthca_data_seg *) wqe)->lkey = htonl(MTHCA_INVAL_LKEY);
			((struct mthca_data_seg *) wqe)->addr = 0;
		}

		qp->wrid[ind] = wr->wr_id;

		++ind;
		if (ind >= qp->rq.max)
			ind -= qp->rq.max;
	}
out:
	if (nreq) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();
		*qp->rq.db = htonl(qp->rq.head & 0xffff);
	}

	pthread_spin_unlock(&qp->rq.lock);
	return ret;
}

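/*
 * Size and allocate a QP's WQE buffer.  The receive and send WQE
 * strides are each rounded up to a power of two (at least 64 bytes)
 * large enough for the worst-case segment layout, and the send ring
 * is placed after the receive ring at send_wqe_offset.
 */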
int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type, struct mthca_qp *qp)
{
	int size;
	int max_sq_sge;
	struct mthca_next_seg *next;
	int i;

	qp->rq.max_gs 	 = cap->max_recv_sge;
	qp->sq.max_gs 	 = cap->max_send_sge;
	max_sq_sge 	 = align(cap->max_inline_data + sizeof (struct mthca_inline_seg),
				 sizeof (struct mthca_data_seg)) / sizeof (struct mthca_data_seg);
	if (max_sq_sge < cap->max_send_sge)
		max_sq_sge = cap->max_send_sge;

	qp->wrid = malloc((qp->rq.max + qp->sq.max) * sizeof (uint64_t));
	if (!qp->wrid)
		return -1;

	size = sizeof (struct mthca_next_seg) +
		qp->rq.max_gs * sizeof (struct mthca_data_seg);

	for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
	     qp->rq.wqe_shift++)
		; /* nothing */

	size = max_sq_sge * sizeof (struct mthca_data_seg);
	switch (type) {
	case IBV_QPT_UD:
		size += mthca_is_memfree(pd->context) ?
			sizeof (struct mthca_arbel_ud_seg) :
			sizeof (struct mthca_tavor_ud_seg);
		break;

	case IBV_QPT_UC:
		size += sizeof (struct mthca_raddr_seg);
		break;

	case IBV_QPT_RC:
		size += sizeof (struct mthca_raddr_seg);
		/*
		 * An atomic op will require an atomic segment, a
		 * remote address segment and one scatter entry.
		 */
		if (size < (sizeof (struct mthca_atomic_seg) +
			    sizeof (struct mthca_raddr_seg) +
			    sizeof (struct mthca_data_seg)))
			size = (sizeof (struct mthca_atomic_seg) +
				sizeof (struct mthca_raddr_seg) +
				sizeof (struct mthca_data_seg));
		break;

	default:
		break;
	}

	/* Make sure that we have enough space for a bind request */
	if (size < sizeof (struct mthca_bind_seg))
		size = sizeof (struct mthca_bind_seg);

	size += sizeof (struct mthca_next_seg);

	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
	     qp->sq.wqe_shift++)
		; /* nothing */

	qp->send_wqe_offset = align(qp->rq.max << qp->rq.wqe_shift,
				    1 << qp->sq.wqe_shift);

	qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);

	if (mthca_alloc_buf(&qp->buf,
			    align(qp->buf_size, to_mdev(pd->context->device)->page_size),
			    to_mdev(pd->context->device)->page_size)) {
		free(qp->wrid);
		return -1;
	}

	memset(qp->buf.buf, 0, qp->buf_size);

	if (mthca_is_memfree(pd->context)) {
		struct mthca_data_seg *scatter;
		uint32_t sz;

		sz = htonl((sizeof (struct mthca_next_seg) +
			    qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16);

		for (i = 0; i < qp->rq.max; ++i) {
			next = get_recv_wqe(qp, i);
			next->nda_op = htonl(((i + 1) & (qp->rq.max - 1)) <<
					     qp->rq.wqe_shift);
			next->ee_nds = sz;

			for (scatter = (void *) (next + 1);
			     (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift);
			     ++scatter)
				scatter->lkey = htonl(MTHCA_INVAL_LKEY);
		}

		for (i = 0; i < qp->sq.max; ++i) {
			next = get_send_wqe(qp, i);
			next->nda_op = htonl((((i + 1) & (qp->sq.max - 1)) <<
					      qp->sq.wqe_shift) +
					     qp->send_wqe_offset);
		}
	} else {
		for (i = 0; i < qp->rq.max; ++i) {
			next = get_recv_wqe(qp, i);
			next->nda_op = htonl((((i + 1) % qp->rq.max) <<
					     qp->rq.wqe_shift) | 1);
		}
	}

	qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
	qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);

	return 0;
}

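/*
 * QPN-to-QP lookup table: the context keeps reference-counted chunks
 * indexed by the QPN bits above qp_table_shift; each chunk maps the
 * low qp_table_mask bits to a struct mthca_qp pointer.
 */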
struct mthca_qp *mthca_find_qp(struct mthca_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (ctx->qp_table[tind].refcnt)
		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
	else
		return NULL;
}

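/*
 * Insert a QP into the lookup table, allocating the second-level
 * chunk the first time a QPN hashing to it is stored.
 * mthca_clear_qp() below drops the reference and frees the chunk once
 * it is empty.
 */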
int mthca_store_qp(struct mthca_context *ctx, uint32_t qpn, struct mthca_qp *qp)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!ctx->qp_table[tind].refcnt) {
		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
						   sizeof (struct mthca_qp *));
		if (!ctx->qp_table[tind].table)
			return -1;
	}

	++ctx->qp_table[tind].refcnt;
	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
	return 0;
}

void mthca_clear_qp(struct mthca_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!--ctx->qp_table[tind].refcnt)
		free(ctx->qp_table[tind].table);
	else
		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}

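/*
 * Used by the CQ error-handling path: report through *dbd whether the
 * failed WQE had its doorbell (DBD) bit set, and return in *new_wqe
 * the nda_op/size word for the next WQE in the chain, or 0 if this
 * WQE was the last one.
 */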
int mthca_free_err_wqe(struct mthca_qp *qp, int is_send,
		       int index, int *dbd, uint32_t *new_wqe)
{
	struct mthca_next_seg *next;

	/*
	 * For SRQs, all receive WQEs generate a CQE, so we're always
	 * at the end of the doorbell chain.
	 */
	if (qp->ibv_qp.srq && !is_send) {
		*new_wqe = 0;
		return 0;
	}

	if (is_send)
		next = get_send_wqe(qp, index);
	else
		next = get_recv_wqe(qp, index);

	*dbd = !!(next->ee_nds & htonl(MTHCA_NEXT_DBD));
	if (next->ee_nds & htonl(0x3f))
		*new_wqe = (next->nda_op & htonl(~0x3f)) |
			(next->ee_nds & htonl(0x3f));
	else
		*new_wqe = 0;

	return 0;
}

