[mirror_ubuntu-artful-kernel.git] / drivers / staging / lustre / lustre / ptlrpc / events.c

/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */

#define DEBUG_SUBSYSTEM S_RPC

# include <linux/libcfs/libcfs.h>
# ifdef __mips64__
#  include <linux/kernel.h>
# endif

#include <obd_class.h>
#include <lustre_net.h>
#include <lustre_sec.h>
#include "ptlrpc_internal.h"

lnet_handle_eq_t   ptlrpc_eq_h;

/*
 * Client's outgoing request callback.
 *
 * Invoked for the SEND or UNLINK event of the MD that carried a request.
 * It records the time of the send, flags a network error on the request
 * (and wakes it) when the send did not complete cleanly, and finally calls
 * ptlrpc_req_finished() on the request.
 */
void request_out_callback(lnet_event_t *ev)
{
	struct ptlrpc_cb_id   *id = ev->md.user_ptr;
	struct ptlrpc_request *req = id->cbid_arg;
	int		       sent_ok;

	LASSERT(ev->type == LNET_EVENT_SEND ||
		ev->type == LNET_EVENT_UNLINK);
	LASSERT(ev->unlinked);

	DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);

	sptlrpc_request_out_callback(req);
	req->rq_real_sent = cfs_time_current_sec();

	/* A clean send is a non-UNLINK event with a zero status. */
	sent_ok = ev->type != LNET_EVENT_UNLINK && ev->status == 0;
	if (!sent_ok) {
		/* Failed send: pretend the reply timed out, in the same way
		 * that failing sends are handled in client.c. */
		spin_lock(&req->rq_lock);
		req->rq_net_err = 1;
		spin_unlock(&req->rq_lock);

		ptlrpc_client_wake_req(req);
	}

	ptlrpc_req_finished(req);
}

/*
 * Client's incoming reply callback
 *
 * Handles PUT and UNLINK events on a request's reply buffer.  While holding
 * req->rq_lock it updates the request's reply state for one of four cases
 * (error or unlink, truncated reply, early reply, real reply) and then wakes
 * the request with ptlrpc_client_wake_req() before dropping the lock.
 */
void reply_in_callback(lnet_event_t *ev)
{
	struct ptlrpc_cb_id   *cbid = ev->md.user_ptr;
	struct ptlrpc_request *req = cbid->cbid_arg;

	DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);

	/* The event must target this request's reply buffer and the data
	 * must lie entirely inside it. */
	LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK);
	LASSERT(ev->md.start == req->rq_repbuf);
	LASSERT(ev->offset + ev->mlength <= req->rq_repbuf_len);
	/* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
	   for adaptive timeouts' early reply. */
	LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0);

	spin_lock(&req->rq_lock);

	/* Reset the per-event flags; the branches below set them again
	 * where appropriate. */
	req->rq_receiving_reply = 0;
	req->rq_early = 0;
	if (ev->unlinked)
		req->rq_must_unlink = 0;

	/* Any non-zero status: just wake the request. */
	if (ev->status)
		goto out_wake;

	if (ev->type == LNET_EVENT_UNLINK) {
		LASSERT(ev->unlinked);
		DEBUG_REQ(D_NET, req, "unlink");
		goto out_wake;
	}

	/* Fewer bytes were stored (mlength) than were sent (rlength): the
	 * reply was truncated.  Mark it replied with -EOVERFLOW and record
	 * rlength + offset in rq_nob_received. */
	if (ev->mlength < ev->rlength) {
		CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req,
		       req->rq_replen, ev->rlength, ev->offset);
		req->rq_reply_truncate = 1;
		req->rq_replied = 1;
		req->rq_status = -EOVERFLOW;
		req->rq_nob_received = ev->rlength + ev->offset;
		goto out_wake;
	}

	/* A message at offset 0, on a request whose header advertises
	 * MSGHDR_AT_SUPPORT, is treated as an early reply; anything else is
	 * the real reply. */
	if ((ev->offset == 0) &&
	    ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) {
		/* Early reply */
		DEBUG_REQ(D_ADAPTTO, req,
			  "Early reply received: mlen=%u offset=%d replen=%d "
			  "replied=%d unlinked=%d", ev->mlength, ev->offset,
			  req->rq_replen, req->rq_replied, ev->unlinked);

		req->rq_early_count++; /* number received, client side */

		if (req->rq_replied)   /* already got the real reply */
			goto out_wake;

		req->rq_early = 1;
		req->rq_reply_off = ev->offset;
		req->rq_nob_received = ev->mlength;
		/* And we're still receiving */
		req->rq_receiving_reply = 1;
	} else {
		/* Real reply */
		req->rq_rep_swab_mask = 0;
		req->rq_replied = 1;
		req->rq_reply_off = ev->offset;
		req->rq_nob_received = ev->mlength;
		/* LNetMDUnlink can't be called under the LNET_LOCK,
		   so we must unlink in ptlrpc_unregister_reply */
		DEBUG_REQ(D_INFO, req,
			  "reply in flags=%x mlen=%u offset=%d replen=%d",
			  lustre_msg_get_flags(req->rq_reqmsg),
			  ev->mlength, ev->offset, req->rq_replen);
	}

	/* Only reached for a non-truncated early or real reply that was not
	 * skipped above. */
	req->rq_import->imp_last_reply_time = cfs_time_current_sec();

out_wake:
	/* NB don't unlock till after wakeup; req can disappear under us
	 * since we don't have our own ref */
	ptlrpc_client_wake_req(req);
	spin_unlock(&req->rq_lock);
}

/*
 * Client's bulk has been written/read.
 *
 * Called once per bulk MD when it is unlinked.  Under desc->bd_lock this
 * drops the descriptor's outstanding MD count, accounts the transferred
 * bytes on success or flags a network error on the owning request on
 * failure, and wakes the request once the last MD has completed.
 */
void client_bulk_callback(lnet_event_t *ev)
{
	struct ptlrpc_cb_id     *id = ev->md.user_ptr;
	struct ptlrpc_bulk_desc *desc = id->cbid_arg;
	struct ptlrpc_request   *req;

	LASSERT((desc->bd_type == BULK_PUT_SINK &&
		 ev->type == LNET_EVENT_PUT) ||
		(desc->bd_type == BULK_GET_SOURCE &&
		 ev->type == LNET_EVENT_GET) ||
		ev->type == LNET_EVENT_UNLINK);
	LASSERT(ev->unlinked);

	/* Fault-injection hooks: either one forces the event to look failed. */
	if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE))
		ev->status = -EIO;

	if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE))
		ev->status = -EIO;

	CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
	       "event type %d, status %d, desc %p\n",
	       ev->type, ev->status, desc);

	spin_lock(&desc->bd_lock);
	req = desc->bd_req;
	LASSERT(desc->bd_md_count > 0);
	desc->bd_md_count--;

	if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) {
		/* Network error hit: have the request reconnect and resend. */
		spin_lock(&req->rq_lock);
		req->rq_net_err = 1;
		spin_unlock(&req->rq_lock);
	} else {
		desc->bd_nob_transferred += ev->mlength;
		desc->bd_sender = ev->sender;
	}

	if (ev->status != 0)
		desc->bd_failure = 1;

	/* NB the wakeup must happen before the unlock; otherwise desc can
	 * disappear under us. */
	if (desc->bd_md_count == 0)
		ptlrpc_client_wake_req(desc->bd_req);

	spin_unlock(&desc->bd_lock);
}

/*
 * The ptlrpc service will get per-CPT request history lists in upcoming
 * patches, so that history operations are no longer serialized per service.
 * The history ID therefore has to reflect arrival order (approximately)
 * without taking a global lock, so that users can sort entries in userspace.
 *
 * This is how we generate history ID for ptlrpc_request:
 * ----------------------------------------------------
 * |  32 bits  |  16 bits  | (16 - X)bits  |  X bits  |
 * ----------------------------------------------------
 * |  seconds  | usec / 16 |   sequence    | CPT id   |
 * ----------------------------------------------------
 *
 * it might not be precise but should be good enough.
 */

#define REQS_CPT_BITS(svcpt)	((svcpt)->scp_service->srv_cpt_bits)

#define REQS_SEC_SHIFT		32
#define REQS_USEC_SHIFT		16
#define REQS_SEQ_SHIFT(svcpt)	REQS_CPT_BITS(svcpt)

/*
 * Assign a history sequence ID to @req and append it to the history list of
 * @svcpt.  Must be called with svcpt::scp_lock held.
 *
 * The candidate ID is built from the request's arrival time (seconds and
 * usec / 16) plus the CPT id.  If that is not newer than the last ID handed
 * out, the sequence field of the last ID is bumped instead.
 */
static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt,
				   struct ptlrpc_request *req)
{
	__u64	secs = req->rq_arrival_time.tv_sec;
	__u32	usec16 = req->rq_arrival_time.tv_usec >> 4; /* usec / 16 */
	__u64	seq;

	seq = secs << REQS_SEC_SHIFT;
	seq |= usec16 << REQS_USEC_SHIFT;
	seq |= svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt;

	if (seq <= svcpt->scp_hist_seq) {
		LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT);
		/* NB: bump the sequence number within the current usec
		 * bucket.  If the sequence bits are exhausted this spills
		 * into the next usec bucket (a time in the future); the
		 * hope is that the arrival rate drops at some point so that
		 * real time catches up with the sequence again. */
		svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt));
		seq = svcpt->scp_hist_seq;
	} else {
		/* Either the very first request (scp_hist_seq == 0) or we
		 * have moved into a new time window. */
		svcpt->scp_hist_seq = seq;
	}

	req->rq_history_seq = seq;

	list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs);
}

/*
 * Server's incoming request callback
 *
 * Handles PUT and UNLINK events on a posted request buffer (rqbd).  It
 * obtains a request descriptor (the one embedded in the rqbd when the
 * buffer has been unlinked, a freshly allocated one otherwise), fills it in
 * from the event, and then, under svcpt->scp_lock, records it in the
 * request history, queues it on scp_req_incoming and wakes scp_waitq.
 */
void request_in_callback(lnet_event_t *ev)
{
	struct ptlrpc_cb_id		  *cbid = ev->md.user_ptr;
	struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg;
	struct ptlrpc_service_part	  *svcpt = rqbd->rqbd_svcpt;
	struct ptlrpc_service	     *service = svcpt->scp_service;
	struct ptlrpc_request	     *req;

	/* The received data must lie inside this rqbd's buffer. */
	LASSERT(ev->type == LNET_EVENT_PUT ||
		ev->type == LNET_EVENT_UNLINK);
	LASSERT((char *)ev->md.start >= rqbd->rqbd_buffer);
	LASSERT((char *)ev->md.start + ev->offset + ev->mlength <=
		rqbd->rqbd_buffer + service->srv_buf_size);

	CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
	       "event type %d, status %d, service %s\n",
	       ev->type, ev->status, service->srv_name);

	if (ev->unlinked) {
		/* If this is the last request message to fit in the
		 * request buffer we can use the request object embedded in
		 * rqbd.  Note that if we failed to allocate a request,
		 * we'd have to re-post the rqbd, which we can't do in this
		 * context. */
		req = &rqbd->rqbd_req;
		memset(req, 0, sizeof(*req));
	} else {
		LASSERT(ev->type == LNET_EVENT_PUT);
		if (ev->status != 0) {
			/* We moaned above already... */
			return;
		}
		/* NOTE(review): the code below relies on req being zeroed,
		 * so OBD_ALLOC_GFP presumably returns zeroed memory -
		 * confirm against its definition. */
		OBD_ALLOC_GFP(req, sizeof(*req), ALLOC_ATOMIC_TRY);
		if (req == NULL) {
			CERROR("Can't allocate incoming request descriptor: "
			       "Dropping %s RPC from %s\n",
			       service->srv_name,
			       libcfs_id2str(ev->initiator));
			return;
		}
	}

	/* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL,
	 * flags are reset and scalars are zero.  We only set the message
	 * size to non-zero if this was a successful receive. */
	req->rq_xid = ev->match_bits;
	req->rq_reqbuf = ev->md.start + ev->offset;
	if (ev->type == LNET_EVENT_PUT && ev->status == 0)
		req->rq_reqdata_len = ev->mlength;
	do_gettimeofday(&req->rq_arrival_time);
	req->rq_peer = ev->initiator;
	req->rq_self = ev->target.nid;
	req->rq_rqbd = rqbd;
	req->rq_phase = RQ_PHASE_NEW;
	spin_lock_init(&req->rq_lock);
	INIT_LIST_HEAD(&req->rq_timed_list);
	INIT_LIST_HEAD(&req->rq_exp_list);
	atomic_set(&req->rq_refcount, 1);
	if (ev->type == LNET_EVENT_PUT)
		CDEBUG(D_INFO, "incoming req@%p x"LPU64" msgsize %u\n",
		       req, req->rq_xid, ev->mlength);

	CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer));

	spin_lock(&svcpt->scp_lock);

	/* Assigns rq_history_seq from rq_arrival_time set above. */
	ptlrpc_req_add_history(svcpt, req);

	if (ev->unlinked) {
		/* The buffer is no longer posted to the network. */
		svcpt->scp_nrqbds_posted--;
		CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n",
		       svcpt->scp_nrqbds_posted);

		/* Normally, don't complain about 0 buffers posted; LNET won't
		 * drop incoming reqs since we set the portal lazy */
		if (test_req_buffer_pressure &&
		    ev->type != LNET_EVENT_UNLINK &&
		    svcpt->scp_nrqbds_posted == 0)
			CWARN("All %s request buffers busy\n",
			      service->srv_name);

		/* req takes over the network's ref on rqbd */
	} else {
		/* req takes a ref on rqbd */
		rqbd->rqbd_refcount++;
	}

	list_add_tail(&req->rq_list, &svcpt->scp_req_incoming);
	svcpt->scp_nreqs_incoming++;

	/* NB everything can disappear under us once the request
	 * has been queued and we unlock, so do the wake now... */
	wake_up(&svcpt->scp_waitq);

	spin_unlock(&svcpt->scp_lock);
}

/*
 * Server's outgoing reply callback.
 *
 * For an "easy" reply (rs_difficult clear) the network's reference on the
 * reply state is dropped and nothing else happens.  For a "difficult"
 * reply, the final (unlinked) event clears rs_on_net and, if an ACK was
 * expected or the transaction is already committed, schedules the reply
 * state for further handling.
 */
void reply_out_callback(lnet_event_t *ev)
{
	struct ptlrpc_cb_id	   *id = ev->md.user_ptr;
	struct ptlrpc_reply_state  *rs = id->cbid_arg;
	struct ptlrpc_service_part *svcpt = rs->rs_svcpt;

	LASSERT(ev->type == LNET_EVENT_SEND ||
		ev->type == LNET_EVENT_ACK ||
		ev->type == LNET_EVENT_UNLINK);

	if (!rs->rs_difficult) {
		/* 'Easy' replies need no further processing, so drop the
		 * net's ref on 'rs' here. */
		LASSERT(ev->unlinked);
		ptlrpc_rs_decref(rs);
		return;
	}

	LASSERT(rs->rs_on_net);

	/* Only the last network callback needs any work. */
	if (!ev->unlinked)
		return;

	/* The net's ref on 'rs' stays put until ptlrpc_handle_rs() is done
	 * with it. */
	spin_lock(&svcpt->scp_rep_lock);
	spin_lock(&rs->rs_lock);

	rs->rs_on_net = 0;
	if (!rs->rs_no_ack ||
	    rs->rs_transno <= rs->rs_export->exp_obd->obd_last_committed)
		ptlrpc_schedule_difficult_reply(rs);

	spin_unlock(&rs->rs_lock);
	spin_unlock(&svcpt->scp_rep_lock);
}


/*
 * Dispatch an LNet event to the handler recorded in the ptlrpc_cb_id that
 * is attached to the event's MD (ev->md.user_ptr).
 */
static void ptlrpc_master_callback(lnet_event_t *ev)
{
	struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
	void (*callback)(lnet_event_t *ev) = cbid->cbid_fn;

	/* Honestly, it's best to find out early. */
	/* The callback argument must not be poisoned, and the handler must
	 * be one of the five event callbacks defined in this file. */
	LASSERT(cbid->cbid_arg != LP_POISON);
	LASSERT(callback == request_out_callback ||
		callback == reply_in_callback ||
		callback == client_bulk_callback ||
		callback == request_in_callback ||
		callback == reply_out_callback);

	callback(ev);
}

/*
 * Pick the closest LNet peer for @uuid.
 *
 * Walks every NID that lustre_uuid_to_peer() yields for @uuid, asks
 * LNetDist() for its distance and order, and keeps the candidate with the
 * smallest distance (ties broken by the smaller order).  A distance of 0
 * means the NID is local: the loopback NID is then used for both ends and
 * the search stops.
 *
 * @uuid: UUID to resolve
 * @peer: out; pid is always set to LUSTRE_SRV_LNET_PID, nid is written only
 *        when a candidate is accepted
 * @self: out; local NID to use, written only when a candidate is accepted
 *
 * Returns 0 on success, -ENOENT if no usable NID was found.
 */
int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
			 lnet_process_id_t *peer, lnet_nid_t *self)
{
	int		best_dist = 0;
	__u32		best_order = 0;
	int		count = 0;
	int		rc = -ENOENT;
	int		portals_compatibility;
	int		dist;
	__u32		order;
	lnet_nid_t	dst_nid;
	lnet_nid_t	src_nid;

	portals_compatibility = LNetCtl(IOC_LIBCFS_PORTALS_COMPATIBILITY, NULL);

	peer->pid = LUSTRE_SRV_LNET_PID;

	/* Choose the matching UUID that's closest */
	while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) {
		dist = LNetDist(dst_nid, &src_nid, &order);
		if (dist < 0)
			continue;	/* this NID is not usable */

		if (dist == 0) {		/* local! use loopback LND */
			peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0);
			rc = 0;
			break;
		}

		/* rc < 0 means nothing has been accepted yet, so the first
		 * usable candidate always wins. */
		if (rc < 0 ||
		    dist < best_dist ||
		    (dist == best_dist && order < best_order)) {
			best_dist = dist;
			best_order = order;

			if (portals_compatibility > 1) {
				/* Strong portals compatibility: Zero the nid's
				 * NET, so if I'm reading new config logs, or
				 * getting configured by (new) lconf I can
				 * still talk to old servers. */
				dst_nid = LNET_MKNID(0, LNET_NIDADDR(dst_nid));
				src_nid = LNET_MKNID(0, LNET_NIDADDR(src_nid));
			}
			peer->nid = dst_nid;
			*self = src_nid;
			rc = 0;
		}
	}

	/* peer->nid is only assigned once a candidate has been accepted, so
	 * formatting *peer on failure would print whatever the caller left
	 * there.  Log the peer only when it is valid. */
	if (rc == 0)
		CDEBUG(D_NET,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer));
	else
		CDEBUG(D_NET, "%s: no usable NID found\n", uuid->uuid);

	return rc;
}

/*
 * Tear down the ptlrpc event queue and the network interface.
 *
 * The event queue may still be busy because messages with pending events
 * can be in flight (the fire-and-forget ones: client requests and
 * "non-difficult" server replies).  Keep retrying LNetEQFree() with a
 * two-second pause until it succeeds, then call LNetNIFini().
 */
void ptlrpc_ni_fini(void)
{
	wait_queue_head_t   waitq;
	struct l_wait_info  lwi;
	int		    attempt = 0;
	int		    rc;

	for (;;) {
		rc = LNetEQFree(ptlrpc_eq_h);

		/* Anything other than success or -EBUSY is unexpected. */
		if (rc != 0 && rc != -EBUSY)
			LBUG();

		if (rc != -EBUSY) {
			LNetNIFini();
			return;
		}

		if (attempt != 0)
			CWARN("Event queue still busy\n");

		/* Wait for a bit before trying again */
		init_waitqueue_head(&waitq);
		lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL);
		l_wait_event(waitq, 0, &lwi);

		attempt++;
	}
}

/* Return the LNet PID used by ptlrpc, which is always LUSTRE_SRV_LNET_PID. */
lnet_pid_t ptl_get_pid(void)
{
	return LUSTRE_SRV_LNET_PID;
}

/*
 * Bring up the network interface and allocate the ptlrpc event queue.
 *
 * Returns 0 on success, -ENOENT if LNetNIInit() fails, or -ENOMEM if the
 * event queue cannot be allocated (the network interface is shut down
 * again in that case).
 */
int ptlrpc_ni_init(void)
{
	lnet_pid_t	pid = ptl_get_pid();
	int		rc;

	CDEBUG(D_NET, "My pid is: %x\n", pid);

	/* We're not passing any limits yet... */
	rc = LNetNIInit(pid);
	if (rc < 0) {
		CDEBUG(D_NET, "Can't init network interface: %d\n", rc);
		return -ENOENT;
	}

	/* CAVEAT EMPTOR: how we process portals events is _radically_
	 * different depending on... */
	/* Kernel LNet calls our master callback whenever there is a new
	 * event.  Because every event is guaranteed to be delivered through
	 * the callback, the EQ size is set to 0 to avoid the overhead of
	 * serializing enqueue/dequeue operations in LNet. */
	rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h);
	if (rc != 0) {
		CERROR("Failed to allocate event queue: %d\n", rc);
		LNetNIFini();
		return -ENOMEM;
	}

	return 0;
}


/*
 * Initialise the network layer and then take a ptlrpcd reference.
 *
 * Returns 0 on success, -EIO if network initialisation fails, or the error
 * from ptlrpcd_addref() (after the network has been shut down again).
 */
int ptlrpc_init_portals(void)
{
	int rc;

	rc = ptlrpc_ni_init();
	if (rc != 0) {
		CERROR("network initialisation failed\n");
		return -EIO;
	}

	rc = ptlrpcd_addref();
	if (rc != 0) {
		CERROR("rpcd initialisation failed\n");
		ptlrpc_ni_fini();
	}

	return rc;
}

/*
 * Counterpart of ptlrpc_init_portals(): calls ptlrpcd_decref() and then
 * shuts the network interface down via ptlrpc_ni_fini().
 */
void ptlrpc_exit_portals(void)
{
	ptlrpcd_decref();
	ptlrpc_ni_fini();
}
Commit	Line	Data
d7e09d03 PT	1	/*
	2	* GPL HEADER START
	3	*
	4	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License version 2 only,
	8	* as published by the Free Software Foundation.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License version 2 for more details (a copy is included
	14	* in the LICENSE file that accompanied this code).
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* version 2 along with this program; If not, see
	18	* http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
	19	*
	20	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
	21	* CA 95054 USA or visit www.sun.com if you need additional information or
	22	* have any questions.
	23	*
	24	* GPL HEADER END
	25	*/
	26	/*
	27	* Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
	28	* Use is subject to license terms.
	29	*
	30	* Copyright (c) 2012, Intel Corporation.
	31	*/
	32	/*
	33	* This file is part of Lustre, http://www.lustre.org/
	34	* Lustre is a trademark of Sun Microsystems, Inc.
	35	*/
	36
	37	#define DEBUG_SUBSYSTEM S_RPC
	38
	39	# include <linux/libcfs/libcfs.h>
	40	# ifdef __mips64__
	41	# include <linux/kernel.h>
	42	# endif
	43
	44	#include <obd_class.h>
	45	#include <lustre_net.h>
	46	#include <lustre_sec.h>
	47	#include "ptlrpc_internal.h"
	48
	49	lnet_handle_eq_t ptlrpc_eq_h;
	50
	51	/*
	52	* Client's outgoing request callback
	53	*/
	54	void request_out_callback(lnet_event_t *ev)
	55	{
	56	struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
	57	struct ptlrpc_request *req = cbid->cbid_arg;
d7e09d03	58
3949015e KM	59	LASSERT(ev->type == LNET_EVENT_SEND \|\|
	60	ev->type == LNET_EVENT_UNLINK);
	61	LASSERT(ev->unlinked);
d7e09d03 PT	62
	63	DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
	64
	65	sptlrpc_request_out_callback(req);
	66	req->rq_real_sent = cfs_time_current_sec();
	67
	68	if (ev->type == LNET_EVENT_UNLINK \|\| ev->status != 0) {
	69
	70	/* Failed send: make it seem like the reply timed out, just
	71	* like failing sends in client.c does currently... */
	72
	73	spin_lock(&req->rq_lock);
	74	req->rq_net_err = 1;
	75	spin_unlock(&req->rq_lock);
	76
	77	ptlrpc_client_wake_req(req);
	78	}
	79
	80	ptlrpc_req_finished(req);
d7e09d03 PT	81	}
	82
	83	/*
	84	* Client's incoming reply callback
	85	*/
	86	void reply_in_callback(lnet_event_t *ev)
	87	{
	88	struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
	89	struct ptlrpc_request *req = cbid->cbid_arg;
d7e09d03 PT	90
	91	DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
	92
3949015e KM	93	LASSERT(ev->type == LNET_EVENT_PUT \|\| ev->type == LNET_EVENT_UNLINK);
	94	LASSERT(ev->md.start == req->rq_repbuf);
	95	LASSERT(ev->offset + ev->mlength <= req->rq_repbuf_len);
d7e09d03 PT	96	/* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
	97	for adaptive timeouts' early reply. */
	98	LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0);
	99
	100	spin_lock(&req->rq_lock);
	101
	102	req->rq_receiving_reply = 0;
	103	req->rq_early = 0;
	104	if (ev->unlinked)
	105	req->rq_must_unlink = 0;
	106
	107	if (ev->status)
	108	goto out_wake;
	109
	110	if (ev->type == LNET_EVENT_UNLINK) {
	111	LASSERT(ev->unlinked);
	112	DEBUG_REQ(D_NET, req, "unlink");
	113	goto out_wake;
	114	}
	115
3949015e	116	if (ev->mlength < ev->rlength) {
d7e09d03 PT	117	CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req,
	118	req->rq_replen, ev->rlength, ev->offset);
	119	req->rq_reply_truncate = 1;
	120	req->rq_replied = 1;
	121	req->rq_status = -EOVERFLOW;
	122	req->rq_nob_received = ev->rlength + ev->offset;
	123	goto out_wake;
	124	}
	125
	126	if ((ev->offset == 0) &&
	127	((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) {
	128	/* Early reply */
	129	DEBUG_REQ(D_ADAPTTO, req,
	130	"Early reply received: mlen=%u offset=%d replen=%d "
	131	"replied=%d unlinked=%d", ev->mlength, ev->offset,
	132	req->rq_replen, req->rq_replied, ev->unlinked);
	133
	134	req->rq_early_count++; /* number received, client side */
	135
	136	if (req->rq_replied) /* already got the real reply */
	137	goto out_wake;
	138
	139	req->rq_early = 1;
	140	req->rq_reply_off = ev->offset;
	141	req->rq_nob_received = ev->mlength;
	142	/* And we're still receiving */
	143	req->rq_receiving_reply = 1;
	144	} else {
	145	/* Real reply */
	146	req->rq_rep_swab_mask = 0;
	147	req->rq_replied = 1;
	148	req->rq_reply_off = ev->offset;
	149	req->rq_nob_received = ev->mlength;
	150	/* LNetMDUnlink can't be called under the LNET_LOCK,
	151	so we must unlink in ptlrpc_unregister_reply */
	152	DEBUG_REQ(D_INFO, req,
	153	"reply in flags=%x mlen=%u offset=%d replen=%d",
	154	lustre_msg_get_flags(req->rq_reqmsg),
	155	ev->mlength, ev->offset, req->rq_replen);
	156	}
	157
	158	req->rq_import->imp_last_reply_time = cfs_time_current_sec();
	159
	160	out_wake:
	161	/* NB don't unlock till after wakeup; req can disappear under us
	162	* since we don't have our own ref */
	163	ptlrpc_client_wake_req(req);
	164	spin_unlock(&req->rq_lock);
d7e09d03 PT	165	}
	166
	167	/*
	168	* Client's bulk has been written/read
	169	*/
3949015e	170	void client_bulk_callback(lnet_event_t *ev)
d7e09d03 PT	171	{
	172	struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
	173	struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
	174	struct ptlrpc_request *req;
d7e09d03	175
3949015e KM	176	LASSERT((desc->bd_type == BULK_PUT_SINK &&
	177	ev->type == LNET_EVENT_PUT) \|\|
	178	(desc->bd_type == BULK_GET_SOURCE &&
	179	ev->type == LNET_EVENT_GET) \|\|
	180	ev->type == LNET_EVENT_UNLINK);
	181	LASSERT(ev->unlinked);
d7e09d03 PT	182
	183	if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE))
	184	ev->status = -EIO;
	185
	186	if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE))
	187	ev->status = -EIO;
	188
	189	CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
	190	"event type %d, status %d, desc %p\n",
	191	ev->type, ev->status, desc);
	192
	193	spin_lock(&desc->bd_lock);
	194	req = desc->bd_req;
	195	LASSERT(desc->bd_md_count > 0);
	196	desc->bd_md_count--;
	197
	198	if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) {
	199	desc->bd_nob_transferred += ev->mlength;
	200	desc->bd_sender = ev->sender;
	201	} else {
	202	/* start reconnect and resend if network error hit */
	203	spin_lock(&req->rq_lock);
	204	req->rq_net_err = 1;
	205	spin_unlock(&req->rq_lock);
	206	}
	207
	208	if (ev->status != 0)
	209	desc->bd_failure = 1;
	210
	211	/* NB don't unlock till after wakeup; desc can disappear under us
	212	* otherwise */
	213	if (desc->bd_md_count == 0)
	214	ptlrpc_client_wake_req(desc->bd_req);
	215
	216	spin_unlock(&desc->bd_lock);
d7e09d03 PT	217	}
	218
	219	/*
	220	* We will have percpt request history list for ptlrpc service in upcoming
	221	* patches because we don't want to be serialized by current per-service
	222	* history operations. So we require history ID can (somehow) show arriving
	223	* order w/o grabbing global lock, and user can sort them in userspace.
	224	*
	225	* This is how we generate history ID for ptlrpc_request:
	226	* ----------------------------------------------------
	227	* \| 32 bits \| 16 bits \| (16 - X)bits \| X bits \|
	228	* ----------------------------------------------------
	229	* \| seconds \| usec / 16 \| sequence \| CPT id \|
	230	* ----------------------------------------------------
	231	*
	232	* it might not be precise but should be good enough.
	233	*/
	234
	235	#define REQS_CPT_BITS(svcpt) ((svcpt)->scp_service->srv_cpt_bits)
	236
	237	#define REQS_SEC_SHIFT 32
	238	#define REQS_USEC_SHIFT 16
	239	#define REQS_SEQ_SHIFT(svcpt) REQS_CPT_BITS(svcpt)
	240
	241	static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt,
	242	struct ptlrpc_request *req)
	243	{
	244	__u64 sec = req->rq_arrival_time.tv_sec;
	245	__u32 usec = req->rq_arrival_time.tv_usec >> 4; /* usec / 16 */
	246	__u64 new_seq;
	247
	248	/* set sequence ID for request and add it to history list,
	249	* it must be called with hold svcpt::scp_lock */
	250
	251	new_seq = (sec << REQS_SEC_SHIFT) \|
	252	(usec << REQS_USEC_SHIFT) \|
	253	(svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt);
	254
	255	if (new_seq > svcpt->scp_hist_seq) {
	256	/* This handles the initial case of scp_hist_seq == 0 or
	257	* we just jumped into a new time window */
	258	svcpt->scp_hist_seq = new_seq;
	259	} else {
	260	LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT);
	261	/* NB: increase sequence number in current usec bucket,
	262	* however, it's possible that we used up all bits for
	263	* sequence and jumped into the next usec bucket (future time),
	264	* then we hope there will be less RPCs per bucket at some
	265	* point, and sequence will catch up again */
	266	svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt));
	267	new_seq = svcpt->scp_hist_seq;
	268	}
	269
	270	req->rq_history_seq = new_seq;
	271
	272	list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs);
	273	}
	274
	275	/*
	276	* Server's incoming request callback
	277	*/
	278	void request_in_callback(lnet_event_t *ev)
	279	{
	280	struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
281	struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg;
282	struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
283	struct ptlrpc_service *service = svcpt->scp_service;
284	struct ptlrpc_request *req;
d7e09d03	285
3949015e KM	286	LASSERT(ev->type == LNET_EVENT_PUT \|\|
	287	ev->type == LNET_EVENT_UNLINK);
	288	LASSERT((char *)ev->md.start >= rqbd->rqbd_buffer);
	289	LASSERT((char *)ev->md.start + ev->offset + ev->mlength <=
	290	rqbd->rqbd_buffer + service->srv_buf_size);
d7e09d03 PT	291
	292	CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
	293	"event type %d, status %d, service %s\n",
	294	ev->type, ev->status, service->srv_name);
	295
	296	if (ev->unlinked) {
	297	/* If this is the last request message to fit in the
	298	* request buffer we can use the request object embedded in
	299	* rqbd. Note that if we failed to allocate a request,
	300	* we'd have to re-post the rqbd, which we can't do in this
	301	* context. */
	302	req = &rqbd->rqbd_req;
3949015e	303	memset(req, 0, sizeof(*req));
d7e09d03	304	} else {
3949015e	305	LASSERT(ev->type == LNET_EVENT_PUT);
d7e09d03 PT	306	if (ev->status != 0) {
	307	/* We moaned above already... */
	308	return;
	309	}
	310	OBD_ALLOC_GFP(req, sizeof(*req), ALLOC_ATOMIC_TRY);
	311	if (req == NULL) {
	312	CERROR("Can't allocate incoming request descriptor: "
	313	"Dropping %s RPC from %s\n",
	314	service->srv_name,
	315	libcfs_id2str(ev->initiator));
	316	return;
	317	}
	318	}
	319
	320	/* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL,
	321	* flags are reset and scalars are zero. We only set the message
	322	* size to non-zero if this was a successful receive. */
	323	req->rq_xid = ev->match_bits;
	324	req->rq_reqbuf = ev->md.start + ev->offset;
	325	if (ev->type == LNET_EVENT_PUT && ev->status == 0)
	326	req->rq_reqdata_len = ev->mlength;
	327	do_gettimeofday(&req->rq_arrival_time);
	328	req->rq_peer = ev->initiator;
	329	req->rq_self = ev->target.nid;
	330	req->rq_rqbd = rqbd;
	331	req->rq_phase = RQ_PHASE_NEW;
	332	spin_lock_init(&req->rq_lock);
	333	INIT_LIST_HEAD(&req->rq_timed_list);
	334	INIT_LIST_HEAD(&req->rq_exp_list);
	335	atomic_set(&req->rq_refcount, 1);
	336	if (ev->type == LNET_EVENT_PUT)
	337	CDEBUG(D_INFO, "incoming req@%p x"LPU64" msgsize %u\n",
	338	req, req->rq_xid, ev->mlength);
	339
	340	CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer));
	341
	342	spin_lock(&svcpt->scp_lock);
	343
	344	ptlrpc_req_add_history(svcpt, req);
	345
	346	if (ev->unlinked) {
	347	svcpt->scp_nrqbds_posted--;
	348	CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n",
	349	svcpt->scp_nrqbds_posted);
	350
	351	/* Normally, don't complain about 0 buffers posted; LNET won't
	352	* drop incoming reqs since we set the portal lazy */
	353	if (test_req_buffer_pressure &&
	354	ev->type != LNET_EVENT_UNLINK &&
	355	svcpt->scp_nrqbds_posted == 0)
	356	CWARN("All %s request buffers busy\n",
	357	service->srv_name);
	358
	359	/* req takes over the network's ref on rqbd */
	360	} else {
	361	/* req takes a ref on rqbd */
	362	rqbd->rqbd_refcount++;
	363	}
	364
	365	list_add_tail(&req->rq_list, &svcpt->scp_req_incoming);
	366	svcpt->scp_nreqs_incoming++;
	367
	368	/* NB everything can disappear under us once the request
	369	* has been queued and we unlock, so do the wake now... */
370	wake_up(&svcpt->scp_waitq);
371
372	spin_unlock(&svcpt->scp_lock);
d7e09d03 PT	373	}
	374
	375	/*
	376	* Server's outgoing reply callback
	377	*/
	378	void reply_out_callback(lnet_event_t *ev)
	379	{
	380	struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
	381	struct ptlrpc_reply_state *rs = cbid->cbid_arg;
	382	struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
d7e09d03	383
3949015e KM	384	LASSERT(ev->type == LNET_EVENT_SEND \|\|
	385	ev->type == LNET_EVENT_ACK \|\|
	386	ev->type == LNET_EVENT_UNLINK);
d7e09d03 PT	387
	388	if (!rs->rs_difficult) {
	389	/* 'Easy' replies have no further processing so I drop the
	390	* net's ref on 'rs' */
3949015e	391	LASSERT(ev->unlinked);
d7e09d03	392	ptlrpc_rs_decref(rs);
d7e09d03 PT	393	return;
	394	}
	395
3949015e	396	LASSERT(rs->rs_on_net);
d7e09d03 PT	397
	398	if (ev->unlinked) {
	399	/* Last network callback. The net's ref on 'rs' stays put
	400	* until ptlrpc_handle_rs() is done with it */
	401	spin_lock(&svcpt->scp_rep_lock);
	402	spin_lock(&rs->rs_lock);
	403
	404	rs->rs_on_net = 0;
	405	if (!rs->rs_no_ack \|\|
	406	rs->rs_transno <=
	407	rs->rs_export->exp_obd->obd_last_committed)
	408	ptlrpc_schedule_difficult_reply(rs);
	409
	410	spin_unlock(&rs->rs_lock);
	411	spin_unlock(&svcpt->scp_rep_lock);
	412	}
d7e09d03 PT	413	}
	414
	415
	416	static void ptlrpc_master_callback(lnet_event_t *ev)
	417	{
	418	struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
	419	void (callback)(lnet_event_t ev) = cbid->cbid_fn;
	420
	421	/* Honestly, it's best to find out early. */
3949015e KM	422	LASSERT(cbid->cbid_arg != LP_POISON);
	423	LASSERT(callback == request_out_callback \|\|
	424	callback == reply_in_callback \|\|
	425	callback == client_bulk_callback \|\|
	426	callback == request_in_callback \|\|
	427	callback == reply_out_callback);
	428
	429	callback(ev);
d7e09d03 PT	430	}
d7e09d03 PT	431
3949015e	432	int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
d7e09d03 PT	433	lnet_process_id_t peer, lnet_nid_t self)
	434	{
	435	int best_dist = 0;
	436	__u32 best_order = 0;
	437	int count = 0;
	438	int rc = -ENOENT;
	439	int portals_compatibility;
	440	int dist;
	441	__u32 order;
	442	lnet_nid_t dst_nid;
	443	lnet_nid_t src_nid;
	444
	445	portals_compatibility = LNetCtl(IOC_LIBCFS_PORTALS_COMPATIBILITY, NULL);
	446
	447	peer->pid = LUSTRE_SRV_LNET_PID;
	448
	449	/* Choose the matching UUID that's closest */
	450	while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) {
	451	dist = LNetDist(dst_nid, &src_nid, &order);
	452	if (dist < 0)
	453	continue;
	454
	455	if (dist == 0) { /* local! use loopback LND */
	456	peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0);
	457	rc = 0;
	458	break;
	459	}
	460
	461	if (rc < 0 \|\|
	462	dist < best_dist \|\|
	463	(dist == best_dist && order < best_order)) {
	464	best_dist = dist;
	465	best_order = order;
	466
	467	if (portals_compatibility > 1) {
	468	/* Strong portals compatibility: Zero the nid's
	469	* NET, so if I'm reading new config logs, or
	470	* getting configured by (new) lconf I can
	471	* still talk to old servers. */
	472	dst_nid = LNET_MKNID(0, LNET_NIDADDR(dst_nid));
	473	src_nid = LNET_MKNID(0, LNET_NIDADDR(src_nid));
	474	}
	475	peer->nid = dst_nid;
	476	*self = src_nid;
	477	rc = 0;
	478	}
	479	}
	480
	481	CDEBUG(D_NET,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer));
	482	return rc;
	483	}
	484
	485	void ptlrpc_ni_fini(void)
	486	{
	487	wait_queue_head_t waitq;
	488	struct l_wait_info lwi;
	489	int rc;
	490	int retries;
	491
	492	/* Wait for the event queue to become idle since there may still be
	493	* messages in flight with pending events (i.e. the fire-and-forget
	494	* messages == client requests and "non-difficult" server
	495	* replies */
	496
497	for (retries = 0;; retries++) {
498	rc = LNetEQFree(ptlrpc_eq_h);
499	switch (rc) {
500	default:
501	LBUG();
502
503	case 0:
504	LNetNIFini();
505	return;
506
507	case -EBUSY:
508	if (retries != 0)
509	CWARN("Event queue still busy\n");
510
511	/* Wait for a bit */
512	init_waitqueue_head(&waitq);
513	lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL);
514	l_wait_event(waitq, 0, &lwi);
515	break;
516	}
517	}
518	/* notreached */
519	}
520
521	lnet_pid_t ptl_get_pid(void)
522	{
523	lnet_pid_t pid;
524
525	pid = LUSTRE_SRV_LNET_PID;
526	return pid;
527	}
528
529	int ptlrpc_ni_init(void)
530	{
531	int rc;
532	lnet_pid_t pid;
533
534	pid = ptl_get_pid();
535	CDEBUG(D_NET, "My pid is: %x\n", pid);
536
537	/* We're not passing any limits yet... */
538	rc = LNetNIInit(pid);
539	if (rc < 0) {
3949015e	540	CDEBUG(D_NET, "Can't init network interface: %d\n", rc);
d7e09d03 PT	541	return (-ENOENT);
	542	}
	543
	544	/* CAVEAT EMPTOR: how we process portals events is _radically_
	545	* different depending on... */
	546	/* kernel LNet calls our master callback when there are new event,
	547	* because we are guaranteed to get every event via callback,
	548	* so we just set EQ size to 0 to avoid overhread of serializing
	549	* enqueue/dequeue operations in LNet. */
	550	rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h);
	551	if (rc == 0)
	552	return 0;
	553
3949015e	554	CERROR("Failed to allocate event queue: %d\n", rc);
d7e09d03 PT	555	LNetNIFini();
	556
	557	return (-ENOMEM);
	558	}
	559
	560
	561	int ptlrpc_init_portals(void)
	562	{
	563	int rc = ptlrpc_ni_init();
	564
	565	if (rc != 0) {
	566	CERROR("network initialisation failed\n");
	567	return -EIO;
	568	}
	569	rc = ptlrpcd_addref();
	570	if (rc == 0)
	571	return 0;
	572
	573	CERROR("rpcd initialisation failed\n");
	574	ptlrpc_ni_fini();
	575	return rc;
	576	}
	577
	578	void ptlrpc_exit_portals(void)
	579	{
	580	ptlrpcd_decref();
	581	ptlrpc_ni_fini();
	582	}