http:///https:///api.php?action=feedcontributions&user=Psomogyi%40gamax.hu&feedformat=atomSambaWiki - User contributions [en]2024-03-28T13:54:32ZUser contributionsMediaWiki 1.39.5https://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1839CTDB Project ibwrapper2006-12-18T19:57:27Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Unix SMB/CIFS implementation.<br />
* Wrap Infiniband calls.<br />
*<br />
* Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006<br />
*<br />
* Major code contributions by Peter Somogyi <psomogyi@gamax.hu><br />
*<br />
* This program is free software; you can redistribute it and/or modify<br />
* it under the terms of the GNU General Public License as published by<br />
* the Free Software Foundation; either version 2 of the License, or<br />
* (at your option) any later version.<br />
*<br />
* This program is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br />
* GNU General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU General Public License<br />
* along with this program; if not, write to the Free Software<br />
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.<br />
*/<br />
<br />
/* Server communication state */<br />
enum ibw_state_ctx {<br />
IBWS_INIT = 0, /* ctx start - after ibw_init */<br />
IBWS_READY, /* after ibw_bind & ibw_listen */<br />
IBWS_CONNECT_REQUEST, /* after [IBWS_READY + incoming request] */<br />
/* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */<br />
IBWS_STOPPED, /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */<br />
IBWS_ERROR /* abnormal state; ibw_stop must be called after this */<br />
};<br />
<br />
/* Connection state */<br />
struct ibw_ctx { /* context handle; a pointer to it is what ibw_init returns */<br />
void *ctx_userdata; /* see ibw_init */<br />
<br />
enum ibw_state_ctx state; /* one of the IBWS_* values */<br />
void *internal; /* NOTE(review): untyped; presumably implementation-private data - confirm */<br />
<br />
struct ibw_conn *conn_list; /* 1st elem of double linked list */<br />
};<br />
<br />
enum ibw_state_conn { /* values stored in the state member of struct ibw_conn */<br />
IBWC_INIT = 0, /* conn start - internal state */<br />
IBWC_CONNECTED, /* after ibw_accept or ibw_connect */<br />
IBWC_DISCONNECTED, /* after ibw_disconnect */<br />
IBWC_ERROR /* NOTE(review): undocumented; presumably the abnormal state, like IBWS_ERROR - confirm */<br />
};<br />
<br />
struct ibw_conn { /* one connection; handed to the ibw_connstate_fn_t and ibw_receive_fn_t callbacks */<br />
struct ibw_ctx *ctx; /* NOTE(review): presumably the context whose conn_list holds this conn - confirm */<br />
enum ibw_state_conn state; /* one of the IBWC_* values */<br />
<br />
void *conn_userdata; /* see ibw_connect and ibw_accept */<br />
void *internal; /* NOTE(review): untyped; presumably implementation-private data - confirm */<br />
<br />
struct ibw_conn *prev, *next; /* neighbours in a doubly linked list of connections */<br />
};<br />
<br />
/*<br />
* (name, value) pair for array param of ibw_init<br />
*/<br />
struct ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
};<br />
<br />
/*<br />
* Callback function definition which should inform you about<br />
* connection state change<br />
* This callback is invoked whenever server or client connection changes.<br />
* Both <conn> and <ctx> can be NULL if their state didn't change.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_connstate_fn_t)(struct ibw_ctx *ctx, struct ibw_conn *conn);<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* This callback is invoked whenever any message arrives.<br />
* Return nonzero on error.<br />
*<br />
* Important: you mustn't store buf pointer for later use.<br />
* Process its contents before returning.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(struct ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* max_send_wr [default is 256]<br />
* max_recv_wr [default is 1024]<br />
* <...><br />
*<br />
* Must be called _ONCE_ for each node.<br />
*<br />
* max_msg_size is the maximum size of a message<br />
* (max_send_wr + max_recv_wr) * max_msg_size bytes allocated per connection<br />
*<br />
* returns non-NULL on success<br />
*<br />
* talloc_free must be called for the result in IBWS_STOPPED;<br />
* it will close resources by destructor<br />
* connections (ibw_conn *) must have been closed prior to talloc_free<br />
*/<br />
struct ibw_ctx *ibw_init(struct ibw_initattr *attr, int nattr,<br />
void *ctx_userdata,<br />
ibw_connstate_fn_t ibw_connstate,<br />
ibw_receive_fn_t ibw_receive,<br />
struct event_context *ectx);<br />
<br />
/*<br />
* Must be called in states of (IBWS_ERROR, IBWS_READY, IBWS_CONNECT_REQUEST)<br />
*<br />
* It will send out disconnect requests and free up ibw_conn structures.<br />
* The ctx->state will transition to IBWS_STOPPED after all connections are disconnected.<br />
* During that time, you mustn't send/recv/disconnect any more.<br />
* Only after ctx->state=IBWS_STOPPED you can talloc_free the ctx.<br />
*/<br />
int ibw_stop(struct ibw_ctx *ctx);<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
<br />
/*<br />
* works like socket bind<br />
* needs a normal internet address here<br />
*<br />
* return 0 on success<br />
*/<br />
int ibw_bind(struct ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* works like socket listen<br />
* non-blocking<br />
* enables accepting incoming connections (after IBWS_READY)<br />
* (it doesn't touch ctx->state by itself)<br />
*<br />
* returns 0 on success<br />
*/<br />
int ibw_listen(struct ibw_ctx *ctx, int backlog);<br />
<br />
/*<br />
* works like socket accept<br />
* initializes a connection to a client<br />
* must be called when state=IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*<br />
* Important: you won't get remote IP address (only internal conn info)<br />
*/<br />
int ibw_accept(struct ibw_ctx *ctx, struct ibw_conn *conn, void *conn_userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* can be called within IBWS_READY|IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*/<br />
int ibw_connect(struct ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata);<br />
<br />
/*<br />
* Sends out a disconnect request.<br />
* You should process fds after calling this function<br />
* and then process it with ibw_process_event normally<br />
* until you get conn->state = IBWC_DISCONNECTED<br />
*<br />
* You mustn't talloc_free <conn> yet right after this,<br />
* first wait for IBWC_DISCONNECTED.<br />
*/<br />
int ibw_disconnect(struct ibw_conn *conn);<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/*<br />
* You have to use this buf to fill in before send.<br />
* It's just to avoid a memcpy in ibw_send.<br />
* Use the same (buf, key) pair with ibw_send.<br />
* Don't use more space than max_msg_size (see ibw_init).<br />
*<br />
* Returns 0 on success.<br />
*/<br />
int ibw_alloc_send_buf(struct ibw_conn *conn, void **buf, void **key, int n);<br />
<br />
/*<br />
* Send the message in one<br />
* Can be invoked any number of times (should fit into buffers) and at any time<br />
* (in conn->state=IBWC_CONNECTED)<br />
* n must be less than or equal to max_msg_size (see ibw_init)<br />
*<br />
* You mustn't use (buf, key) any more for sending.<br />
*/<br />
int ibw_send(struct ibw_conn *conn, void *buf, void *key, int n);<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-NULL, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError(void);</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1834CTDB Project ibwrapper2006-12-13T14:05:00Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Unix SMB/CIFS implementation.<br />
* Wrap Infiniband calls.<br />
*<br />
* Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006<br />
*<br />
* Major code contributions by Peter Somogyi <psomogyi@gamax.hu><br />
*<br />
* This program is free software; you can redistribute it and/or modify<br />
* it under the terms of the GNU General Public License as published by<br />
* the Free Software Foundation; either version 2 of the License, or<br />
* (at your option) any later version.<br />
*<br />
* This program is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br />
* GNU General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU General Public License<br />
* along with this program; if not, write to the Free Software<br />
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.<br />
*/<br />
<br />
/* Server communication state */<br />
enum ibw_state_ctx {<br />
IBWS_INIT = 0, /* ctx start - after ibw_init */<br />
IBWS_READY, /* after ibw_bind & ibw_listen */<br />
IBWS_CONNECT_REQUEST, /* after [IBWS_READY + incoming request] */<br />
/* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */<br />
IBWS_STOPPED, /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */<br />
IBWS_ERROR /* abnormal state; ibw_stop must be called after this */<br />
};<br />
<br />
/* Connection state */<br />
struct ibw_ctx {<br />
void *ctx_userdata; /* see ibw_init */<br />
<br />
enum ibw_state_ctx state;<br />
void *internal;<br />
<br />
struct ibw_conn *conn_list; /* 1st elem of double linked list */<br />
};<br />
<br />
enum ibw_state_conn {<br />
IBWC_INIT = 0, /* conn start - internal state */<br />
IBWC_CONNECTED, /* after ibw_accept or ibw_connect */<br />
IBWC_DISCONNECTED, /* after ibw_disconnect */<br />
IBWC_ERROR<br />
};<br />
<br />
struct ibw_conn {<br />
struct ibw_ctx *ctx;<br />
enum ibw_state_conn state;<br />
<br />
void *conn_userdata; /* see ibw_connect and ibw_accept */<br />
void *internal;<br />
<br />
struct ibw_conn *prev, *next;<br />
};<br />
<br />
/*<br />
* (name, value) pair for array param of ibw_init<br />
*/<br />
struct ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
};<br />
<br />
/*<br />
* Callback function definition which should inform you about<br />
* connection state change<br />
* This callback is invoked whenever server or client connection changes.<br />
* Both <conn> and <ctx> can be NULL if their state didn't change.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_connstate_fn_t)(struct ibw_ctx *ctx, struct ibw_conn *conn);<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* This callback is invoked whenever any message arrives.<br />
* Return nonzero on error.<br />
*<br />
* Important: you mustn't store buf pointer for later use.<br />
* Process its contents before returning.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(struct ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* max_send_wr [default is 256]<br />
* max_recv_wr [default is 1024]<br />
* <...><br />
*<br />
* Must be called _ONCE_ for each node.<br />
*<br />
* max_msg_size is the maximum size of a message<br />
* (max_send_wr + max_recv_wr) * max_msg_size bytes allocated per connection<br />
*<br />
* returns non-NULL on success<br />
*<br />
* talloc_free must be called for the result in IBWS_STOPPED;<br />
* it will close resources by destructor<br />
* connections(ibw_conn *) must have been closed prior talloc_free<br />
*/<br />
struct ibw_ctx *ibw_init(struct ibw_initattr *attr, int nattr,<br />
void *ctx_userdata,<br />
ibw_connstate_fn_t ibw_connstate,<br />
ibw_receive_fn_t ibw_receive,<br />
struct event_context *ectx,<br />
int max_msg_size);<br />
<br />
/*<br />
* Must be called in states of (IBWS_ERROR, IBWS_READY, IBWS_CONNECT_REQUEST)<br />
*<br />
* It will send out disconnect requests and free up ibw_conn structures.<br />
* The ctx->state will transit to IBWS_STOPPED after every conn are disconnected.<br />
* During that time, you mustn't send/recv/disconnect any more.<br />
* Only after ctx->state=IBWS_STOPPED you can talloc_free the ctx.<br />
*/<br />
int ibw_stop(struct ibw_ctx *ctx);<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
<br />
/*<br />
* works like socket bind<br />
* needs a normal internet address here<br />
*<br />
* return 0 on success<br />
*/<br />
int ibw_bind(struct ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* works like socket listen<br />
* non-blocking<br />
* enables accepting incoming connections (after IBWS_READY)<br />
* (it doesn't touch ctx->state by itself)<br />
*<br />
* returns 0 on success<br />
*/<br />
int ibw_listen(struct ibw_ctx *ctx, int backlog);<br />
<br />
/*<br />
* works like socket accept<br />
* initializes a connection to a client<br />
* must be called when state=IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*<br />
* Important: you won't get remote IP address (only internal conn info)<br />
*/<br />
int ibw_accept(struct ibw_ctx *ctx, struct ibw_conn *conn, void *conn_userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* can be called within IBWS_READY|IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*/<br />
int ibw_connect(struct ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata);<br />
<br />
/*<br />
* Sends out a disconnect request.<br />
* You should process fds after calling this function<br />
* and then process it with ibw_process_event normally<br />
* until you get conn->state = IBWC_DISCONNECTED<br />
*<br />
* You mustn't talloc_free <conn> yet right after this,<br />
* first wait for IBWC_DISCONNECTED.<br />
*/<br />
int ibw_disconnect(struct ibw_conn *conn);<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/*<br />
* You have to use this buf to fill in before send.<br />
* It's just to avoid a memcpy in ibw_send.<br />
* Use the same (buf, key) pair with ibw_send.<br />
* Don't use more space than max_msg_size (see ibw_init).<br />
*<br />
* Returns 0 on success.<br />
*/<br />
int ibw_alloc_send_buf(struct ibw_conn *conn, void **buf, void **key);<br />
<br />
/*<br />
* Send the message in one<br />
* Can be invoked any times (should fit into buffers) and at any time<br />
* (in conn->state=IBWC_CONNECTED)<br />
* n must be less or equal than max_msg_size (see ibw_init)<br />
*<br />
* You mustn't use (buf, key) any more for sending.<br />
*/<br />
int ibw_send(struct ibw_conn *conn, void *buf, void *key, int n);<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-NULL, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError(void);</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1833CTDB Project ibwrapper2006-12-13T10:06:55Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Unix SMB/CIFS implementation.<br />
* Wrap Infiniband calls.<br />
*<br />
* Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006<br />
*<br />
* Major code contributions by Peter Somogyi <psomogyi@gamax.hu><br />
*<br />
* This program is free software; you can redistribute it and/or modify<br />
* it under the terms of the GNU General Public License as published by<br />
* the Free Software Foundation; either version 2 of the License, or<br />
* (at your option) any later version.<br />
*<br />
* This program is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br />
* GNU General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU General Public License<br />
* along with this program; if not, write to the Free Software<br />
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.<br />
*/<br />
<br />
/* Server communication state */<br />
enum ibw_state_ctx {<br />
IBWS_INIT = 0, /* ctx start - after ibw_init */<br />
IBWS_READY, /* after ibw_bind & ibw_listen */<br />
IBWS_CONNECT_REQUEST, /* after [IBWS_READY + incoming request] */<br />
/* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */<br />
IBWS_STOPPED, /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */<br />
IBWS_ERROR /* abnormal state; ibw_stop must be called after this */<br />
};<br />
<br />
/* Connection state */<br />
struct ibw_ctx { /* context handle; a pointer to it is what ibw_init returns */<br />
void *ctx_userdata; /* see ibw_init */<br />
<br />
enum ibw_state_ctx state; /* one of the IBWS_* values; the tag is an enum, not a struct */<br />
void *internal; /* NOTE(review): untyped; presumably implementation-private data - confirm */<br />
<br />
struct ibw_conn *conn_list; /* 1st elem of double linked list */<br />
};<br />
<br />
enum ibw_state_conn {<br />
IBWC_INIT = 0, /* conn start - internal state */<br />
IBWC_CONNECTED, /* after ibw_accept or ibw_connect */<br />
IBWC_DISCONNECTED, /* after ibw_disconnect */<br />
IBWC_ERROR<br />
};<br />
<br />
struct ibw_conn { /* one connection; handed to the connection-state and receive callbacks */<br />
struct ibw_ctx *ctx; /* NOTE(review): presumably the context whose conn_list holds this conn - confirm */<br />
enum ibw_state_conn state; /* one of the IBWC_* values; the tag is an enum, not a struct */<br />
<br />
void *conn_userdata; /* see ibw_connect and ibw_accept */<br />
void *internal; /* NOTE(review): untyped; presumably implementation-private data - confirm */<br />
<br />
struct ibw_conn *prev, *next; /* both links must be pointers: a struct cannot contain itself by value */<br />
};<br />
<br />
/*<br />
* (name, value) pair for array param of ibw_init<br />
*/<br />
struct ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
};<br />
<br />
/*<br />
* Callback type which informs you about a connection state change.<br />
* This callback is invoked whenever the server (ctx) or a client connection changes state.<br />
* Either <conn> or <ctx> can be NULL if its state didn't change.<br />
* Return nonzero on error.<br />
*<br />
* The parameters are spelled with struct tags because this header declares<br />
* struct ibw_ctx / struct ibw_conn and no typedef names for them.<br />
*/<br />
typedef int (*ibw_connstate_fn_t)(struct ibw_ctx *ctx, struct ibw_conn *conn);<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* This callback is invoked whenever any message arrives.<br />
* Return nonzero on error.<br />
*<br />
* Important: you mustn't store buf pointer for later use.<br />
* Process its contents before returning.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(struct ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* max_send_wr [default is 256]<br />
* max_recv_wr [default is 1024]<br />
* <...><br />
*<br />
* Must be called _ONCE_ for each node.<br />
*<br />
* max_msg_size is the maximum size of a message<br />
* (max_send_wr + max_recv_wr) * max_msg_size bytes allocated per connection<br />
*<br />
* returns non-NULL on success<br />
*<br />
* talloc_free must be called for the result in IBWS_STOPPED;<br />
* it will close resources by destructor<br />
* connections(ibw_conn *) must have been closed prior talloc_free<br />
*/<br />
struct ibw_ctx *ibw_init(struct ibw_initattr *attr, int nattr,<br />
void *ctx_userdata,<br />
ibw_connstate_fn_t ibw_connstate,<br />
ibw_receive_fn_t ibw_receive,<br />
struct event_context *ectx,<br />
int max_msg_size);<br />
<br />
/*<br />
* Must be called in states of (IBWS_ERROR, IBWS_READY, IBWS_CONNECT_REQUEST)<br />
*<br />
* It will send out disconnect requests and free up ibw_conn structures.<br />
* The ctx->state will transit to IBWS_STOPPED after every conn are disconnected.<br />
* During that time, you mustn't send/recv/disconnect any more.<br />
* Only after ctx->state=IBWS_STOPPED you can talloc_free the ctx.<br />
*/<br />
int ibw_stop(struct ibw_ctx *ctx);<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
<br />
/*<br />
* works like socket bind<br />
* needs a normal internet address here<br />
*<br />
* return 0 on success<br />
*/<br />
int ibw_bind(struct ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* works like socket listen<br />
* non-blocking<br />
* enables accepting incoming connections (after IBWS_READY)<br />
* (it doesn't touch ctx->state by itself)<br />
*<br />
* returns 0 on success<br />
*/<br />
int ibw_listen(struct ibw_ctx *ctx, int backlog);<br />
<br />
/*<br />
* works like socket accept<br />
* initializes a connection to a client<br />
* must be called when state=IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*<br />
* Important: you won't get remote IP address (only internal conn info)<br />
*/<br />
int ibw_accept(struct ibw_ctx *ctx, struct ibw_conn *conn, void *conn_userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* can be called within IBWS_READY|IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*/<br />
int ibw_connect(struct ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata);<br />
<br />
/*<br />
* Sends out a disconnect request.<br />
* You should process fds after calling this function<br />
* and then process it with ibw_process_event normally<br />
* until you get conn->state = IBWC_DISCONNECTED<br />
*<br />
* You mustn't talloc_free <conn> yet right after this,<br />
* first wait for IBWC_DISCONNECTED.<br />
*/<br />
void ibw_disconnect(struct ibw_conn *conn);<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/*<br />
* You have to use this buf to fill in before send.<br />
* It's just to avoid a memcpy in ibw_send.<br />
* Use the same (buf, key) pair with ibw_send.<br />
* Don't use more space than max_msg_size (see ibw_init).<br />
*<br />
* Returns 0 on success.<br />
*/<br />
int ibw_alloc_send_buf(struct ibw_conn *conn, void **buf, void **key);<br />
<br />
/*<br />
* Send the message in one<br />
* Can be invoked any times (should fit into buffers) and at any time<br />
* (in conn->state=IBWC_CONNECTED)<br />
* n must be less or equal than max_msg_size (see ibw_init)<br />
*<br />
* You mustn't use (buf, key) any more for sending.<br />
*/<br />
int ibw_send(struct ibw_conn *conn, void *buf, void *key, int n);<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-NULL, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1832CTDB Project ibwrapper2006-12-12T18:13:28Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Unix SMB/CIFS implementation.<br />
* Wrap Infiniband calls.<br />
*<br />
* Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006<br />
*<br />
* Major code contributions by Peter Somogyi <psomogyi@gamax.hu><br />
*<br />
* This program is free software; you can redistribute it and/or modify<br />
* it under the terms of the GNU General Public License as published by<br />
* the Free Software Foundation; either version 2 of the License, or<br />
* (at your option) any later version.<br />
*<br />
* This program is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br />
* GNU General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU General Public License<br />
* along with this program; if not, write to the Free Software<br />
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.<br />
*/<br />
<br />
/* Server communication state */<br />
typedef enum {<br />
IBWS_INIT = 0, /* ctx start - after ibw_init */<br />
IBWS_READY, /* after ibw_bind & ibw_listen */<br />
IBWS_CONNECT_REQUEST, /* after [IBWS_READY + incoming request] */<br />
/* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */<br />
IBWS_STOPPED, /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */<br />
IBWS_ERROR /* abnormal state; ibw_stop must be called after this */<br />
} ibw_state_ctx;<br />
<br />
/* Connection state */<br />
typedef struct _ibw_ctx { /* context handle; a pointer to it is what ibw_init returns */<br />
void *ctx_userdata; /* see ibw_init */<br />
<br />
ibw_state_ctx state; /* one of the IBWS_* values */<br />
void *internal; /* NOTE(review): untyped; presumably implementation-private data - confirm */<br />
<br />
/* The ibw_conn typedef is only declared further down, so the struct tag is used here. */<br />
struct _ibw_conn *conn_list; /* 1st elem of double linked list */<br />
} ibw_ctx;<br />
<br />
typedef enum {<br />
IBWC_INIT = 0, /* conn start - internal state */<br />
IBWC_CONNECTED, /* after ibw_accept or ibw_connect */<br />
IBWC_DISCONNECTED, /* after ibw_disconnect */<br />
IBWC_ERROR<br />
} ibw_state_conn;<br />
<br />
typedef struct _ibw_conn { /* one connection; handed to the connection-state and receive callbacks */<br />
ibw_ctx *ctx; /* NOTE(review): presumably the context whose conn_list holds this conn - confirm */<br />
ibw_state_conn state; /* one of the IBWC_* values */<br />
<br />
void *conn_userdata; /* see ibw_connect and ibw_accept */<br />
void *internal; /* NOTE(review): untyped; presumably implementation-private data - confirm */<br />
<br />
/* The ibw_conn typedef does not exist yet inside its own definition, so the tag is used; both links are pointers. */<br />
struct _ibw_conn *prev, *next;<br />
} ibw_conn;<br />
<br />
/*<br />
* (name, value) pair for array param of ibw_init<br />
*/<br />
typedef struct _ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
} ibw_initattr;<br />
<br />
/*<br />
* Callback function definition which should inform you about<br />
* connection state change<br />
* This callback is invoked from within ibw_process_event.<br />
* Both <conn> and <ctx> can be NULL if their state didn't change.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_connstate_fn_t)(ibw_ctx *ctx, ibw_conn *conn);<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* This callback is invoked from within ibw_process_event.<br />
* Return nonzero on error.<br />
*<br />
* Important: you mustn't store buf pointer for later use. Process its contents before returning.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* max_send_wr [default is 256]<br />
* max_recv_wr [default is 1024]<br />
* <...><br />
*<br />
* Must be called _ONCE_ for each node.<br />
*<br />
* max_msg_size is the maximum size of a message<br />
* (max_send_wr + max_recv_wr) * max_msg_size bytes allocated per connection<br />
*<br />
* returns non-NULL on success<br />
*<br />
* talloc_free must be called for the result in IBWS_STOPPED;<br />
* it will close resources by destructor<br />
* connections(ibw_conn *) must have been closed prior talloc_free<br />
*/<br />
ibw_ctx *ibw_init(ibw_initattr *attr, int nattr,<br />
void *ctx_userdata,<br />
ibw_connstate_fn_t ibw_connstate,<br />
ibw_receive_fn_t ibw_receive,<br />
event_content *ectx,<br />
int max_msg_size);<br />
<br />
/*<br />
* Must be called in states of (IBWS_ERROR, IBWS_READY, IBWS_CONNECT_REQUEST)<br />
*<br />
* It will send out disconnect requests and free up ibw_conn structures.<br />
* The ctx->state will transit to IBWS_STOPPED after every conn are disconnected.<br />
* During that time, you mustn't send/recv/disconnect any more.<br />
* Only after ctx->state=IBWS_STOPPED you can talloc_free the ctx.<br />
*/<br />
int ibw_stop(ibw_ctx *ctx);<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
<br />
/*<br />
* works like socket bind<br />
* needs a normal internet address here<br />
*<br />
* return 0 on success<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* works like socket listen<br />
* non-blocking<br />
* enables accepting incoming connections (after IBWS_READY)<br />
* (it doesn't touch ctx->state by itself)<br />
*<br />
* returns 0 on success<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int backlog);<br />
<br />
/*<br />
* works like socket accept<br />
* initializes a connection to a client<br />
* must be called when state=IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*<br />
* Important: you won't get remote IP address (only internal conn info)<br />
*/<br />
int ibw_accept(ibw_ctx *ctx, ibw_conn *conn, void *conn_userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* can be called within IBWS_READY|IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*/<br />
int ibw_connect(ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata);<br />
<br />
/*<br />
* Sends out a disconnect request.<br />
* You should process fds after calling this function<br />
* and then process it with ibw_process_event normally<br />
* until you get conn->state = IBWC_DISCONNECTED<br />
*<br />
* You mustn't talloc_free <conn> yet right after this,<br />
* first wait for IBWC_DISCONNECTED.<br />
*/<br />
void ibw_disconnect(ibw_conn *conn);<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/*<br />
* You have to use this buf to fill in before send.<br />
* It's just to avoid a memcpy in ibw_send.<br />
* Use the same (buf, key) pair with ibw_send.<br />
* Don't use more space than max_msg_size (see ibw_init).<br />
*<br />
* Returns 0 on success.<br />
*/<br />
int ibw_alloc_send_buf(ibw_conn *conn, void **buf, void **key);<br />
<br />
/*<br />
* Send the message in one<br />
* Can be invoked any times (should fit into buffers) and at any time<br />
* (in conn->state=IBWC_CONNECTED)<br />
* n must be less or equal than max_msg_size (see ibw_init)<br />
*<br />
* You mustn't use (buf, key) any more for sending.<br />
*/<br />
int ibw_send(ibw_conn *conn, void *buf, void *key, int n);<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-NULL, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1826CTDB Project ibwrapper2006-12-06T17:55:23Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Unix SMB/CIFS implementation.<br />
* Wrap Infiniband calls.<br />
*<br />
* Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006<br />
*<br />
* Major code contributions by Peter Somogyi <psomogyi@gamax.hu><br />
*<br />
* This program is free software; you can redistribute it and/or modify<br />
* it under the terms of the GNU General Public License as published by<br />
* the Free Software Foundation; either version 2 of the License, or<br />
* (at your option) any later version.<br />
*<br />
* This program is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br />
* GNU General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU General Public License<br />
* along with this program; if not, write to the Free Software<br />
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.<br />
*/<br />
<br />
/* Server communication state */<br />
typedef enum {<br />
IBWS_INIT = 0, /* ctx start - after ibw_init */<br />
IBWS_READY, /* after ibw_bind & ibw_listen */<br />
IBWS_CONNECT_REQUEST, /* after [IBWS_READY + incoming request] */<br />
/* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */<br />
IBWS_STOPPED, /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */<br />
IBWS_ERROR /* abnormal state; ibw_stop must be called after this */<br />
} ibw_state_ctx;<br />
<br />
/* Connection state */<br />
typedef struct _ibw_ctx { /* context handle type */<br />
void *ctx_userdata; /* see ibw_init */<br />
<br />
ibw_state_ctx state; /* one of the IBWS_* values */<br />
void *internal; /* NOTE(review): untyped; presumably implementation-private data - confirm */<br />
<br />
/* The ibw_conn typedef is only declared further down, so the struct tag is used here. */<br />
struct _ibw_conn *conn_list; /* 1st elem of double linked list */<br />
} ibw_ctx;<br />
<br />
typedef enum {<br />
IBWC_INIT = 0, /* conn start - internal state */<br />
IBWC_CONNECTED, /* after ibw_accept or ibw_connect */<br />
IBWC_DISCONNECTED, /* after ibw_disconnect */<br />
IBWC_ERROR<br />
} ibw_state_conn;<br />
<br />
typedef struct _ibw_conn {<br />
ibw_ctx *ctx;<br />
ibw_state_conn state; /* one of the IBWC_* values */<br />
<br />
void *conn_userdata; /* see ibw_connect and ibw_accept */<br />
void *internal; /* opaque pointer, not interpreted by this header */<br />
<br />
/* Both links must be pointers; the struct tag is used because the ibw_conn typedef is incomplete inside its own definition. */<br />
struct _ibw_conn *prev, *next;<br />
} ibw_conn;<br />
<br />
/*<br />
* (name, value) pair for array param of ibw_init<br />
*/<br />
typedef struct _ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
} ibw_initattr;<br />
<br />
/*<br />
* Callback function definition which should inform you about<br />
* connection state change<br />
* This callback is invoked from within ibw_process_event.<br />
* Both <conn> and <ctx> can be NULL if their state didn't change.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_connstate_fn_t)(ibw_ctx *ctx, ibw_conn *conn);<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* This callback is invoked from within ibw_process_event.<br />
* Return nonzero on error.<br />
*<br />
* Important: you mustn't store buf pointer for later use. Process its contents before returning.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* dev_name [default is the first one]<br />
* rx_depth [default is 500]<br />
* mtu [default is 1024]<br />
* ib_port [default is 1]<br />
*<br />
* Must be called _ONCE_ for each node.<br />
*<br />
* returns non-NULL on success<br />
*<br />
* talloc_free must be called for the result in IBWS_STOPPED;<br />
* it will close resources by destructor<br />
* connections(ibw_conn *) must have been closed prior to talloc_free<br />
*/<br />
ibw_ctx *ibw_init(ibw_initattr *attr, int nattr,<br />
void *ctx_userdata,<br />
ibw_connstate_fn_t ibw_connstate,<br />
ibw_receive_fn_t ibw_receive,<br />
event_content *ectx);<br />
<br />
/*<br />
* Must be called in states of (IBWS_ERROR, IBWS_READY, IBWS_CONNECT_REQUEST)<br />
*<br />
* It will send out disconnect requests and free up ibw_conn structures.<br />
* The ctx->state will transition to IBWS_STOPPED after every conn is disconnected.<br />
* During that time, you mustn't send/recv/disconnect any more.<br />
* Only after ctx->state=IBWS_STOPPED you can talloc_free the ctx.<br />
*/<br />
int ibw_stop(ibw_ctx *ctx);<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
<br />
/*<br />
* works like socket bind<br />
* needs a normal internet address here<br />
*<br />
* return 0 on success<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* works like socket listen<br />
* non-blocking<br />
* enables accepting incoming connections (after IBWS_READY)<br />
* (it doesn't touch ctx->state by itself)<br />
*<br />
* returns 0 on success<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int backlog);<br />
<br />
/*<br />
* works like socket accept<br />
* initializes a connection to a client<br />
* must be called when state=IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*<br />
* Important: you won't get remote IP address (only internal conn info)<br />
*/<br />
int ibw_accept(ibw_ctx *ctx, ibw_conn *conn, void *conn_userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* can be called within IBWS_READY|IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*/<br />
int ibw_connect(ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata);<br />
<br />
/*<br />
* Sends out a disconnect request.<br />
* You should process fds after calling this function<br />
* and then process it with ibw_process_event normally<br />
* until you get conn->state = IBWC_DISCONNECTED<br />
*<br />
* You mustn't talloc_free <conn> yet right after this,<br />
* first wait for IBWC_DISCONNECTED.<br />
*/<br />
void ibw_disconnect(ibw_conn *conn);<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/*<br />
* You have to use this buf to fill in before send.<br />
* It's just to avoid memcpy in ibw_send.<br />
* Use the same (buf, key) pair with ibw_send.<br />
*<br />
* Returns 0 on success.<br />
*/<br />
int ibw_alloc_send_buf(ibw_conn *conn, void **buf, void **key, int n);<br />
<br />
/*<br />
* Send the message in one<br />
* Can be invoked any number of times (should fit into buffers) and at any time<br />
* (in conn->state=IBWC_CONNECTED)<br />
*<br />
* You mustn't use (buf, key) any more for sending.<br />
*/<br />
int ibw_send(ibw_conn *conn, void *buf, void *key, int n);<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-zero, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1817CTDB Project ibwrapper2006-12-04T13:17:28Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Unix SMB/CIFS implementation.<br />
* Wrap Infiniband calls.<br />
*<br />
* Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006<br />
*<br />
* Major code contributions by Peter Somogyi <psomogyi@gamax.hu><br />
*<br />
* This program is free software; you can redistribute it and/or modify<br />
* it under the terms of the GNU General Public License as published by<br />
* the Free Software Foundation; either version 2 of the License, or<br />
* (at your option) any later version.<br />
*<br />
* This program is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br />
* GNU General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU General Public License<br />
* along with this program; if not, write to the Free Software<br />
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.<br />
*/<br />
<br />
/* Server communication state */<br />
typedef enum {<br />
IBWS_INIT = 0, /* ctx start - after ibw_init */<br />
IBWS_READY, /* after ibw_bind & ibw_listen */<br />
IBWS_CONNECT_REQUEST, /* after [IBWS_READY + incoming request] */<br />
/* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */<br />
IBWS_STOPPED, /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */<br />
IBWS_ERROR /* abnormal state; ibw_stop must be called after this */<br />
} ibw_state_ctx;<br />
<br />
/* Connection state */<br />
typedef struct _ibw_ctx {<br />
void *ctx_userdata; /* see ibw_init */<br />
<br />
ibw_state_ctx state;<br />
void *internal;<br />
} ibw_ctx;<br />
<br />
typedef enum {<br />
IBWC_INIT = 0, /* conn start - internal state */<br />
IBWC_CONNECTED, /* after ibw_accept or ibw_connect */<br />
IBWC_DISCONNECTED, /* after ibw_disconnect */<br />
IBWC_ERROR<br />
} ibw_state_conn;<br />
<br />
typedef struct _ibw_conn {<br />
ibw_ctx *ctx;<br />
ibw_state_conn state;<br />
<br />
void *conn_userdata; /* see ibw_connect and ibw_accept */<br />
void *internal;<br />
} ibw_conn;<br />
<br />
/*<br />
* (name, value) pair for array param of ibw_init<br />
*/<br />
typedef struct _ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
} ibw_initattr;<br />
<br />
/*<br />
* Callback function definition which should inform you about<br />
* connection state change<br />
* This callback is invoked from within ibw_process_event.<br />
* Both <conn> and <ctx> can be NULL if their state didn't change.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_connstate_fn_t)(ibw_ctx *ctx, ibw_conn *conn);<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* This callback is invoked from within ibw_process_event.<br />
* Return nonzero on error.<br />
*<br />
* Important: you mustn't store buf pointer for later use. Process<br />
* its contents before returning.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* dev_name [default is the first one]<br />
* rx_depth [default is 500]<br />
* mtu [default is 1024]<br />
* ib_port [default is 1]<br />
*<br />
* Must be called _ONCE_ for each node.<br />
*<br />
* returns non-NULL on success<br />
*<br />
* talloc_free must be called for the result in IBWS_STOPPED;<br />
* it will close resources by destructor<br />
* connections(ibw_conn *) must have been closed prior to talloc_free<br />
*/<br />
ibw_ctx *ibw_init(ibw_initattr *attr, int nattr,<br />
void *ctx_userdata,<br />
ibw_connstate_fn_t ibw_connstate,<br />
ibw_receive_fn_t ibw_receive);<br />
<br />
/*<br />
* Must be called in states of (IBWS_ERROR, IBWS_READY, IBWS_CONNECT_REQUEST)<br />
*<br />
* It will send out disconnect requests and free up ibw_conn structures.<br />
* The ctx->state will transition to IBWS_STOPPED after every conn is disconnected.<br />
* During that time, you mustn't send/recv/disconnect any more.<br />
* Only after ctx->state=IBWS_STOPPED you can talloc_free the ctx.<br />
*/<br />
int ibw_stop(ibw_ctx *ctx);<br />
<br />
/*<br />
* works like socket bind<br />
* needs a normal internet address here<br />
*<br />
* return 0 on success<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* works like socket listen<br />
* non-blocking<br />
* enables accepting incoming connections (after IBWS_READY)<br />
* (it doesn't touch ctx->state by itself)<br />
*<br />
* returns 0 on success<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int backlog);<br />
<br />
/*<br />
* works like socket accept<br />
* non-blocking<br />
* initializes a connection to a client<br />
* must be called when state=IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*<br />
* Important: you won't get remote IP address (only internal conn info)<br />
*/<br />
int ibw_accept(ibw_ctx *ctx, void *conn_userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* can be called within IBWS_READY|IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*/<br />
int ibw_connect(ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata);<br />
<br />
/*<br />
* Sends out a disconnect request.<br />
* You should process fds after calling this function<br />
* and then process it with ibw_process_event normally<br />
* until you get conn->state = IBWC_DISCONNECTED<br />
*<br />
* You mustn't talloc_free <conn> yet right after this,<br />
* first wait for IBWC_DISCONNECTED.<br />
*/<br />
void ibw_disconnect(ibw_conn *conn);<br />
<br />
/*<br />
* You have to use this buf to fill in before send.<br />
* It's just to avoid memcpy in ibw_send.<br />
* Use the same (buf, key) pair with ibw_send.<br />
*<br />
* Returns 0 on success.<br />
*/<br />
int ibw_alloc_send_buf(ibw_conn *conn, void **buf, void **key, int n);<br />
<br />
/*<br />
* Send the message in one<br />
* Can be invoked any number of times (should fit into buffers) and at any time<br />
* (in conn->state=IBWC_CONNECTED)<br />
*<br />
* You mustn't use (buf, key) any more for sending.<br />
*/<br />
int ibw_send(ibw_conn *conn, void *buf, void *key, int n);<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-zero, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1814CTDB Project ibwrapper2006-11-30T18:10:58Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Infiniband Verbs API socket-like wrapper<br />
* Copyright (C) Peter Somogyi 2006<br />
*<br />
* This library is free software; you can redistribute it and/or<br />
* modify it under the terms of the GNU Lesser General Public<br />
* License as published by the Free Software Foundation; either<br />
* version 2 of the License, or (at your option) any later version.<br />
*<br />
* This library is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br />
* Lesser General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU Lesser General Public<br />
* License along with this library; if not, write to the Free Software<br />
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA<br />
*/<br />
<br />
/* Server communication state */<br />
typedef enum {<br />
IBWS_INIT = 0, /* ctx start - after ibw_init */<br />
IBWS_READY, /* after ibw_bind & ibw_listen */<br />
IBWS_CONNECT_REQUEST, /* after [IBWS_READY + incoming request] */<br />
/* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */<br />
IBWS_STOPPED, /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */<br />
IBWS_ERROR /* abnormal state; ibw_stop must be called after this */<br />
} ibw_state_ctx;<br />
<br />
/* Connection state */<br />
typedef struct _ibw_ctx {<br />
void *ctx_userdata; /* see ibw_init */<br />
<br />
ibw_state_ctx state;<br />
void *internal;<br />
} ibw_ctx;<br />
<br />
typedef enum {<br />
IBWC_INIT = 0, /* conn start - internal state */<br />
IBWC_CONNECTED, /* after ibw_accept or ibw_connect */<br />
IBWC_DISCONNECTED, /* after ibw_disconnect */<br />
IBWC_ERROR<br />
} ibw_state_conn;<br />
<br />
typedef struct _ibw_conn {<br />
ibw_ctx *ctx;<br />
ibw_state_conn state;<br />
<br />
void *conn_userdata; /* see ibw_connect and ibw_accept */<br />
void *internal;<br />
} ibw_conn;<br />
<br />
/*<br />
* (name, value) pair for array param of ibw_init<br />
*/<br />
typedef struct _ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
} ibw_initattr;<br />
<br />
/*<br />
* Callback function definition which should inform you about<br />
* connection state change<br />
* This callback is invoked from within ibw_process_event.<br />
* Both <conn> and <ctx> can be NULL if their state didn't change.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_connstate_fn_t)(ibw_ctx *ctx, ibw_conn *conn);<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* This callback is invoked from within ibw_process_event.<br />
* Return nonzero on error.<br />
*<br />
* Important: you mustn't store buf pointer for later use. Process its contents before returning.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* Callback function to request ibw_process_event to be called with fd_index<br />
* when this fd is set.<br />
* This is necessary because upper layer shouldn't be aware of when to use that fd.<br />
* fd_index is a unique value (normally index of the internal fd array)<br />
*<br />
* It's up to the actual implementation of this interface when this callback is called<br />
* (e.g. in ibw_init, ibw_bind, ibw_listen ...)<br />
*/<br />
typedef int (*ibw_add_event_fn_t)(int fd, int fd_index, void *ctx_userdata);<br />
<br />
/*<br />
* Callback function to request this fd shouldn't be used any more.<br />
*<br />
* fd_index is a unique value (normally index of the internal fd array) - upper<br />
* layer possibly needn't use this.<br />
*<br />
* It's up to the actual implementation of this interface when this callback is called<br />
* (e.g. in ibw_destroy, talloc_free, ...)<br />
*/<br />
typedef int (*ibw_rm_event_fn_t)(int fd, int fd_index, void *ctx_userdata);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* dev_name [default is the first one]<br />
* rx_depth [default is 500]<br />
* mtu [default is 1024]<br />
* ib_port [default is 1]<br />
*<br />
* Must be called _ONCE_ for each node.<br />
*<br />
* returns non-NULL on success<br />
*<br />
* talloc_free must be called for the result in IBWS_STOPPED;<br />
* it will close resources by destructor<br />
* connections(ibw_conn *) must have been closed prior to talloc_free<br />
*/<br />
ibw_ctx *ibw_init(ibw_initattr *attr, int nattr,<br />
void *ctx_userdata,<br />
ibw_connstate_fn_t ibw_connstate,<br />
ibw_receive_fn_t ibw_receive,<br />
ibw_add_event_fn_t ibw_add_event,<br />
ibw_rm_event_fn_t ibw_rm_event);<br />
<br />
/*<br />
* Must be called in states of (IBWS_ERROR, IBWS_READY, IBWS_CONNECT_REQUEST)<br />
*<br />
* It will send out disconnect requests and free up ibw_conn structures.<br />
* The ctx->state will transition to IBWS_STOPPED after every conn is disconnected.<br />
* During that time, you mustn't send/recv/disconnect any more.<br />
* Only after ctx->state=IBWS_STOPPED you can talloc_free the ctx.<br />
*/<br />
int ibw_stop(ibw_ctx *ctx);<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
<br />
/*<br />
* works like socket bind<br />
* needs a normal internet address here<br />
*<br />
* return 0 on success<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* works like socket listen<br />
* non-blocking<br />
* enables accepting incoming connections (after IBWS_READY)<br />
* (it doesn't touch ctx->state by itself)<br />
*<br />
* returns 0 on success<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int backlog);<br />
<br />
/*<br />
* works like socket accept<br />
* initializes a connection to a client<br />
* must be called when state=IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*<br />
* Important: you won't get remote IP address (only internal conn info)<br />
*/<br />
int ibw_accept(ibw_ctx *ctx, void *conn_userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* can be called within IBWS_READY|IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*/<br />
int ibw_connect(ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata);<br />
<br />
/*<br />
* Sends out a disconnect request.<br />
* You should process fds after calling this function<br />
* and then process it with ibw_process_event normally<br />
* until you get conn->state = IBWC_DISCONNECTED<br />
*<br />
* You mustn't talloc_free <conn> yet right after this,<br />
* first wait for IBWC_DISCONNECTED.<br />
*/<br />
void ibw_disconnect(ibw_conn *conn);<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/* <br />
* Must be called in all cases after selecting/polling<br />
* for FDs set via ibw_add_event_fn_t.<br />
*<br />
* fd_index: fd identifier passed in ibw_add_event_fn_t<br />
* with the same fd was set there.<br />
*/<br />
int ibw_process_event(ibw_ctx *ctx, int fd_index);<br />
<br />
/*<br />
* You have to use this buf to fill in before send.<br />
* It's just to avoid memcpy in ibw_send.<br />
* Use the same (buf, key) pair with ibw_send.<br />
*<br />
* Returns 0 on success.<br />
*/<br />
int ibw_alloc_send_buf(ibw_conn *conn, void **buf, void **key, int n);<br />
<br />
/*<br />
* Send the message in one<br />
* Can be invoked any number of times (should fit into buffers) and at any time<br />
* (in conn->state=IBWC_CONNECTED)<br />
*<br />
* You mustn't use (buf, key) any more for sending.<br />
*/<br />
int ibw_send(ibw_conn *conn, void *buf, void *key, int n);<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-zero, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project&diff=1813CTDB Project2006-11-30T16:15:21Z<p>Psomogyi@gamax.hu: /* The basic features we want from a messaging API are: */</p>
<hr />
<div>= CTDB Project =<br />
<br />
This project aims to produce an implementation of the CTDB protocol described in the [[Samba & Clustering]] page<br />
<br />
== Project Members ==<br />
<br />
Sven Oehme (project leader) <br />
Andrew Tridgell (technical lead)<br />
Alexander Bokovoy<br />
Aleksey Fedoseev<br />
Jim McDonough<br />
Peter Somogyi<br />
<br />
== Project Outline ==<br />
<br />
The initial work will focus on an implementation as part of tdb itself. Integration with the Samba source tree will happen at a later date. Work will probably happen in a bzr tree, but the details have not been worked out yet. Check back here for updates.<br />
<br />
= Project Tasks =<br />
<br />
== Hardware acceleration ==<br />
<br />
(note: Peter is looking at this one)<br />
<br />
We want CTDB to be very fast on hardware that supports fast messaging. In particular we are interested in good use of infiniband adapters, where we expect to get messaging latencies of the order of 3 to 5 microseconds. <br />
<br />
From discussions so far it looks like the 'verbs' API, perhaps with a modification to allow us to hook it into epoll(), will be the right choice. Basic information on this API is available at https://openib.org/tiki/tiki-index.php<br />
===The basic features we want from a messaging API are:===<br />
<br />
* low latency. We would like to get it down to just a few microseconds per message. Messages will vary in size, but typically be small (say between 64 and 512 bytes).<br />
<br />
* non-blocking. We would really like an API that hooks into poll, so we can use epoll(), poll() or select(). <br />
<br />
* If we can't have an API that hooks into poll() or epoll(), then a callback or signal based API would do if the overheads are small enough. In the same code we also need to be working on a unix domain socket (datagram socket) so we'd like the overhead of dealing with both the infiniband messages and the local datagrams to be low.<br />
<br />
* What we definitely don't want to use is an API that chews a lot of CPU. So we don't want to be spinning in userspace on a set of mapped registers in the hope that a message might come along. The CPU will be needed for other tasks. Using mapped registers for send would probably be fine, but we'd probably need some kernel-mediated mechanism for receive unless you can suggest a way to avoid it.<br />
<br />
* ideally we'd have reliable delivery, or at least be told when delivery has failed on a send, but if that is too expensive then we'll do our own reliable delivery mechanism.<br />
<br />
* we need to be able to add/remove nodes from the cluster. The Samba clustering code will have its own recovery protocol.<br />
<br />
* a 'message' like API would suit us better than a 'remote DMA' style API, unless the remote DMA API is significantly more efficient. Ring buffers would be fine.<br />
<br />
An abstract interface can be found here: [[CTDB_Project_ibwrapper]] Please note this interface should be able to cover more possible implementations.<br />
<br />
===TODOs regarding this interface:===<br />
<br />
* verify implementability<br />
* reduction<br />
<br />
== Flesh out CTDB API ==<br />
<br />
(note: Alexander and Aleksey are looking at this)<br />
<br />
By this I mean the C api in the "Clustered TDB API" section of the<br />
wiki page. The API as given there now is missing some pieces, and I<br />
think can be greatly improved.<br />
<br />
This is likely to feed back into the CTDB protocol description as<br />
well. Ideally we'd get rid of these calls:<br />
<br />
CTDB_REQ_FETCH_LOCKED<br />
CTDB_REPLY_FETCH_LOCKED<br />
CTDB_REQ_UNLOCK<br />
CTDB_REPLY_UNLOCK<br />
<br />
assuming we can demonstrate they aren't needed. I also think we can<br />
combine the CTDB_REQ_CONDITIONAL_APPEND and the CTDB_REQ_FETCH call<br />
into a single CTDB_REQ_REQUEST call which takes a key, a blob of data<br />
and a condition ID as input, and returns a blob of data and a status<br />
code as output. For a fetch call the input blob of data would be zero<br />
length.<br />
<br />
== Code s3/s4 opendb and brlock on top of ctdb api ==<br />
<br />
Whoever does this would pick either s3 or s4 initially, I don't think<br />
there is any point in doing them in parallel (we are bound to make the<br />
same sorts of mistakes on both if we did that).<br />
<br />
This will also feed a lot into the previous line item, working out the API.<br />
<br />
== Code CTDB api on top of dumb tdb ==<br />
<br />
This also feeds into the API discussion. It should be a very simple<br />
and dumb implementation, aiming to be used to allow the s3/s4<br />
implementation to have something to test against.<br />
<br />
== Prototype CTDB library on top of UDP/TCP ==<br />
<br />
(note: tridge is looking at this task)<br />
<br />
The initial implementation of the CTDB protocol will be on top of UDP/TCP<br />
<br />
Status: prototype work on this has begun. You can watch progress at http://build.samba.org/?tree=ctdb&function=Recent+Checkins<br />
<br />
== Setup standalone test environment ==<br />
<br />
This test environment is meant for non-clustered usage, instead emulating a cluster using<br />
IP on loopback. It will need to run multiple instances talking over 127.0.0.X interfaces. <br />
This will involve some shell scripting, plus some work on<br />
adding/removing nodes from the cluster. It might be easiest to add a<br />
CTDB protocol request asking a node to 'go quiet', then asking it to<br />
become active again later to simulate a node dying and coming back.<br />
<br />
== Code CTDB test suite ==<br />
(note: jim is looking at this one)<br />
<br />
This reflects the fact that I want this project to concentrate on<br />
building ctdb on tdb + messaging, and not concentrate on the "whole<br />
problem" involving Samba until later. We'll do a basic s3/s4 backend<br />
implementation to make sure the ideas can work, but I want the major<br />
testing effort to involve simple tests directly against the ctdb<br />
API. It will be so much easier to simulate exotic error conditions<br />
that way.<br />
<br />
== Flesh out recovery part of ctdb protocol == <br />
<br />
(note: tridge is looking at this one)<br />
<br />
I explained on the phone that I think the simplest recovery process<br />
will be something like this:<br />
<br />
* global sync and pick 'master' for recovery<br />
* every node sends all records from its local tdb to the LMASTER<br />
* master waits till all nodes say they are done<br />
* global sync and restart<br />
<br />
The recovery phase will need to very carefully cope with lots of<br />
corner cases, like when a node goes down during recovery.<br />
<br />
== Work out details for persistent tdbs ==<br />
<br />
this will need some more thought - it's not our top priority, but<br />
eventually the long lived databases will matter.<br />
<br />
== Wireshark dissector ==<br />
<br />
We'll need a wireshark dissector, but only once the protocol settles down a little.</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project&diff=1812CTDB Project2006-11-30T16:12:14Z<p>Psomogyi@gamax.hu: /* TODOs regarding this interface: */</p>
<hr />
<div>= CTDB Project =<br />
<br />
This project aims to produce an implementation of the CTDB protocol described in the [[Samba & Clustering]] page<br />
<br />
== Project Members ==<br />
<br />
Sven Oehme (project leader) <br />
Andrew Tridgell (technical lead)<br />
Alexander Bokovoy<br />
Aleksey Fedoseev<br />
Jim McDonough<br />
Peter Somogyi<br />
<br />
== Project Outline ==<br />
<br />
The initial work will focus on an implementation as part of tdb itself. Integration with the Samba source tree will happen at a later date. Work will probably happen in a bzr tree, but the details have not been worked out yet. Check back here for updates.<br />
<br />
= Project Tasks =<br />
<br />
== Hardware acceleration ==<br />
<br />
(note: Peter is looking at this one)<br />
<br />
We want CTDB to be very fast on hardware that supports fast messaging. In particular we are interested in good use of infiniband adapters, where we expect to get messaging latencies of the order of 3 to 5 microseconds. <br />
<br />
From discussions so far it looks like the 'verbs' API, perhaps with a modification to allow us to hook it into epoll(), will be the right choice. Basic information on this API is available at https://openib.org/tiki/tiki-index.php<br />
===The basic features we want from a messaging API are:===<br />
<br />
* low latency. We would like to get it down to just a few microseconds per message. Messages will vary in size, but typically be small (say between 64 and 512 bytes).<br />
<br />
* non-blocking. We would really like an API that hooks into poll, so we can use epoll(), poll() or select(). <br />
<br />
* If we can't have an API that hooks into poll() or epoll(), then a callback or signal based API would do if the overheads are small enough. In the same code we also need to be working on a unix domain socket (datagram socket) so we'd like the overhead of dealing with both the infiniband messages and the local datagrams to be low.<br />
<br />
* What we definitely don't want to use is an API that chews a lot of CPU. So we don't want to be spinning in userspace on a set of mapped registers in the hope that a message might come along. The CPU will be needed for other tasks. Using mapped registers for send would probably be fine, but we'd probably need some kernel-mediated mechanism for receive unless you can suggest a way to avoid it.<br />
<br />
* ideally we'd have reliable delivery, or at least be told when delivery has failed on a send, but if that is too expensive then we'll do our own reliable delivery mechanism.<br />
<br />
* we need to be able to add/remove nodes from the cluster. The Samba clustering code will have its own recovery protocol.<br />
<br />
* a 'message' like API would suit us better than a 'remote DMA' style API, unless the remote DMA API is significantly more efficient. Ring buffers would be fine.<br />
<br />
An abstract interface can be found here: [[CTDB_Project_ibwrapper]]<br />
<br />
===TODOs regarding this interface:===<br />
<br />
* verify implementability<br />
* reduction<br />
<br />
== Flesh out CTDB API ==<br />
<br />
(note: Alexander and Aleksey are looking at this)<br />
<br />
By this I mean the C api in the "Clustered TDB API" section of the<br />
wiki page. The API as given there now is missing some pieces, and I<br />
think can be greatly improved.<br />
<br />
This is likely to feed back into the CTDB protocol description as<br />
well. Ideally we'd get rid of these calls:<br />
<br />
CTDB_REQ_FETCH_LOCKED<br />
CTDB_REPLY_FETCH_LOCKED<br />
CTDB_REQ_UNLOCK<br />
CTDB_REPLY_UNLOCK<br />
<br />
assuming we can demonstrate they aren't needed. I also think we can<br />
combine the CTDB_REQ_CONDITIONAL_APPEND and the CTDB_REQ_FETCH call<br />
into a single CTDB_REQ_REQUEST call which takes a key, a blob of data<br />
and a condition ID as input, and returns a blob of data and a status<br />
code as output. For a fetch call the input blob of data would be zero<br />
length.<br />
<br />
== Code s3/s4 opendb and brlock on top of ctdb api ==<br />
<br />
Whoever does this would pick either s3 or s4 initially, I don't think<br />
there is any point in doing them in parallel (we are bound to make the<br />
same sorts of mistakes on both if we did that).<br />
<br />
This will also feed a lot into the previous line item, working out the API.<br />
<br />
== Code CTDB api on top of dumb tdb ==<br />
<br />
This also feeds into the API discussion. It should be a very simple<br />
and dumb implementation, aiming to be used to allow the s3/s4<br />
implementation to have something to test against.<br />
<br />
== Prototype CTDB library on top of UDP/TCP ==<br />
<br />
(note: tridge is looking at this task)<br />
<br />
The initial implementation of the CTDB protocol will be on top of UDP/TCP<br />
<br />
Status: prototype work on this has begun. You can watch progress at http://build.samba.org/?tree=ctdb&function=Recent+Checkins<br />
<br />
== Setup standalone test environment ==<br />
<br />
This test environment is meant for non-clustered usage, instead emulating a cluster using<br />
IP on loopback. It will need to run multiple instances talking over 127.0.0.X interfaces. <br />
This will involve some shell scripting, plus some work on<br />
adding/removing nodes from the cluster. It might be easiest to add a<br />
CTDB protocol request asking a node to 'go quiet', then asking it to<br />
become active again later to simulate a node dying and coming back.<br />
<br />
== Code CTDB test suite ==<br />
(note: jim is looking at this one)<br />
<br />
This reflects the fact that I want this project to concentrate on<br />
building ctdb on tdb + messaging, and not concentrate on the "whole<br />
problem" involving Samba until later. We'll do a basic s3/s4 backend<br />
implementation to make sure the ideas can work, but I want the major<br />
testing effort to involve simple tests directly against the ctdb<br />
API. It will be so much easier to simulate exotic error conditions<br />
that way.<br />
<br />
== Flesh out recovery part of ctdb protocol == <br />
<br />
(note: tridge is looking at this one)<br />
<br />
I explained on the phone that I think the simplest recovery process<br />
will be something like this:<br />
<br />
* global sync and pick 'master' for recovery<br />
* every node sends all records from its local tdb to the LMASTER<br />
* master waits till all nodes say they are done<br />
* global sync and restart<br />
<br />
The recovery phase will need to very carefully cope with lots of<br />
corner cases, like when a node goes down during recovery.<br />
<br />
== Work out details for persistent tdbs ==<br />
<br />
this will need some more thought - it's not our top priority, but<br />
eventually the long lived databases will matter.<br />
<br />
== Wireshark dissector ==<br />
<br />
We'll need a wireshark dissector, but only once the protocol settles down a little.</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1811CTDB Project ibwrapper2006-11-30T16:08:36Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Infiniband Verbs API socket-like wrapper<br />
* Copyright (C) Peter Somogyi 2006<br />
*<br />
* This library is free software; you can redistribute it and/or<br />
* modify it under the terms of the GNU Lesser General Public<br />
* License as published by the Free Software Foundation; either<br />
* version 2 of the License, or (at your option) any later version.<br />
*<br />
* This library is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br />
* Lesser General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU Lesser General Public<br />
* License along with this library; if not, write to the Free Software<br />
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA<br />
*/<br />
<br />
/* Server communication state */<br />
typedef enum {<br />
IBWS_INIT = 0, /* ctx start - after ibw_init */<br />
IBWS_READY, /* after ibw_bind & ibw_listen */<br />
IBWS_CONNECT_REQUEST, /* after [IBWS_READY + incoming request] */<br />
/* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */<br />
IBWS_STOPPED, /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */<br />
IBWS_ERROR /* abnormal state; ibw_stop must be called after this */<br />
} ibw_state_ctx;<br />
<br />
#define IBW_NUM_FDS 3<br />
<br />
/* Connection state */<br />
typedef struct _ibw_ctx {<br />
int fds[IBW_NUM_FDS]; /* read fds */<br />
/* ibw_process_event must be _always_ invoked for _any_ of above set */<br />
/* when one of these fds is set after a select/poll */<br />
/* don't use the fd which is -1 (not used by underlying implementation) */<br />
<br />
void *ctx_userdata; /* see ibw_init */<br />
<br />
ibw_state_ctx state;<br />
void *internal;<br />
} ibw_ctx;<br />
<br />
typedef enum {<br />
IBWC_INIT = 0, /* conn start - internal state */<br />
IBWC_CONNECTED, /* after ibw_accept or ibw_connect */<br />
IBWC_DISCONNECTED, /* after ibw_disconnect */<br />
IBWC_ERROR<br />
} ibw_state_conn;<br />
<br />
typedef struct _ibw_conn {<br />
ibw_ctx *ctx;<br />
ibw_state_conn state;<br />
<br />
void *conn_userdata; /* see ibw_connect and ibw_accept */<br />
void *internal;<br />
} ibw_conn;<br />
<br />
typedef struct _ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
} ibw_initattr;<br />
<br />
/*<br />
* Callback function definition which should inform you about<br />
* connection state change<br />
* This callback is invoked from within ibw_process_event.<br />
* Both <conn> and <ctx> can be NULL if their state didn't change.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_connstate_fn_t)(ibw_ctx *ctx, ibw_conn *conn, void *ctx_userdata);<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* This callback is invoked from within ibw_process_event.<br />
* Return nonzero on error.<br />
*<br />
* Important: you mustn't store buf pointer for later use. Process its contents before returning.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* dev_name [default is the first one]<br />
* rx_depth [default is 500]<br />
* mtu [default is 1024]<br />
* ib_port [default is 1]<br />
*<br />
* Must be called _ONCE_ for each node.<br />
*<br />
* returns non-NULL on success<br />
*<br />
* talloc_free must be called for the result in IBWS_STOPPED;<br />
* it will close resources by destructor<br />
* connections(ibw_conn *) must have been closed prior to talloc_free<br />
*/<br />
ibw_ctx *ibw_init(ibw_initattr *attr, int nattr,<br />
ibw_connstate_fn_t ibw_connstate,<br />
void *ctx_userdata,<br />
ibw_receive_fn_t ibw_receive);<br />
<br />
/*<br />
* Must be called in states of (IBWS_ERROR, IBWS_READY, IBWS_CONNECT_REQUEST)<br />
*<br />
* It will send out disconnect requests and free up ibw_conn structures.<br />
* The ctx->state will transit to IBWS_STOPPED after every conn is disconnected.<br />
* During that time, you mustn't send/recv/disconnect any more.<br />
* Only after ctx->state=IBWS_STOPPED you can talloc_free the ctx.<br />
*/<br />
int ibw_stop(ibw_ctx *ctx);<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
<br />
/*<br />
* works like socket bind<br />
* needs a normal internet address here<br />
*<br />
* return 0 on success<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* works like socket listen<br />
* non-blocking<br />
* enables accepting incoming connections (after IBWS_READY)<br />
* (it doesn't touch ctx->state by itself)<br />
*<br />
* returns ctx->cm_fd<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int backlog);<br />
<br />
/*<br />
* works like socket accept<br />
* initializes a connection to a client<br />
* must be called when state=IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*<br />
* Important: you won't get remote IP address (only internal conn info)<br />
*/<br />
int ibw_accept(ibw_ctx *ctx, void *conn_userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* can be called within IBWS_READY|IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*/<br />
int ibw_connect(ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata);<br />
<br />
/*<br />
* Sends out a disconnect request.<br />
* You should process ctx->cm_fd after calling this function<br />
* and then process it with ibw_process_event normally<br />
* (until you get conn->state = IBWC_DISCONNECTED)<br />
*<br />
* You mustn't talloc_free <conn> after this.<br />
*/<br />
void ibw_disconnect(ibw_conn *conn);<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/* <br />
* Must be called in all cases after selecting/polling reports that one of ctx->fds is set.<br />
* @fd_index: value between [0..IBW_NUM_FDS-1] according to<br />
* which ctx->fds[fd_index] was set<br />
*/<br />
int ibw_process_event(ibw_ctx *ctx, int fd_index);<br />
<br />
/*<br />
* You have to use this buf to fill in before send.<br />
* It's just to avoid memcpy in ibw_send.<br />
* Use the same (buf, key) pair with ibw_send.<br />
*<br />
* Returns 0 on success.<br />
*/<br />
int ibw_alloc_send_buf(ibw_conn *conn, void **buf, void **key, int n);<br />
<br />
/*<br />
* Send the message in one<br />
* Can be invoked any number of times (should fit into buffers) and at any time<br />
* (in conn->state=IBWC_CONNECTED)<br />
*<br />
* You mustn't use (buf, key) any more for sending.<br />
*/<br />
int ibw_send(ibw_conn *conn, void *buf, void *key, int n);<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-NULL, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1810CTDB Project ibwrapper2006-11-28T12:37:52Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Infiniband Verbs API socket-like wrapper<br />
* Copyright (C) Peter Somogyi 2006<br />
*<br />
* This library is free software; you can redistribute it and/or<br />
* modify it under the terms of the GNU Lesser General Public<br />
* License as published by the Free Software Foundation; either<br />
* version 2 of the License, or (at your option) any later version.<br />
*<br />
* This library is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br />
* Lesser General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU Lesser General Public<br />
* License along with this library; if not, write to the Free Software<br />
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA<br />
*/<br />
<br />
<br />
/* Server communication state */<br />
typedef enum {<br />
IBWS_INIT = 0, /* ctx start - after ibw_init */<br />
IBWS_ADDR_RESOLVED, /* after bind - should proceed to IBWS_READY processing the next fd_cm event - internally */<br />
IBWS_READY, /* after bind 2.; call ibw_listen on this - once */<br />
IBWS_CONNECT_REQUEST, /* after [IBWS_READY + incoming request] */<br />
/* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */<br />
IBWS_STOPPED, /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */<br />
IBWS_ERROR /* abnormal state; ibw_stop must be called after this */<br />
} ibw_state_ctx;<br />
<br />
/* Connection state */<br />
typedef struct _ibw_ctx {<br />
int fd_events; /* read fd of verbs events */<br />
/* ibw_process_event must be _always_ invoked */<br />
/* when this fd is set after a select/poll */<br />
<br />
int fd_cm; /* read fd about a cm state change */<br />
/* call ibw_process_statechange after it's set */<br />
<br />
void *ctx_userdata; /* see ibw_init */<br />
<br />
ibw_state_ctx state;<br />
void *internal;<br />
} ibw_ctx;<br />
<br />
typedef enum {<br />
IBWC_INIT = 0, /* conn start - internal state */<br />
IBWC_CONNECTED, /* after ibw_accept or ibw_connect */<br />
IBWC_DISCONNECTED, /* after ibw_disconnect */<br />
IBWC_ERROR<br />
} ibw_state_conn;<br />
<br />
typedef struct _ibw_conn {<br />
ibw_ctx *ctx;<br />
ibw_state_conn state;<br />
void *conn_userdata; /* see ibw_connect and ibw_accept */<br />
void *internal;<br />
} ibw_conn;<br />
<br />
typedef struct _ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
} ibw_initattr;<br />
<br />
/*<br />
* Callback function definition which should inform you about<br />
* connection state change<br />
* This callback is invoked from within ibw_process_statechange.<br />
* Both <conn> and <ctx> can be NULL if their state didn't change.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_connstate_fn_t)(ibw_ctx *ctx, ibw_conn *conn, void *ctx_userdata);<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* This callback is invoked from within ibw_process_event.<br />
* Return nonzero on error.<br />
*<br />
* Important: you mustn't store buf pointer for later use. Process its contents before returning.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* dev_name [default is the first one]<br />
* rx_depth [default is 500]<br />
* mtu [default is 1024]<br />
* ib_port [default is 1]<br />
*<br />
* Must be called _ONCE_ for each node.<br />
*<br />
* returns non-NULL on success<br />
*<br />
* talloc_free must be called for the result in IBWS_STOPPED;<br />
* it will close resources by destructor<br />
* connections(ibw_conn *) must have been closed prior to talloc_free<br />
*/<br />
ibw_ctx *ibw_init(ibw_initattr *attr, int nattr,<br />
ibw_connstate_fn_t ibw_connstate,<br />
void *ctx_userdata,<br />
ibw_receive_fn_t ibw_receive);<br />
<br />
/*<br />
* Must be called in states of (IBWS_ERROR, IBWS_READY, IBWS_CONNECT_REQUEST)<br />
*<br />
* It will send out disconnect requests and free up ibw_conn structures.<br />
* The ctx->state will transit to IBWS_STOPPED after every conn is disconnected.<br />
* During that time, you mustn't send/recv/disconnect any more.<br />
* Only after ctx->state=IBWS_STOPPED you can talloc_free the ctx.<br />
*/<br />
int ibw_stop(ibw_ctx *ctx);<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
<br />
/*<br />
* works like socket bind<br />
* needs a normal internet address here<br />
*<br />
* return 0 on success<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* works like socket listen<br />
* non-blocking<br />
* enables accepting incoming connections (after IBWS_READY)<br />
* (it doesn't touch ctx->state by itself)<br />
*<br />
* returns ctx->fd_cm<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int backlog);<br />
<br />
/*<br />
* works like socket accept<br />
* initializes a connection to a client<br />
* must be called when state=IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*<br />
* Important: you won't get remote IP address (only internal conn info)<br />
*/<br />
int ibw_accept(ibw_ctx *ctx, void *conn_userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* can be called within IBWS_READY|IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*/<br />
int ibw_connect(ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata);<br />
<br />
/*<br />
* Sends out a disconnect request.<br />
* You should process ctx->fd_cm after calling this function<br />
* and then process it with ibw_process_event normally<br />
* (until you get conn->state = IBWC_DISCONNECTED)<br />
*<br />
* You mustn't talloc_free <conn> after this.<br />
*/<br />
void ibw_disconnect(ibw_conn *conn);<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/* <br />
* Must be called in all cases after selecting/polling for ctx->fd_events is set.<br />
*/<br />
int ibw_process_event(ibw_ctx *ctx);<br />
<br />
/* <br />
* Must be called in all cases after selecting/polling for ctx->fd_cm is set.<br />
*/<br />
int ibw_process_statechange(ibw_ctx *ctx);<br />
<br />
/*<br />
* You have to use this buf to fill in before send.<br />
* It's just to avoid memcpy in ibw_send.<br />
*<br />
* Returns 0 on success.<br />
*/<br />
int ibw_alloc_send_buf(ibw_conn *conn, void **buf, void **key, int n);<br />
<br />
/*<br />
* Send the message in one<br />
* Can be invoked any number of times (should fit into buffers) and at any time<br />
* (in conn->state=IBWC_CONNECTED)<br />
*<br />
* You mustn't use (buf, key) any more for sending.<br />
*/<br />
int ibw_send(ibw_conn *conn, void *buf, void *key, int n);<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-NULL, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1809CTDB Project ibwrapper2006-11-28T12:03:26Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Infiniband Verbs API socket-like wrapper<br />
* Copyright (C) Peter Somogyi 2006<br />
*<br />
* This library is free software; you can redistribute it and/or<br />
* modify it under the terms of the GNU Lesser General Public<br />
* License as published by the Free Software Foundation; either<br />
* version 2 of the License, or (at your option) any later version.<br />
*<br />
* This library is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br />
* Lesser General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU Lesser General Public<br />
* License along with this library; if not, write to the Free Software<br />
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA<br />
*/<br />
<br />
<br />
/* Server communication state */<br />
typedef enum {<br />
IBWS_INIT = 0, /* ctx start - after ibw_init */<br />
IBWS_ADDR_RESOLVED, /* after bind - should proceed to IBWS_READY processing the next fd_cm event - internally */<br />
IBWS_READY, /* after bind 2.; call ibw_listen on this - once */<br />
IBWS_CONNECT_REQUEST, /* after [IBWS_READY + incoming request] */<br />
/* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */<br />
IBWS_STOPPED, /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */<br />
IBWS_ERROR /* abnormal state; ibw_stop must be called after this */<br />
} ibw_state_ctx;<br />
<br />
/* Connection state */<br />
typedef struct _ibw_ctx {<br />
int fd_events; /* read fd of verbs events */<br />
/* ibw_process_event must be _always_ invoked */<br />
/* when this fd is set after a select/poll */<br />
<br />
int fd_cm; /* read fd about a cm state change */<br />
/* call ibw_process_statechange after it's set */<br />
<br />
void *ctx_userdata; /* see ibw_init */<br />
<br />
ibw_state_ctx state;<br />
void *internal;<br />
} ibw_ctx;<br />
<br />
typedef enum {<br />
IBWC_INIT = 0, /* conn start - internal state */<br />
IBWC_CONNECTED, /* after ibw_accept or ibw_connect */<br />
IBWC_DISCONNECTED, /* after ibw_disconnect */<br />
IBWC_ERROR<br />
} ibw_state_conn;<br />
<br />
typedef struct _ibw_conn {<br />
ibw_ctx *ctx;<br />
ibw_state_conn state;<br />
void *conn_userdata; /* see ibw_connect and ibw_accept */<br />
void *internal;<br />
} ibw_conn;<br />
<br />
typedef struct _ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
} ibw_initattr;<br />
<br />
/*<br />
* Callback function definition which should inform you about<br />
* connection state change<br />
* This callback is invoked from within ibw_process_statechange.<br />
* Both <conn> and <ctx> can be NULL if their state didn't change.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_connstate_fn_t)(ibw_ctx *ctx, ibw_conn *conn, void *ctx_userdata);<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* This callback is invoked from within ibw_process_event.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* dev_name [default is the first one]<br />
* rx_depth [default is 500]<br />
* mtu [default is 1024]<br />
* ib_port [default is 1]<br />
*<br />
* Must be called _ONCE_ for each node.<br />
*<br />
* returns non-NULL on success<br />
*<br />
* talloc_free must be called for the result in IBWS_STOPPED;<br />
* it will close resources by destructor<br />
* connections(ibw_conn *) must have been closed prior to talloc_free<br />
*/<br />
ibw_ctx *ibw_init(ibw_initattr *attr, int nattr,<br />
ibw_connstate_fn_t ibw_connstate,<br />
void *ctx_userdata,<br />
ibw_receive_fn_t ibw_receive);<br />
<br />
/*<br />
* Must be called in states of (IBWS_ERROR, IBWS_READY, IBWS_CONNECT_REQUEST)<br />
*<br />
* It will send out disconnect requests and free up ibw_conn structures.<br />
* The ctx->state will transit to IBWS_STOPPED after every conn is disconnected.<br />
* During that time, you mustn't send/recv/disconnect any more.<br />
* Only after ctx->state=IBWS_STOPPED you can talloc_free the ctx.<br />
*/<br />
int ibw_stop(ibw_ctx *ctx);<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
<br />
/*<br />
* works like socket bind<br />
* needs a normal internet address here<br />
*<br />
* return 0 on success<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* works like socket listen<br />
* non-blocking<br />
* enables accepting incoming connections (after IBWS_READY)<br />
* (it doesn't touch ctx->state by itself)<br />
*<br />
* returns ctx->fd_cm<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int backlog);<br />
<br />
/*<br />
* works like socket accept<br />
* initializes a connection to a client<br />
* must be called when state=IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*<br />
* Important: you won't get remote IP address (only internal conn info)<br />
*/<br />
int ibw_accept(ibw_ctx *ctx, void *conn_userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* can be called within IBWS_READY|IBWS_CONNECT_REQUEST<br />
*<br />
* returns 0 on success<br />
*<br />
* You have +1 waiting here: you will get ibw_conn (having the<br />
* same <conn_userdata> member) structure in ibw_connstate_fn_t.<br />
*/<br />
int ibw_connect(ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata);<br />
<br />
/*<br />
* Sends out a disconnect request.<br />
* You should process ctx->fd_cm after calling this function<br />
* and then process it with ibw_process_event normally<br />
* (until you get conn->state = IBWC_DISCONNECTED)<br />
*<br />
* You mustn't talloc_free <conn> after this.<br />
*/<br />
void ibw_disconnect(ibw_conn *conn);<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/* <br />
* Must be called in all cases after selecting/polling for ctx->fd_events is set.<br />
*/<br />
int ibw_process_event(ibw_ctx *ctx);<br />
<br />
/* <br />
* Must be called in all cases after selecting/polling for ctx->fd_cm is set.<br />
*/<br />
int ibw_process_statechange(ibw_ctx *ctx);<br />
<br />
/*<br />
* Send the message in one<br />
* Can be invoked any number of times (should fit into buffers) and at any time<br />
* (in conn->state=IBWC_CONNECTED)<br />
*/<br />
int ibw_send(ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-NULL, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1808CTDB Project ibwrapper2006-11-27T17:38:55Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Infiniband Verbs API socket-like wrapper<br />
* Copyright (C) Peter Somogyi 2006<br />
*<br />
* This library is free software; you can redistribute it and/or<br />
* modify it under the terms of the GNU Lesser General Public<br />
* License as published by the Free Software Foundation; either<br />
* version 2 of the License, or (at your option) any later version.<br />
*<br />
* This library is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br />
* Lesser General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU Lesser General Public<br />
* License along with this library; if not, write to the Free Software<br />
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA<br />
*/<br />
<br />
<br />
/* Server communication state */<br />
typedef enum {<br />
IBWS_INIT = 0, /* ctx start - after ibw_init */<br />
IBWS_ADDR_RESOLVED, /* after bind - should proceed to IBWS_READY processing the next fd_cm event - internally */<br />
IBWS_READY, /* after bind 2. */<br />
IBWS_CONNECT_REQUEST, /* after [IBWS_ROUTE_RESOLVED + listen]<br />
=> [IBWS_READY | STOPPED | ERROR] */<br />
IBWS_STOPPED, /* normal stop */<br />
IBWS_ERROR /* abnormal state */<br />
} ibw_state_ctx;<br />
<br />
/* Connection state */<br />
typedef struct _ibw_ctx {<br />
int fd_events; /* read fd of verbs events */<br />
/* ibw_process_event must be _always_ invoked */<br />
/* when this fd is set after a select/poll */<br />
<br />
int fd_cm; /* read fd about a cm state change */<br />
/* call ibw_process_statechange after it's set */<br />
<br />
void *ctx_userdata; /* see ibw_init */<br />
<br />
ibw_state_ctx state;<br />
void *internal;<br />
} ibw_ctx;<br />
<br />
typedef enum {<br />
IBWC_INIT = 0, /* conn start - internal state */<br />
IBWC_CONNECTED, /* after ibw_accept or ibw_connect */<br />
IBWC_DISCONNECTED, /* after ibw_disconnect */<br />
IBWC_ERROR<br />
} ibw_state_conn;<br />
<br />
typedef struct _ibw_conn {<br />
ibw_ctx *ctx;<br />
ibw_state_conn state;<br />
void *conn_userdata; /* see ibw_connect and ibw_accept */<br />
void *internal;<br />
} ibw_conn;<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-NULL, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();<br />
<br />
<br />
typedef struct _ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
} ibw_initattr;<br />
<br />
/*<br />
* Callback function definition which should inform you about<br />
* connection state change<br />
* invoked from within ibw_process_statechange<br />
* Both <conn> and <ctx> can be NULL if their state didn't change.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_connstate_fn_t)(ibw_ctx *ctx, ibw_conn *conn, void *ctx_userdata);<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* invoked from within ibw_process_event.<br />
* Return nonzero on error.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(ibw_conn *conn, void *buf, int n);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* dev_name [default is the first one]<br />
* rx_depth [default is 500]<br />
* mtu [default is 1024]<br />
* ib_port [default is 1]<br />
*<br />
* Must be called for each NODE _ONCE_<br />
*<br />
* returns non-NULL on success<br />
* talloc_free must be called for the result<br />
*/<br />
ibw_ctx *ibw_init(ibw_initattr *attr, int nattr,<br />
ibw_connstate_fn_t ibw_connstate,<br />
void *ctx_userdata,<br />
ibw_receive_fn_t ibw_receive);<br />
<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
<br />
/*<br />
* works like socket bind<br />
* needs a normal internet address here<br />
*<br />
* return 0 on success<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* works like socket listen<br />
* !!!: it's NON-blocking; use ctx->fd_cm + ibw_process_statechange +<br />
* ibw_connstate_fn_t to wait for a conn state change (=IBWS_READY)<br />
*<br />
* returns ctx->fd_cm<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int sockfd, int backlog);<br />
<br />
/*<br />
* works like socket accept<br />
* initializes a connection<br />
* Normally should be called from ibw_connstate_fn_t callback<br />
* when state=IBWS_CONNECT_REQUEST<br />
*<br />
* returns non-NULL on success<br />
*<br />
* conn_userdata: will be put into ibw_conn (see also ibw_connstate_fn_t)<br />
*/<br />
ibw_conn *ibw_accept(ibw_ctx *ctx, void *conn_userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* can be called within IBWS_READY (or IBWS_CONNECT_REQUEST)<br />
*<br />
* returns non-NULL on success<br />
*<br />
* conn_userdata: will be put into ibw_conn (see also ibw_connstate_fn_t)<br />
*/<br />
ibw_conn *ibw_connect(ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata);<br />
<br />
/*<br />
* Sends a disconnect request<br />
* You should process ctx->fd_cm after calling this function<br />
* and then process it with ibw_process_event normally<br />
* (until you get conn->state = IBWC_DISCONNECTED)<br />
*<br />
* You have to talloc_free <conn> after this (even in the callback).<br />
*/<br />
void ibw_disconnect(ibw_conn *conn);<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/* <br />
* !!! Must be called in all cases after selecting/polling for ctx->fd_events is set.<br />
*/<br />
int ibw_process_event(ibw_ctx *ctx);<br />
<br />
/* <br />
* !!! Must be called in all cases after selecting/polling for ctx->fd_cm is set.<br />
*/<br />
int ibw_process_statechange(ibw_ctx *ctx);<br />
<br />
/*<br />
* Send the message in one<br />
* Can be invoked any number of times (should fit into buffers) and at any time<br />
*/<br />
int ibw_send(ibw_conn *conn, void *buf, int n);</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project&diff=1772CTDB Project2006-11-20T15:24:56Z<p>Psomogyi@gamax.hu: /* Hardware acceleration */</p>
<hr />
<div>= CTDB Project =<br />
<br />
This project aims to produce an implementation of the CTDB protocol described in the [[Samba & Clustering]] page<br />
<br />
== Project Members ==<br />
<br />
Sven Oehme (project leader) <br />
Andrew Tridgell (technical lead)<br />
Alexander Bokovoy<br />
Aleksey Fedoseev<br />
Jim McDonough<br />
Peter Somogyi<br />
<br />
== Project Outline ==<br />
<br />
The initial work will focus on an implementation as part of tdb itself. Integration with the Samba source tree will happen at a later date. Work will probably happen in a bzr tree, but the details have not been worked out yet. Check back here for updates.<br />
<br />
= Project Tasks =<br />
<br />
== Hardware acceleration ==<br />
<br />
(note: Peter is looking at this one)<br />
<br />
We want CTDB to be very fast on hardware that supports fast messaging. In particular we are interested in good use of infiniband adapters, where we expect to get messaging latencies of the order of 3 to 5 microseconds. <br />
<br />
From discussions so far it looks like the 'verbs' API, perhaps with a modification to allow us to hook it into epoll(), will be the right choice. Basic information on this API is available at https://openib.org/tiki/tiki-index.php<br />
<br />
The basic features we want from a messaging API are:<br />
<br />
- low latency. We would like to get it down to just a few<br />
microseconds per message. Messages will vary in size, but typically<br />
be small (say between 64 and 512 bytes).<br />
<br />
- non-blocking. We would really like an API that hooks into poll, so<br />
we can use epoll(), poll() or select(). <br />
<br />
- If we can't have an API that hooks into poll() or epoll(), then a<br />
callback or signal based API would do if the overheads are small<br />
enough. In the same code we also need to be working on a unix<br />
domain socket (datagram socket) so we'd like the overhead of<br />
dealing with both the infiniband messages and the local datagrams<br />
to be low.<br />
<br />
- What we definitely don't want to use is an API that chews a lot of<br />
CPU. So we don't want to be spinning in userspace on a set of<br />
mapped registers in the hope that a message might come along. The<br />
CPU will be needed for other tasks. Using mapped registers for send<br />
would probably be fine, but we'd probably need some kernel mediated<br />
mechanism for receive unless you can suggest a way to avoid it.<br />
<br />
- ideally we'd have reliable delivery, or at least be told when<br />
delivery has failed on a send, but if that is too expensive then<br />
we'll do our own reliable delivery mechanism.<br />
<br />
- we need to be able to add/remove nodes from the cluster. The Samba<br />
clustering code will have its own recovery protocol.<br />
<br />
- a 'message' like API would suit us better than a 'remote DMA'<br />
style API, unless the remote DMA API is significantly more<br />
efficient. Ring buffers would be fine.<br />
<br />
An abstract interface can be found here: [[CTDB_Project_ibwrapper]]<br />
<br />
TODOs regarding this interface:<br />
<br />
- verify how connection build-up suits us - or shall I use CM instead? - analyzing libibcm for this...<br />
<br />
- enumerate nodes - I'm gathering info for this<br />
<br />
== Flesh out CTDB API ==<br />
<br />
(note: Alexander and Aleksey are looking at this)<br />
<br />
By this I mean the C api in the "Clustered TDB API" section of the<br />
wiki page. The API as given there now is missing some pieces, and I<br />
think can be greatly improved.<br />
<br />
This is likely to feed back into the CTDB protocol description as<br />
well. Ideally we'd get rid of these calls:<br />
<br />
CTDB_REQ_FETCH_LOCKED<br />
CTDB_REPLY_FETCH_LOCKED<br />
CTDB_REQ_UNLOCK<br />
CTDB_REPLY_UNLOCK<br />
<br />
assuming we can demonstrate they aren't needed. I also think we can<br />
combine the CTDB_REQ_CONDITIONAL_APPEND and the CTDB_REQ_FETCH call<br />
into a single CTDB_REQ_REQUEST call which takes a key, a blob of data<br />
and a condition ID as input, and returns a blob of data and a status<br />
code as output. For a fetch call the input blob of data would be zero<br />
length.<br />
<br />
== Code s3/s4 opendb and brlock on top of ctdb api ==<br />
<br />
Whoever does this would pick either s3 or s4 initially, I don't think<br />
there is any point in doing them in parallel (we are bound to make the<br />
same sorts of mistakes on both if we did that).<br />
<br />
This will also feed a lot into the previous line item, working out the API.<br />
<br />
== Code CTDB api on top of dumb tdb ==<br />
<br />
This also feeds into the API discussion. It should be a very simple<br />
and dumb implementation, aiming to be used to allow the s3/s4<br />
implementation to have something to test against.<br />
<br />
== Prototype CTDB library on top of UDP/TCP ==<br />
<br />
(note: tridge is looking at this task)<br />
<br />
The initial implementation of the CTDB protocol will be on top of UDP/TCP<br />
<br />
Status: prototype work on this has begun. You can watch progress at http://build.samba.org/?tree=ctdb&function=Recent+Checkins<br />
<br />
== Setup standalone test environment ==<br />
<br />
This test environment is meant for non-clustered usage, instead emulating a cluster using<br />
IP on loopback. It will need to run multiple instances talking over 127.0.0.X interfaces. <br />
This will involve some shell scripting, plus some work on<br />
adding/removing nodes from the cluster. It might be easiest to add a<br />
CTDB protocol request asking a node to 'go quiet', then asking it to<br />
become active again later to simulate a node dying and coming back.<br />
<br />
== Code CTDB test suite ==<br />
(note: jim is looking at this one)<br />
<br />
This reflects the fact that I want this project to concentrate on<br />
building ctdb on tdb + messaging, and not concentrate on the "whole<br />
problem" involving Samba until later. We'll do a basic s3/s4 backend<br />
implementation to make sure the ideas can work, but I want the major<br />
testing effort to involve simple tests directly against the ctdb<br />
API. It will be so much easier to simulate exotic error conditions<br />
that way.<br />
<br />
== Flesh out recovery part of ctdb protocol == <br />
<br />
(note: tridge is looking at this one)<br />
<br />
I explained on the phone that I think the simplest recovery process<br />
will be something like this:<br />
<br />
- global sync and pick 'master' for recovery<br />
- every node sends all records from its local tdb to the LMASTER<br />
- master waits till all nodes say they are done<br />
- global sync and restart<br />
<br />
The recovery phase will need to very carefully cope with lots of<br />
corner cases, like when a node goes down during recovery.<br />
<br />
== Work out details for persistent tdbs ==<br />
<br />
this will need some more thought - it's not our top priority, but<br />
eventually the long lived databases will matter.<br />
<br />
== Wireshark dissector ==<br />
<br />
We'll need a wireshark dissector, but only once the protocol settles down a little.</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1771CTDB Project ibwrapper2006-11-20T13:42:20Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Infiniband Verbs API socket-like wrapper<br />
* Copyright (C) Peter Somogyi 2006<br />
*<br />
* This library is free software; you can redistribute it and/or<br />
* modify it under the terms of the GNU Lesser General Public<br />
* License as published by the Free Software Foundation; either<br />
* version 2 of the License, or (at your option) any later version.<br />
*<br />
* This library is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br />
* Lesser General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU Lesser General Public<br />
* License along with this library; if not, write to the Free Software<br />
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA<br />
*/<br />
<br />
/*<br />
* Basically, traditional socket is chosen for exchanging<br />
* infiniband/verbs-specific info when connecting a client.<br />
*<br />
* The socket-like functions call the real socket functions, with some<br />
* ib wrapping and error and state checking. Must be used "normally" ...<br />
*<br />
* However, ibw_send and the ibw_receive_fn_t callback use real infiniband/verbs calls only.<br />
*/<br />
<br />
typedef struct _ibw_ctx {<br />
int fd; /* read fd of verbs events */<br />
/* ibw_process_event must be _always_ invoked */<br />
/* when this fd is set after a select/poll */<br />
void *internal;<br />
} ibw_ctx;<br />
<br />
typedef struct _ibw_conn {<br />
ibw_ctx *ctx;<br />
void *userdata; /* see also ibw_connect and ibw_accept */<br />
void *internal;<br />
} ibw_conn;<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-zero, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();<br />
<br />
<br />
typedef struct _ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
} ibw_initattr;<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* It's called from within ibw_process_event.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(ibw_conn *conn, void *buf, int nsize);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* dev_name [default is the first one]<br />
* rx_depth [default is 500]<br />
* mtu [default is 1024]<br />
* ib_port [default is 1]<br />
*<br />
* Must be called for each NODE _ONCE_<br />
*<br />
* returns non-NULL on success<br />
* talloc_free must be called for the result<br />
*/<br />
ibw_ctx *ibw_init(ibw_initattr *attr, int nattr, ibw_receive_fn_t ibw_receive);<br />
<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
/* TODO: enum nodes + verify this connection method */<br />
<br />
/*<br />
* Call as the normal one (see man page)<br />
* returns a sockfd as the normal one<br />
*/<br />
int ibw_socket(ibw_ctx *ctx, int domain, int type, int protocol);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* return is a real socket fd<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* sockfd here is a real sockfd<br />
* see also the man page<br />
* !!!: it's also blocking<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int sockfd, int backlog);<br />
<br />
/*<br />
* sockfd here is a real sockfd<br />
* see also the man page<br />
* !!!:<br />
* additionally, the server exchanges ib-specific<br />
* properties (lid, qpn, psn) here with the client<br />
* + initializes a connection<br />
*<br />
* returns non-NULL on success<br />
* talloc_free must be called for the result (which calls close)<br />
*<br />
* userdata: will be put into ibw_conn (see also ibw_receive_fn_t)<br />
*/<br />
ibw_conn *ibw_accept(ibw_ctx *ctx, int sockfd, struct sockaddr_in *cli_addr, void *userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
*<br />
* returns non-NULL on success<br />
* talloc_free must be called for the result (which calls close)<br />
*<br />
* userdata: will be put into ibw_conn (see also ibw_receive_fn_t)<br />
*/<br />
ibw_conn *ibw_connect(ibw_ctx *ctx, int sockfd, struct sockaddr_in *serv_addr, void *userdata);<br />
<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/* <br />
* !!! Must always be called when select/poll reports ctx->fd as set.<br />
*/<br />
int ibw_process_event(ibw_ctx *ctx);<br />
<br />
<br />
/*<br />
* Sends the message in one piece<br />
* Can be invoked any number of times (the message should fit into the buffers) and at any time<br />
*/<br />
int ibw_send(ibw_conn *connctx, void *buf, int n);</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project&diff=1770CTDB Project2006-11-20T13:40:32Z<p>Psomogyi@gamax.hu: /* Hardware acceleration */</p>
<hr />
<div>= CTDB Project =<br />
<br />
This project aims to produce an implementation of the CTDB protocol described in the [[Samba & Clustering]] page<br />
<br />
== Project Members ==<br />
<br />
Sven Oehme (project leader) <br />
Andrew Tridgell (technical lead)<br />
Alexander Bokovoy<br />
Aleksey Fedoseev<br />
Jim McDonough<br />
Peter Somogyi<br />
<br />
== Project Outline ==<br />
<br />
The initial work will focus on an implementation as part of tdb itself. Integration with the Samba source tree will happen at a later date. Work will probably happen in a bzr tree, but the details have not been worked out yet. Check back here for updates.<br />
<br />
= Project Tasks =<br />
<br />
== Hardware acceleration ==<br />
<br />
(note: Peter is looking at this one)<br />
<br />
We want CTDB to be very fast on hardware that supports fast messaging. In particular we are interested in good use of infiniband adapters, where we expect to get messaging latencies of the order of 3 to 5 microseconds. <br />
<br />
From discussions so far it looks like the 'verbs' API, perhaps with a modification to allow us to hook it into epoll(), will be the right choice. Basic information on this API is available at https://openib.org/tiki/tiki-index.php<br />
<br />
The basic features we want from a messaging API are:<br />
<br />
- low latency. We would like to get it down to just a few<br />
microseconds per message. Messages will vary in size, but typically<br />
be small (say between 64 and 512 bytes).<br />
<br />
- non-blocking. We would really like an API that hooks into poll, so<br />
we can use epoll(), poll() or select(). <br />
<br />
- If we can't have an API that hooks into poll() or epoll(), then a<br />
callback or signal based API would do if the overheads are small<br />
enough. In the same code we also need to be working on a unix<br />
domain socket (datagram socket) so we'd like the overhead of<br />
dealing with both the infiniband messages and the local datagrams<br />
to be low.<br />
<br />
- What we definitely don't want to use is an API that chews a lot of<br />
CPU. So we don't want to be spinning in userspace on a set of<br />
mapped registers in the hope that a message might come along. The<br />
CPU will be needed for other tasks. Using mapped registers for send<br />
would probably be fine, but we'd probably need some kernel mediated<br />
mechanism for receive unless you can suggest a way to avoid it.<br />
<br />
- ideally we'd have reliable delivery, or at least be told when<br />
delivery has failed on a send, but if that is too expensive then<br />
we'll do our own reliable delivery mechanism.<br />
<br />
- we need to be able to add/remove nodes from the cluster. The Samba<br />
clustering code will have its own recovery protocol.<br />
<br />
- a 'message' like API would suit us better than a 'remote DMA'<br />
style API, unless the remote DMA API is significantly more<br />
efficient. Ring buffers would be fine.<br />
<br />
An abstract interface can be found here: [[CTDB_Project_ibwrapper]]<br />
<br />
TODOs regarding this interface:<br />
<br />
- verify how connection build-up suits us<br />
<br />
- enumerate nodes<br />
<br />
== Flesh out CTDB API ==<br />
<br />
(note: Alexander and Aleksey are looking at this)<br />
<br />
By this I mean the C api in the "Clustered TDB API" section of the<br />
wiki page. The API as given there now is missing some pieces, and I<br />
think can be greatly improved.<br />
<br />
This is likely to feed back into the CTDB protocol description as<br />
well. Ideally we'd get rid of these calls:<br />
<br />
CTDB_REQ_FETCH_LOCKED<br />
CTDB_REPLY_FETCH_LOCKED<br />
CTDB_REQ_UNLOCK<br />
CTDB_REPLY_UNLOCK<br />
<br />
assuming we can demonstrate they aren't needed. I also think we can<br />
combine the CTDB_REQ_CONDITIONAL_APPEND and the CTDB_REQ_FETCH call<br />
into a single CTDB_REQ_REQUEST call which takes a key, a blob of data<br />
and a condition ID as input, and returns a blob of data and a status<br />
code as output. For a fetch call the input blob of data would be zero<br />
length.<br />
<br />
== Code s3/s4 opendb and brlock on top of ctdb api ==<br />
<br />
Whoever does this would pick either s3 or s4 initially, I don't think<br />
there is any point in doing them in parallel (we are bound to make the<br />
same sorts of mistakes on both if we did that).<br />
<br />
This will also feed a lot into the previous line item, working out the API.<br />
<br />
== Code CTDB api on top of dumb tdb ==<br />
<br />
This also feeds into the API discussion. It should be a very simple<br />
and dumb implementation, aiming to be used to allow the s3/s4<br />
implementation to have something to test against.<br />
<br />
== Prototype CTDB library on top of UDP/TCP ==<br />
<br />
(note: tridge is looking at this task)<br />
<br />
The initial implementation of the CTDB protocol will be on top of UDP/TCP<br />
<br />
Status: prototype work on this has begun. You can watch progress at http://build.samba.org/?tree=ctdb&function=Recent+Checkins<br />
<br />
== Setup standalone test environment ==<br />
<br />
This test environment is meant for non-clustered usage, instead emulating a cluster using<br />
IP on loopback. It will need to run multiple instances talking over 127.0.0.X interfaces. <br />
This will involve some shell scripting, plus some work on<br />
adding/removing nodes from the cluster. It might be easiest to add a<br />
CTDB protocol request asking a node to 'go quiet', then asking it to<br />
become active again later to simulate a node dying and coming back.<br />
<br />
== Code CTDB test suite ==<br />
(note: jim is looking at this one)<br />
<br />
This reflects the fact that I want this project to concentrate on<br />
building ctdb on tdb + messaging, and not concentrate on the "whole<br />
problem" involving Samba until later. We'll do a basic s3/s4 backend<br />
implementation to make sure the ideas can work, but I want the major<br />
testing effort to involve simple tests directly against the ctdb<br />
API. It will be so much easier to simulate exotic error conditions<br />
that way.<br />
<br />
== Flesh out recovery part of ctdb protocol == <br />
<br />
(note: tridge is looking at this one)<br />
<br />
I explained on the phone that I think the simplest recovery process<br />
will be something like this:<br />
<br />
- global sync and pick 'master' for recovery<br />
- every node sends all records from its local tdb to the LMASTER<br />
- master waits till all nodes say they are done<br />
- global sync and restart<br />
<br />
The recovery phase will need to very carefully cope with lots of<br />
corner cases, like when a node goes down during recovery.<br />
<br />
== Work out details for persistent tdbs ==<br />
<br />
this will need some more thought - it's not our top priority, but<br />
eventually the long lived databases will matter.<br />
<br />
== Wireshark dissector ==<br />
<br />
We'll need a wireshark dissector, but only once the protocol settles down a little.</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1769CTDB Project ibwrapper2006-11-20T13:35:25Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Infiniband Verbs API socket-like wrapper<br />
* Copyright (C) Peter Somogyi 2006<br />
*<br />
* This library is free software; you can redistribute it and/or<br />
* modify it under the terms of the GNU Lesser General Public<br />
* License as published by the Free Software Foundation; either<br />
* version 2 of the License, or (at your option) any later version.<br />
*<br />
* This library is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br />
* Lesser General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU Lesser General Public<br />
* License along with this library; if not, write to the Free Software<br />
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA<br />
*/<br />
<br />
/*<br />
* Basically, traditional socket is chosen for exchanging<br />
* infiniband/verbs-specific info when connecting a client.<br />
*<br />
* The socket-like functions call the real socket functions, with some<br />
* ib wrapping and error and state checking. Must be used "normally" ...<br />
*<br />
* However, ibw_send and the ibw_receive_fn_t callback use real infiniband/verbs calls only.<br />
*/<br />
<br />
typedef struct _ibw_ctx {<br />
int fd; /* read fd of verbs event, ibw_can_read must be invoked after that */<br />
/* ibw_process_event must be _always_ invoked */<br />
/* when this fd is set after a select/poll */<br />
void *internal;<br />
} ibw_ctx;<br />
<br />
typedef struct _ibw_conn {<br />
ibw_ctx *ctx;<br />
void *userdata; /* see also ibw_connect and ibw_accept */<br />
void *internal;<br />
} ibw_conn;<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-zero, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();<br />
<br />
<br />
typedef struct _ibw_initattr {<br />
const char *name;<br />
const char *value;<br />
} ibw_initattr;<br />
<br />
/*<br />
* Callback function definition which should process incoming packets<br />
* It's called from within ibw_process_event.<br />
*/<br />
typedef int (*ibw_receive_fn_t)(ibw_conn *conn, void *buf, int nsize);<br />
<br />
/*<br />
* settings: array of (name, value) pairs<br />
* where name is one of:<br />
* dev_name [default is the first one]<br />
* rx_depth [default is 500]<br />
* mtu [default is 1024]<br />
* ib_port [default is 1]<br />
*<br />
* Must be called for each NODE _ONCE_<br />
*<br />
* returns non-NULL on success<br />
* talloc_free must be called for the result<br />
*/<br />
ibw_ctx *ibw_init(ibw_initattr *attr, int nattr, ibw_receive_fn_t ibw_receive);<br />
<br />
<br />
/*************** connection initiation - like stream sockets *****/<br />
/* TODO: enum nodes + verify this connection method */<br />
<br />
/*<br />
* Call as the normal one (see man page)<br />
* returns a sockfd as the normal one<br />
*/<br />
int ibw_socket(ibw_ctx *ctx, int domain, int type, int protocol);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* return is a real socket fd<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* sockfd here is a real sockfd<br />
* see also the man page<br />
* !!!: it's also blocking<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int sockfd, int backlog);<br />
<br />
/*<br />
* sockfd here is a real sockfd<br />
* see also the man page<br />
* !!!:<br />
* additionally, the server exchanges ib-specific<br />
* properties (lid, qpn, psn) here with the client<br />
* + initializes a connection<br />
*<br />
* returns non-NULL on success<br />
* talloc_free must be called for the result (which calls close)<br />
*<br />
* userdata: will be put into ibw_conn (see also ibw_receive_fn_t)<br />
*/<br />
ibw_conn *ibw_accept(ibw_ctx *ctx, int sockfd, struct sockaddr_in *cli_addr, void *userdata);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
*<br />
* returns non-NULL on success<br />
* talloc_free must be called for the result (which calls close)<br />
*<br />
* userdata: will be put into ibw_conn (see also ibw_receive_fn_t)<br />
*/<br />
ibw_conn *ibw_connect(ibw_ctx *ctx, int sockfd, struct sockaddr_in *serv_addr, void *userdata);<br />
<br />
<br />
/************ Infiniband specific event loop wrapping ******************/<br />
<br />
/* <br />
* !!! Must always be called when select/poll reports ctx->fd as set.<br />
*/<br />
int ibw_process_event(ibw_ctx *ctx);<br />
<br />
<br />
/*<br />
* Sends the message in one piece<br />
* Can be invoked any number of times (the message should fit into the buffers) and at any time<br />
*/<br />
int ibw_send(ibw_conn *connctx, void *buf, int n);</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1766CTDB Project ibwrapper2006-11-17T15:25:36Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Infiniband Verbs API socket-like wrapper<br />
* Copyright (C) Peter Somogyi 2006<br />
*<br />
* This library is free software; you can redistribute it and/or<br />
* modify it under the terms of the GNU Lesser General Public<br />
* License as published by the Free Software Foundation; either<br />
* version 2 of the License, or (at your option) any later version.<br />
*<br />
* This library is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br />
* Lesser General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU Lesser General Public<br />
* License along with this library; if not, write to the Free Software<br />
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA<br />
*/<br />
<br />
/*<br />
* Basically, traditional socket is chosen for exchanging<br />
* infiniband/verbs-specific info when connecting a client.<br />
*<br />
* The socket-like functions call the real socket functions, with some<br />
* ib wrapping and error and state checking. Must be used "normally" ...<br />
*<br />
* However, ibw_write and ibw_read use real infiniband/verbs calls only.<br />
*/<br />
<br />
typedef struct _ibw_ctx {<br />
void *internal;<br />
} ibw_ctx;<br />
<br />
typedef struct _ibw_conn {<br />
int fd; /* normal fd */<br />
ibw_ctx *ctx;<br />
void *internal;<br />
} ibw_conn;<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-zero, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();<br />
<br />
/*<br />
* settings: tabbed text of <name>\t<value>\n<br />
* where name is one of:<br />
* dev_name [default is the first one]<br />
* rx_depth [default is 500]<br />
* mtu [default is 1024]<br />
* ib_port [default is 1]<br />
*<br />
* Must be called at each NODE _ONCE_<br />
* Starts a new process (for ib event loop).<br />
*<br />
* returns non-NULL on success<br />
* talloc_free must be called for the result<br />
*/<br />
ibw_ctx *ibw_init(const char *settings);<br />
<br />
/*<br />
* Call as the normal one (see man page)<br />
* returns a sockfd as the normal one<br />
*/<br />
int ibw_socket(ibw_ctx *ctx, int domain, int type, int protocol);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* return is a real socket fd<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* sockfd here is a real sockfd<br />
* see also the man page<br />
* !!!: it's also blocking<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int sockfd, int backlog);<br />
<br />
/*<br />
* sockfd here is a real sockfd<br />
* see also the man page<br />
* !!!:<br />
* additionally, the server exchanges ib-specific<br />
* properties (lid, qpn, psn) here with the client<br />
* + initializes a connection<br />
*<br />
* returns non-NULL on success<br />
* talloc_free must be called for the result (which calls close)<br />
*/<br />
ibw_conn *ibw_accept(ibw_ctx *ctx, int sockfd, struct sockaddr_in *cli_addr);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
*<br />
* returns non-NULL on success<br />
* talloc_free must be called for the result (which calls close)<br />
*/<br />
ibw_conn *ibw_connect(ibw_ctx *ctx, int sockfd, struct sockaddr_in *serv_addr);<br />
<br />
/*<br />
* Some prefetching will be performed here (to get the msg in one...)<br />
*/<br />
int ibw_read(ibw_conn *connctx, void *buf, int n);<br />
<br />
/*<br />
* Try to send the message in one (performance reason)<br />
*/<br />
int ibw_write(ibw_conn *connctx, void *buf, int n);</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project&diff=1764CTDB Project2006-11-17T13:36:41Z<p>Psomogyi@gamax.hu: /* Hardware acceleration */</p>
<hr />
<div>= CTDB Project =<br />
<br />
This project aims to produce an implementation of the CTDB protocol described in the [[Samba & Clustering]] page<br />
<br />
== Project Members ==<br />
<br />
Sven Oehme (project leader) <br />
Andrew Tridgell (technical lead)<br />
Alexander Bokovoy<br />
Aleksey Fedoseev<br />
Jim McDonough<br />
Peter Somogyi<br />
<br />
== Project Outline ==<br />
<br />
The initial work will focus on an implementation as part of tdb itself. Integration with the Samba source tree will happen at a later date. Work will probably happen in a bzr tree, but the details have not been worked out yet. Check back here for updates.<br />
<br />
= Project Tasks =<br />
<br />
== Hardware acceleration ==<br />
<br />
(note: Peter is looking at this one)<br />
<br />
We want CTDB to be very fast on hardware that supports fast messaging. In particular we are interested in good use of infiniband adapters, where we expect to get messaging latencies of the order of 3 to 5 microseconds. <br />
<br />
From discussions so far it looks like the 'verbs' API, perhaps with a modification to allow us to hook it into epoll(), will be the right choice. Basic information on this API is available at https://openib.org/tiki/tiki-index.php<br />
<br />
The basic features we want from a messaging API are:<br />
<br />
- low latency. We would like to get it down to just a few<br />
microseconds per message. Messages will vary in size, but typically<br />
be small (say between 64 and 512 bytes).<br />
<br />
- non-blocking. We would really like an API that hooks into poll, so<br />
we can use epoll(), poll() or select(). <br />
<br />
- If we can't have an API that hooks into poll() or epoll(), then a<br />
callback or signal based API would do if the overheads are small<br />
enough. In the same code we also need to be working on a unix<br />
domain socket (datagram socket) so we'd like the overhead of<br />
dealing with both the infiniband messages and the local datagrams<br />
to be low.<br />
<br />
- What we definitely don't want to use is an API that chews a lot of<br />
CPU. So we don't want to be spinning in userspace on a set of<br />
mapped registers in the hope that a message might come along. The<br />
CPU will be needed for other tasks. Using mapped registers for send<br />
would probably be fine, but we'd probably need some kernel mediated<br />
mechanism for receive unless you can suggest a way to avoid it.<br />
<br />
- ideally we'd have reliable delivery, or at least be told when<br />
delivery has failed on a send, but if that is too expensive then<br />
we'll do our own reliable delivery mechanism.<br />
<br />
- we need to be able to add/remove nodes from the cluster. The Samba<br />
clustering code will have its own recovery protocol.<br />
<br />
- a 'message' like API would suit us better than a 'remote DMA'<br />
style API, unless the remote DMA API is significantly more<br />
efficient. Ring buffers would be fine.<br />
<br />
A socket-like wrapper interface is the current aim: [[CTDB_Project_ibwrapper]]<br />
<br />
== Flesh out CTDB API ==<br />
<br />
(note: Alexander and Aleksey are looking at this)<br />
<br />
By this I mean the C api in the "Clustered TDB API" section of the<br />
wiki page. The API as given there now is missing some pieces, and I<br />
think can be greatly improved.<br />
<br />
This is likely to feed back into the CTDB protocol description as<br />
well. Ideally we'd get rid of these calls:<br />
<br />
CTDB_REQ_FETCH_LOCKED<br />
CTDB_REPLY_FETCH_LOCKED<br />
CTDB_REQ_UNLOCK<br />
CTDB_REPLY_UNLOCK<br />
<br />
assuming we can demonstrate they aren't needed. I also think we can<br />
combine the CTDB_REQ_CONDITIONAL_APPEND and the CTDB_REQ_FETCH call<br />
into a single CTDB_REQ_REQUEST call which takes a key, a blob of data<br />
and a condition ID as input, and returns a blob of data and a status<br />
code as output. For a fetch call the input blob of data would be zero<br />
length.<br />
<br />
== Code s3/s4 opendb and brlock on top of ctdb api ==<br />
<br />
Whoever does this would pick either s3 or s4 initially, I don't think<br />
there is any point in doing them in parallel (we are bound to make the<br />
same sorts of mistakes on both if we did that).<br />
<br />
This will also feed a lot into the previous line item, working out the API.<br />
<br />
== Code CTDB api on top of dumb tdb ==<br />
<br />
This also feeds into the API discussion. It should be a very simple<br />
and dumb implementation, aiming to be used to allow the s3/s4<br />
implementation to have something to test against.<br />
<br />
== Prototype CTDB library on top of UDP/TCP ==<br />
<br />
(note: tridge is looking at this task)<br />
<br />
The initial implementation of the CTDB protocol will be on top of UDP/TCP<br />
<br />
== Setup standalone test environment ==<br />
<br />
This test environment is meant for non-clustered usage, instead emulating a cluster using<br />
IP on loopback. It will need to run multiple instances talking over 127.0.0.X interfaces. <br />
This will involve some shell scripting, plus some work on<br />
adding/removing nodes from the cluster. It might be easiest to add a<br />
CTDB protocol request asking a node to 'go quiet', then asking it to<br />
become active again later to simulate a node dying and coming back.<br />
<br />
== Code CTDB test suite ==<br />
(note: jim is looking at this one)<br />
<br />
This reflects the fact that I want this project to concentrate on<br />
building ctdb on tdb + messaging, and not concentrate on the "whole<br />
problem" involving Samba until later. We'll do a basic s3/s4 backend<br />
implementation to make sure the ideas can work, but I want the major<br />
testing effort to involve simple tests directly against the ctdb<br />
API. It will be so much easier to simulate exotic error conditions<br />
that way.<br />
<br />
== Flesh out recovery part of ctdb protocol == <br />
<br />
(note: tridge is looking at this one)<br />
<br />
I explained on the phone that I think the simplest recovery process<br />
will be something like this:<br />
<br />
- global sync and pick 'master' for recovery<br />
- every node sends all records from its local tdb to the LMASTER<br />
- master waits till all nodes say they are done<br />
- global sync and restart<br />
<br />
The recovery phase will need to very carefully cope with lots of<br />
corner cases, like when a node goes down during recovery.<br />
<br />
== Work out details for persistent tdbs ==<br />
<br />
this will need some more thought - it's not our top priority, but<br />
eventually the long-lived databases will matter.<br />
<br />
== Wireshark dissector ==<br />
<br />
We'll need a wireshark dissector, but only once the protocol settles down a little.</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project_ibwrapper&diff=1763CTDB Project ibwrapper2006-11-17T13:35:44Z<p>Psomogyi@gamax.hu: </p>
<hr />
<div>File ibwrapper.h:<br />
<br />
/*<br />
* Infiniband Verbs API socket-like wrapper<br />
* Copyright (C) Peter Somogyi 2006<br />
*<br />
* This library is free software; you can redistribute it and/or<br />
* modify it under the terms of the GNU Lesser General Public<br />
* License as published by the Free Software Foundation; either<br />
* version 2 of the License, or (at your option) any later version.<br />
*<br />
* This library is distributed in the hope that it will be useful,<br />
* but WITHOUT ANY WARRANTY; without even the implied warranty of<br />
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br />
* Lesser General Public License for more details.<br />
*<br />
* You should have received a copy of the GNU Lesser General Public<br />
* License along with this library; if not, write to the Free Software<br />
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA<br />
*/<br />
<br />
/*<br />
* Basically, traditional socket is chosen for exchanging<br />
* infiniband/verbs-specific info when connecting a client.<br />
*<br />
* The socket-like functions call the real functions, with some<br />
* ib wrapping and error and state checking. Must be used "normally" ...<br />
*/<br />
<br />
typedef struct _ibw_ctx {<br />
void *internal;<br />
} ibw_ctx;<br />
<br />
typedef struct _ibw_conn {<br />
int *pfd; /* !!! <-- use this fd to wait for an event + ibw_can_read after then */<br />
/* I'm not sure how often this can change... */<br />
ibw_ctx *ctx;<br />
void *internal;<br />
} ibw_conn;<br />
<br />
/*<br />
* Retrieves the last error<br />
* result: always non-NULL, mustn't be freed (static)<br />
*/<br />
const char *ibw_getLastError();<br />
<br />
/*<br />
* settings: tabbed text of <name>\t<value>\n<br />
* where name is one of:<br />
* dev_name [default is the first one]<br />
* rx_depth [default is 500]<br />
* mtu [default is 1024]<br />
* ib_port [default is 1]<br />
*<br />
* must be set at client & server<br />
*<br />
* returns non-NULL on success<br />
*/<br />
ibw_ctx *ibw_init(const char *settings);<br />
<br />
void ibw_done(ibw_ctx *ctx);<br />
<br />
/*<br />
* Call as the normal one (see man page)<br />
* returns a sockfd as the normal one<br />
*/<br />
int ibw_socket(ibw_ctx *ctx, int domain, int type, int protocol);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
* return is a real socket fd<br />
*/<br />
int ibw_bind(ibw_ctx *ctx, struct sockaddr_in *my_addr);<br />
<br />
/*<br />
* sockfd here is a real sockfd<br />
* see also the man page<br />
* !!!: it's also blocking<br />
*/<br />
int ibw_listen(ibw_ctx *ctx, int sockfd, int backlog);<br />
<br />
/*<br />
* sockfd here is a real sockfd<br />
* see also the man page<br />
* !!!:<br />
* additionally, the server exchanges ib-specific<br />
* properties (lid, qpn, psn) here with the client<br />
* + initializes a connection<br />
*/<br />
ibw_conn *ibw_accept(ibw_ctx *ctx, int sockfd, struct sockaddr_in *cli_addr);<br />
<br />
/*<br />
* Needs a normal internet address here<br />
*/<br />
ibw_conn *ibw_connect(ibw_ctx *ctx, int sockfd, struct sockaddr_in *serv_addr);<br />
<br />
/*<br />
* !!! Must be called after waiting on the fd pointed to by ibw_conn->pfd<br />
* to see whether we really got the correct event for reading.<br />
* ibw_read mustn't be called if we mustn't read.<br />
*/<br />
int ibw_can_read(ibw_conn *connctx);<br />
<br />
/*<br />
* Some prefetching will be performed here (to get the msg in one...)<br />
*/<br />
int ibw_read(ibw_conn *connctx, void *buf, int n);<br />
<br />
/*<br />
* I'm not sure here what happens if the CQ is full... TODO: check<br />
* Also try to send the message in _one_<br />
*/<br />
int ibw_write(ibw_conn *connctx, void *buf, int n);<br />
<br />
void ibw_close(ibw_conn *connctx);</div>Psomogyi@gamax.huhttps://wiki.samba.org/index.php?title=CTDB_Project&diff=1762CTDB Project2006-11-17T13:24:16Z<p>Psomogyi@gamax.hu: /* Hardware acceleration */</p>
<hr />
<div>= CTDB Project =<br />
<br />
This project aims to produce an implementation of the CTDB protocol described in the [[Samba & Clustering]] page<br />
<br />
== Project Members ==<br />
<br />
Sven Oehme (project leader) <br />
Andrew Tridgell (technical lead)<br />
Alexander Bokovoy<br />
Aleksey Fedoseev<br />
Jim McDonough<br />
Peter Somogyi<br />
<br />
== Project Outline ==<br />
<br />
The initial work will focus on an implementation as part of tdb itself. Integration with the Samba source tree will happen at a later date. Work will probably happen in a bzr tree, but the details have not been worked out yet. Check back here for updates.<br />
<br />
= Project Tasks =<br />
<br />
== Hardware acceleration ==<br />
<br />
(note: Peter is looking at this one)<br />
<br />
We want CTDB to be very fast on hardware that supports fast messaging. In particular we are interested in good use of infiniband adapters, where we expect to get messaging latencies of the order of 3 to 5 microseconds. <br />
<br />
From discussions so far it looks like the 'verbs' API, perhaps with a modification to allow us to hook it into epoll(), will be the right choice. Basic information on this API is available at https://openib.org/tiki/tiki-index.php<br />
<br />
The basic features we want from a messaging API are:<br />
<br />
- low latency. We would like to get it down to just a few<br />
microseconds per message. Messages will vary in size, but typically<br />
be small (say between 64 and 512 bytes).<br />
<br />
- non-blocking. We would really like an API that hooks into poll, so<br />
we can use epoll(), poll() or select(). <br />
<br />
- If we can't have an API that hooks into poll() or epoll(), then a<br />
callback or signal based API would do if the overheads are small<br />
enough. In the same code we also need to be working on a unix<br />
domain socket (datagram socket) so we'd like the overhead of<br />
dealing with both the infiniband messages and the local datagrams<br />
to be low.<br />
<br />
- What we definitely don't want to use is an API that chews a lot of<br />
CPU. So we don't want to be spinning in userspace on a set of<br />
mapped registers in the hope that a message might come along. The<br />
CPU will be needed for other tasks. Using mapped registers for send<br />
would probably be fine, but we'd probably need some kernel mediated<br />
mechanism for receive unless you can suggest a way to avoid it.<br />
<br />
- ideally we'd have reliable delivery, or at least be told when<br />
delivery has failed on a send, but if that is too expensive then<br />
we'll do our own reliable delivery mechanism.<br />
<br />
- we need to be able to add/remove nodes from the cluster. The Samba<br />
clustering code will have its own recovery protocol.<br />
<br />
- a 'message' like API would suit us better than a 'remote DMA'<br />
style API, unless the remote DMA API is significantly more<br />
efficient. Ring buffers would be fine.<br />
<br />
A socket-like wrapper interface is the current aim: [[CTDB_Project_ibwrapper]]<br />
<br />
== Flesh out CTDB API ==<br />
<br />
(note: Alexander and Aleksey are looking at this)<br />
<br />
By this I mean the C api in the "Clustered TDB API" section of the<br />
wiki page. The API as given there now is missing some pieces, and I<br />
think can be greatly improved.<br />
<br />
This is likely to feed back into the CTDB protocol description as<br />
well. Ideally we'd get rid of these calls:<br />
<br />
CTDB_REQ_FETCH_LOCKED<br />
CTDB_REPLY_FETCH_LOCKED<br />
CTDB_REQ_UNLOCK<br />
CTDB_REPLY_UNLOCK<br />
<br />
assuming we can demonstrate they aren't needed. I also think we can<br />
combine the CTDB_REQ_CONDITIONAL_APPEND and the CTDB_REQ_FETCH call<br />
into a single CTDB_REQ_REQUEST call which takes a key, a blob of data<br />
and a condition ID as input, and returns a blob of data and a status<br />
code as output. For a fetch call the input blob of data would be zero<br />
length.<br />
<br />
== Code s3/s4 opendb and brlock on top of ctdb api ==<br />
<br />
Whoever does this would pick either s3 or s4 initially, I don't think<br />
there is any point in doing them in parallel (we are bound to make the<br />
same sorts of mistakes on both if we did that).<br />
<br />
This will also feed a lot into the previous line item, working out the API.<br />
<br />
== Code CTDB api on top of dumb tdb ==<br />
<br />
This also feeds into the API discussion. It should be a very simple<br />
and dumb implementation, aiming to be used to allow the s3/s4<br />
implementation to have something to test against.<br />
<br />
== Prototype CTDB library on top of UDP/TCP ==<br />
<br />
(note: tridge is looking at this task)<br />
<br />
The initial implementation of the CTDB protocol will be on top of UDP/TCP<br />
<br />
== Setup standalone test environment ==<br />
<br />
This test environment is meant for non-clustered usage, instead emulating a cluster using<br />
IP on loopback. It will need to run multiple instances talking over 127.0.0.X interfaces. <br />
This will involve some shell scripting, plus some work on<br />
adding/removing nodes from the cluster. It might be easiest to add a<br />
CTDB protocol request asking a node to 'go quiet', then asking it to<br />
become active again later to simulate a node dying and coming back.<br />
<br />
== Code CTDB test suite ==<br />
(note: jim is looking at this one)<br />
<br />
This reflects the fact that I want this project to concentrate on<br />
building ctdb on tdb + messaging, and not concentrate on the "whole<br />
problem" involving Samba until later. We'll do a basic s3/s4 backend<br />
implementation to make sure the ideas can work, but I want the major<br />
testing effort to involve simple tests directly against the ctdb<br />
API. It will be so much easier to simulate exotic error conditions<br />
that way.<br />
<br />
== Flesh out recovery part of ctdb protocol == <br />
<br />
(note: tridge is looking at this one)<br />
<br />
I explained on the phone that I think the simplest recovery process<br />
will be something like this:<br />
<br />
- global sync and pick 'master' for recovery<br />
- every node sends all records from its local tdb to the LMASTER<br />
- master waits till all nodes say they are done<br />
- global sync and restart<br />
<br />
The recovery phase will need to very carefully cope with lots of<br />
corner cases, like when a node goes down during recovery.<br />
<br />
== Work out details for persistent tdbs ==<br />
<br />
this will need some more thought - it's not our top priority, but<br />
eventually the long-lived databases will matter.<br />
<br />
== Wireshark dissector ==<br />
<br />
We'll need a wireshark dissector, but only once the protocol settles down a little.</div>Psomogyi@gamax.hu