afs: Probe multiple fileservers simultaneously

Send probes to all the unprobed fileservers in a fileserver list on all
addresses simultaneously in an attempt to find out the fastest route whilst
not getting stuck for 20s on any server or address that we don't get a
reply from.

This alleviates the problem whereby attempting to access a new server can
take a long time because the rotation algorithm ends up rotating through
all servers and addresses until it finds one that responds.

Signed-off-by: David Howells <dhowells@redhat.com>
This commit is contained in:
David Howells
2018-10-20 00:57:59 +01:00
parent 18ac61853c
commit 3bf0fb6f33
17 changed files with 1050 additions and 350 deletions

View File

@@ -231,6 +231,8 @@ static struct afs_server *afs_alloc_server(struct afs_net *net,
rwlock_init(&server->fs_lock);
INIT_HLIST_HEAD(&server->cb_volumes);
rwlock_init(&server->cb_break_lock);
init_waitqueue_head(&server->probe_wq);
spin_lock_init(&server->probe_lock);
afs_inc_servers_outstanding(net);
_leave(" = %p", server);
@@ -254,7 +256,7 @@ static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
ret = -ERESTARTSYS;
if (afs_begin_vlserver_operation(&vc, cell, key)) {
while (afs_select_vlserver(&vc)) {
if (test_bit(vc.ac.index, &vc.ac.alist->yfs))
if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags))
alist = afs_yfsvl_get_endpoints(&vc, uuid);
else
alist = afs_vl_get_addrs_u(&vc, uuid);
@@ -365,8 +367,7 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
struct afs_addr_list *alist = rcu_access_pointer(server->addresses);
struct afs_addr_cursor ac = {
.alist = alist,
.start = alist->index,
.index = 0,
.index = alist->preferred,
.error = 0,
};
_enter("%p", server);
@@ -374,6 +375,9 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
wait_var_event(&server->probe_outstanding,
atomic_read(&server->probe_outstanding) == 0);
call_rcu(&server->rcu, afs_server_rcu);
afs_dec_servers_outstanding(net);
}
@@ -506,105 +510,6 @@ void afs_purge_servers(struct afs_net *net)
_leave("");
}
/*
 * Ask a fileserver for its capabilities, walking its address list until one
 * address answers or a non-retryable error is hit.
 *
 * The address cursor embedded in @fc is reset to begin at the list's current
 * index.  On a successful reply the server is flagged as probed (and, if it
 * is flagged as a YFS server, every address in the list is switched over to
 * the YFS service ID) and true is returned.  Otherwise false is returned with
 * the failure left in fc->ac.error.  The cursor is ended on both paths.
 *
 * TODO: Try service upgrade.
 */
static bool afs_do_probe_fileserver(struct afs_fs_cursor *fc)
{
	struct afs_addr_cursor *ac = &fc->ac;
	int addr;

	_enter("");

	/* Restart the address iteration from the list's current index. */
	ac->start = READ_ONCE(ac->alist->index);
	ac->index = ac->start;
	ac->error = 0;
	ac->begun = false;

	while (afs_iterate_addresses(ac)) {
		afs_fs_get_capabilities(afs_v2net(fc->vnode), fc->cbi->server,
					ac, fc->key);

		switch (ac->error) {
		case 0:
			goto responded;

		case -ENETUNREACH:
		case -EHOSTUNREACH:
		case -ECONNREFUSED:
		case -ETIMEDOUT:
		case -ETIME:
			/* This address is no good; move on to the next one. */
			continue;

		case -ECONNABORTED:
			/* The peer aborted the call: map the abort code. */
			ac->error = afs_abort_to_error(ac->abort_code);
			goto failed;

		case -ENOMEM:
		case -ENONET:
			goto failed;

		default:
			ac->error = afs_io_error(NULL, afs_io_error_fs_probe_fail);
			goto failed;
		}
	}

failed:
	/* Reached on a hard error or when the address list is exhausted. */
	afs_end_cursor(ac);
	return false;

responded:
	/* The address list must be updated before the cursor is ended. */
	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) {
		for (addr = 0; addr < ac->alist->nr_addrs; addr++)
			ac->alist->addrs[addr].srx_service = YFS_FS_SERVICE;
	}
	afs_end_cursor(ac);
	set_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags);
	return true;
}
/*
* If we haven't already, try probing the fileserver to get its capabilities.
* We try not to instigate parallel probes, but it's possible that the parallel
* probes will fail due to authentication failure when ours would succeed.
*
* TODO: Try sending an anonymous probe if an authenticated probe fails.
*/
bool afs_probe_fileserver(struct afs_fs_cursor *fc)
{
bool success;
int ret, retries = 0;
_enter("");
retry:
if (test_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags)) {
_leave(" = t");
return true;
}
if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags)) {
success = afs_do_probe_fileserver(fc);
clear_bit_unlock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags);
wake_up_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING);
_leave(" = t");
return success;
}
_debug("wait");
ret = wait_on_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING,
TASK_INTERRUPTIBLE);
if (ret == -ERESTARTSYS) {
fc->ac.error = ret;
_leave(" = f [%d]", ret);
return false;
}
retries++;
if (retries == 4) {
fc->ac.error = -ESTALE;
_leave(" = f [stale]");
return false;
}
_debug("retry");
goto retry;
}
/*
* Get an update for a server's address list.
*/