Replace the monolithic DRBD 8.4 state machine with an architecture suited for clusters with more than one peer. The central concept is a transactional model: state is held in per-object arrays indexed by [NOW] and [NEW], and every change is bracketed by begin/end calls that validate the proposed transition resource-wide before atomically committing it or rolling it back. Replace the single spinlock that serialized everything with finer-grained locking: a read-write lock for state access, and separate locks for peer requests and interval trees. Cluster-wide state changes (role changes, connect/disconnect, resize) use a two-phase commit protocol. The initiating node sends a prepare message to all reachable peers, collects replies with timeout and exponential backoff, then commits or aborts. Not-fully-connected topologies are handled by forwarding nested 2PC rounds through intermediate nodes. Add a quorum mechanism with tiebreaker support for even-sized clusters. This can suspend or fail I/O when the cluster loses more than half of its peers. Unify post-state-change processing into a single resource-wide work item that handles UUID propagation, resync startup, I/O suspension, metadata persistence, and netlink notifications for all objects in one pass, replacing the separate per-device and per-connection callbacks from 8.4. 
Co-developed-by: Philipp Reisner Signed-off-by: Philipp Reisner Co-developed-by: Lars Ellenberg Signed-off-by: Lars Ellenberg Co-developed-by: Joel Colledge Signed-off-by: Joel Colledge Co-developed-by: Christoph Böhmwalder Signed-off-by: Christoph Böhmwalder --- drivers/block/drbd/drbd_state.c | 7724 +++++++++++++++++++++++-------- include/linux/drbd_genl.h | 2 + include/linux/drbd_limits.h | 7 + 3 files changed, 5898 insertions(+), 1835 deletions(-) diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index adcba7f1d8ea..ab1ff6f85fb2 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -13,199 +13,414 @@ */ -#include +#include +#include #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" #include "drbd_state_change.h" -struct after_state_chg_work { + +struct after_state_change_work { struct drbd_work w; - struct drbd_device *device; - union drbd_state os; - union drbd_state ns; - enum chg_state_flags flags; - struct completion *done; struct drbd_state_change *state_change; + struct completion *done; +}; + +struct quorum_info { + int up_to_date; + int present; + int voters; + int quorum_at; + int min_redundancy_at; +}; + +struct quorum_detail { + int up_to_date; + int present; + int outdated; + int diskless; + int missing_diskless; + int quorumless; + int unknown; + int quorate_peers; +}; + +struct change_context { + struct drbd_resource *resource; + int vnr; + union drbd_state mask; + union drbd_state val; + int target_node_id; + enum chg_state_flags flags; + bool change_local_state_last; + const char **err_str; +}; + +enum change_phase { + PH_LOCAL_COMMIT, + PH_PREPARE, + PH_84_COMMIT, + PH_COMMIT, }; -enum sanitize_state_warnings { - NO_WARNING, - ABORTED_ONLINE_VERIFY, - ABORTED_RESYNC, - CONNECTION_LOST_NEGOTIATING, - IMPLICITLY_UPGRADED_DISK, - IMPLICITLY_UPGRADED_PDSK, +struct change_disk_state_context { + struct change_context context; + struct drbd_device *device; }; +static 
bool lost_contact_to_peer_data(enum drbd_disk_state *peer_disk_state); +static bool peer_returns_diskless(struct drbd_peer_device *peer_device, + enum drbd_disk_state os, enum drbd_disk_state ns); +static void print_state_change(struct drbd_resource *resource, const char *prefix, const char *tag); +static void finish_state_change(struct drbd_resource *, const char *tag); +static int w_after_state_change(struct drbd_work *w, int unused); +static enum drbd_state_rv is_valid_soft_transition(struct drbd_resource *); +static enum drbd_state_rv is_valid_transition(struct drbd_resource *resource); +static void sanitize_state(struct drbd_resource *resource); +static void ensure_exposed_data_uuid(struct drbd_device *device); +static enum drbd_state_rv change_peer_state(struct drbd_connection *, int, union drbd_state, + union drbd_state, unsigned long *); +static void check_wrongly_set_mdf_exists(struct drbd_device *); +static void update_members(struct drbd_resource *resource); +static bool calc_data_accessible(struct drbd_state_change *state_change, int n_device, + enum which_state which); + +/* We need to stay consistent if we are neighbor of a diskless primary with + different UUID. This function should be used if the device was D_UP_TO_DATE + before. + */ +static bool may_return_to_up_to_date(struct drbd_device *device, enum which_state which) +{ + struct drbd_peer_device *peer_device; + bool rv = true; + + rcu_read_lock(); + for_each_peer_device_rcu(peer_device, device) { + if (peer_device->disk_state[which] == D_DISKLESS && + peer_device->connection->peer_role[which] == R_PRIMARY && + peer_device->current_uuid != drbd_current_uuid(device)) { + rv = false; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +/** + * may_be_up_to_date() - check if transition from D_CONSISTENT to D_UP_TO_DATE is allowed + * @device: DRBD device. 
+ * @which: OLD or NEW + * + * When fencing is enabled, it may only transition from D_CONSISTENT to D_UP_TO_DATE + * when ether all peers are connected, or outdated. + */ +static bool may_be_up_to_date(struct drbd_device *device, enum which_state which) +{ + bool all_peers_outdated = true; + int node_id; + + if (!may_return_to_up_to_date(device, which)) + return false; + + rcu_read_lock(); + for (node_id = 0; node_id < DRBD_NODE_ID_MAX; node_id++) { + struct drbd_peer_md *peer_md = &device->ldev->md.peers[node_id]; + struct drbd_peer_device *peer_device; + enum drbd_disk_state peer_disk_state; + bool want_bitmap = true; + + if (node_id == device->ldev->md.node_id) + continue; + + if (!(peer_md->flags & MDF_HAVE_BITMAP) && !(peer_md->flags & MDF_NODE_EXISTS)) + continue; + + if (!(peer_md->flags & MDF_PEER_FENCING)) + continue; + peer_device = peer_device_by_node_id(device, node_id); + if (peer_device) { + struct peer_device_conf *pdc = rcu_dereference(peer_device->conf); + want_bitmap = pdc->bitmap; + peer_disk_state = peer_device->disk_state[NEW]; + } else { + peer_disk_state = D_UNKNOWN; + } + + switch (peer_disk_state) { + case D_DISKLESS: + if (!(peer_md->flags & MDF_PEER_DEVICE_SEEN)) + continue; + fallthrough; + case D_ATTACHING: + case D_DETACHING: + case D_FAILED: + case D_NEGOTIATING: + case D_UNKNOWN: + if (!want_bitmap) + continue; + if ((peer_md->flags & MDF_PEER_OUTDATED)) + continue; + break; + case D_INCONSISTENT: + case D_OUTDATED: + continue; + case D_CONSISTENT: + case D_UP_TO_DATE: + /* These states imply that there is a connection. If there is + a connection we do not need to insist that the peer was + outdated. 
*/ + continue; + case D_MASK: + break; + } + + all_peers_outdated = false; + } + rcu_read_unlock(); + return all_peers_outdated; +} + +static bool stable_up_to_date_neighbor(struct drbd_device *device) +{ + struct drbd_peer_device *peer_device; + bool rv = false; + + rcu_read_lock(); + for_each_peer_device_rcu(peer_device, device) { + if (peer_device->disk_state[NEW] == D_UP_TO_DATE && + peer_device->uuid_flags & UUID_FLAG_STABLE && /* primary is also stable */ + peer_device->current_uuid == drbd_current_uuid(device)) { + rv = true; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +/** + * disk_state_from_md() - determine initial disk state + * @device: DRBD device. + * + * When a disk is attached to a device, we set the disk state to D_NEGOTIATING. + * We then wait for all connected peers to send the peer disk state. Once that + * has happened, we can determine the actual disk state based on the peer disk + * states and the state of the disk itself. + * + * The initial disk state becomes D_UP_TO_DATE without fencing or when we know + * that all peers have been outdated, and D_CONSISTENT otherwise. + * + * The caller either needs to have a get_ldev() reference, or need to call + * this function only if disk_state[NOW] >= D_NEGOTIATING and holding the + * state_rwlock. + */ +enum drbd_disk_state disk_state_from_md(struct drbd_device *device) +{ + enum drbd_disk_state disk_state; + + if (!drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) + disk_state = D_INCONSISTENT; + else if (!drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE)) + disk_state = D_OUTDATED; + else + disk_state = may_be_up_to_date(device, NOW) ? 
D_UP_TO_DATE : D_CONSISTENT; + + return disk_state; +} + +bool is_suspended_fen(struct drbd_resource *resource, enum which_state which) +{ + struct drbd_connection *connection; + bool rv = false; + + rcu_read_lock(); + for_each_connection_rcu(connection, resource) { + if (connection->susp_fen[which]) { + rv = true; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +bool resource_is_suspended(struct drbd_resource *resource, enum which_state which) +{ + bool rv = resource->susp_user[which] || resource->susp_nod[which] || + resource->susp_quorum[which] || resource->susp_uuid[which]; + + if (rv) + return rv; + + return is_suspended_fen(resource, which); +} + static void count_objects(struct drbd_resource *resource, - unsigned int *n_devices, - unsigned int *n_connections) + struct drbd_state_change_object_count *ocnt) { + struct drbd_path *path; struct drbd_device *device; struct drbd_connection *connection; int vnr; - *n_devices = 0; - *n_connections = 0; + lockdep_assert_held(&resource->state_rwlock); + + ocnt->n_devices = 0; + ocnt->n_connections = 0; + ocnt->n_paths = 0; idr_for_each_entry(&resource->devices, device, vnr) - (*n_devices)++; - for_each_connection(connection, resource) - (*n_connections)++; + ocnt->n_devices++; + for_each_connection(connection, resource) { + ocnt->n_connections++; + list_for_each_entry(path, &connection->transport.paths, list) { + ocnt->n_paths++; + } + } } -static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp) +static struct drbd_state_change *alloc_state_change(struct drbd_state_change_object_count *ocnt, gfp_t flags) { struct drbd_state_change *state_change; - unsigned int size, n; + unsigned int size; size = sizeof(struct drbd_state_change) + - n_devices * sizeof(struct drbd_device_state_change) + - n_connections * sizeof(struct drbd_connection_state_change) + - n_devices * n_connections * sizeof(struct drbd_peer_device_state_change); - state_change = 
kmalloc(size, gfp); + ocnt->n_devices * sizeof(struct drbd_device_state_change) + + ocnt->n_connections * sizeof(struct drbd_connection_state_change) + + ocnt->n_devices * ocnt->n_connections * sizeof(struct drbd_peer_device_state_change) + + ocnt->n_paths * sizeof(struct drbd_path_state); + state_change = kzalloc(size, flags); if (!state_change) return NULL; - state_change->n_devices = n_devices; - state_change->n_connections = n_connections; + state_change->n_connections = ocnt->n_connections; + state_change->n_devices = ocnt->n_devices; + state_change->n_paths = ocnt->n_paths; state_change->devices = (void *)(state_change + 1); - state_change->connections = (void *)&state_change->devices[n_devices]; - state_change->peer_devices = (void *)&state_change->connections[n_connections]; - state_change->resource->resource = NULL; - for (n = 0; n < n_devices; n++) - state_change->devices[n].device = NULL; - for (n = 0; n < n_connections; n++) - state_change->connections[n].connection = NULL; + state_change->connections = (void *)&state_change->devices[ocnt->n_devices]; + state_change->peer_devices = (void *)&state_change->connections[ocnt->n_connections]; + state_change->paths = (void *)&state_change->peer_devices[ocnt->n_devices*ocnt->n_connections]; return state_change; } -struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp) +struct drbd_state_change *remember_state_change(struct drbd_resource *resource, gfp_t gfp) { struct drbd_state_change *state_change; struct drbd_device *device; - unsigned int n_devices; struct drbd_connection *connection; - unsigned int n_connections; + struct drbd_state_change_object_count ocnt; int vnr; struct drbd_device_state_change *device_state_change; struct drbd_peer_device_state_change *peer_device_state_change; struct drbd_connection_state_change *connection_state_change; + struct drbd_path_state *path_state; /* yes, not a _change :-( */ + + lockdep_assert_held(&resource->state_rwlock); - /* Caller 
holds req_lock spinlock. - * No state, no device IDR, no connections lists can change. */ - count_objects(resource, &n_devices, &n_connections); - state_change = alloc_state_change(n_devices, n_connections, gfp); + count_objects(resource, &ocnt); + state_change = alloc_state_change(&ocnt, gfp); if (!state_change) return NULL; kref_get(&resource->kref); state_change->resource->resource = resource; - state_change->resource->role[OLD] = - conn_highest_role(first_connection(resource)); - state_change->resource->susp[OLD] = resource->susp; - state_change->resource->susp_nod[OLD] = resource->susp_nod; - state_change->resource->susp_fen[OLD] = resource->susp_fen; - - connection_state_change = state_change->connections; - for_each_connection(connection, resource) { - kref_get(&connection->kref); - connection_state_change->connection = connection; - connection_state_change->cstate[OLD] = - connection->cstate; - connection_state_change->peer_role[OLD] = - conn_highest_peer(connection); - connection_state_change++; - } + memcpy(state_change->resource->role, + resource->role, sizeof(resource->role)); + memcpy(state_change->resource->susp, + resource->susp_user, sizeof(resource->susp_user)); + memcpy(state_change->resource->susp_nod, + resource->susp_nod, sizeof(resource->susp_nod)); + memcpy(state_change->resource->susp_uuid, + resource->susp_uuid, sizeof(resource->susp_uuid)); + memcpy(state_change->resource->fail_io, + resource->fail_io, sizeof(resource->fail_io)); device_state_change = state_change->devices; peer_device_state_change = state_change->peer_devices; idr_for_each_entry(&resource->devices, device, vnr) { + struct drbd_peer_device *peer_device; + kref_get(&device->kref); device_state_change->device = device; - device_state_change->disk_state[OLD] = device->state.disk; + memcpy(device_state_change->disk_state, + device->disk_state, sizeof(device->disk_state)); + memcpy(device_state_change->have_quorum, + device->have_quorum, sizeof(device->have_quorum)); /* The 
peer_devices for each device have to be enumerated in the order of the connections. We may not use for_each_peer_device() here. */ for_each_connection(connection, resource) { - struct drbd_peer_device *peer_device; - peer_device = conn_peer_device(connection, device->vnr); + peer_device_state_change->peer_device = peer_device; - peer_device_state_change->disk_state[OLD] = - device->state.pdsk; - peer_device_state_change->repl_state[OLD] = - max_t(enum drbd_conns, - C_WF_REPORT_PARAMS, device->state.conn); - peer_device_state_change->resync_susp_user[OLD] = - device->state.user_isp; - peer_device_state_change->resync_susp_peer[OLD] = - device->state.peer_isp; - peer_device_state_change->resync_susp_dependency[OLD] = - device->state.aftr_isp; + memcpy(peer_device_state_change->disk_state, + peer_device->disk_state, sizeof(peer_device->disk_state)); + memcpy(peer_device_state_change->repl_state, + peer_device->repl_state, sizeof(peer_device->repl_state)); + memcpy(peer_device_state_change->resync_susp_user, + peer_device->resync_susp_user, + sizeof(peer_device->resync_susp_user)); + memcpy(peer_device_state_change->resync_susp_peer, + peer_device->resync_susp_peer, + sizeof(peer_device->resync_susp_peer)); + memcpy(peer_device_state_change->resync_susp_dependency, + peer_device->resync_susp_dependency, + sizeof(peer_device->resync_susp_dependency)); + memcpy(peer_device_state_change->resync_susp_other_c, + peer_device->resync_susp_other_c, + sizeof(peer_device->resync_susp_other_c)); + memcpy(peer_device_state_change->resync_active, + peer_device->resync_active, + sizeof(peer_device->resync_active)); + memcpy(peer_device_state_change->replication, + peer_device->replication, + sizeof(peer_device->replication)); + memcpy(peer_device_state_change->peer_replication, + peer_device->peer_replication, + sizeof(peer_device->peer_replication)); peer_device_state_change++; } device_state_change++; } - return state_change; -} - -static void remember_new_state(struct 
drbd_state_change *state_change) -{ - struct drbd_resource_state_change *resource_state_change; - struct drbd_resource *resource; - unsigned int n; - - if (!state_change) - return; - - resource_state_change = &state_change->resource[0]; - resource = resource_state_change->resource; - - resource_state_change->role[NEW] = - conn_highest_role(first_connection(resource)); - resource_state_change->susp[NEW] = resource->susp; - resource_state_change->susp_nod[NEW] = resource->susp_nod; - resource_state_change->susp_fen[NEW] = resource->susp_fen; - - for (n = 0; n < state_change->n_devices; n++) { - struct drbd_device_state_change *device_state_change = - &state_change->devices[n]; - struct drbd_device *device = device_state_change->device; - - device_state_change->disk_state[NEW] = device->state.disk; - } + connection_state_change = state_change->connections; + path_state = state_change->paths; + for_each_connection(connection, resource) { + struct drbd_path *path; - for (n = 0; n < state_change->n_connections; n++) { - struct drbd_connection_state_change *connection_state_change = - &state_change->connections[n]; - struct drbd_connection *connection = - connection_state_change->connection; + kref_get(&connection->kref); + connection_state_change->connection = connection; + memcpy(connection_state_change->cstate, + connection->cstate, sizeof(connection->cstate)); + memcpy(connection_state_change->peer_role, + connection->peer_role, sizeof(connection->peer_role)); + memcpy(connection_state_change->susp_fen, + connection->susp_fen, sizeof(connection->susp_fen)); + + list_for_each_entry(path, &connection->transport.paths, list) { + /* Share the connection kref with above. + * Could also share the pointer, but would then need to + * remember an additional n_paths per connection + * count/offset (connection_state_change->n_paths++) + * to be able to associate the paths with its connection. + * So why not directly store the pointer here again. 
*/ + path_state->connection = connection; + kref_get(&path->kref); + path_state->path = path; + path_state->path_established = test_bit(TR_ESTABLISHED, &path->flags); + + path_state++; + } - connection_state_change->cstate[NEW] = connection->cstate; - connection_state_change->peer_role[NEW] = - conn_highest_peer(connection); + connection_state_change++; } - for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) { - struct drbd_peer_device_state_change *peer_device_state_change = - &state_change->peer_devices[n]; - struct drbd_device *device = - peer_device_state_change->peer_device->device; - union drbd_dev_state state = device->state; - - peer_device_state_change->disk_state[NEW] = state.pdsk; - peer_device_state_change->repl_state[NEW] = - max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn); - peer_device_state_change->resync_susp_user[NEW] = - state.user_isp; - peer_device_state_change->resync_susp_peer[NEW] = - state.peer_isp; - peer_device_state_change->resync_susp_dependency[NEW] = - state.aftr_isp; - } + return state_change; } void copy_old_to_new_state_change(struct drbd_state_change *state_change) @@ -219,7 +434,8 @@ void copy_old_to_new_state_change(struct drbd_state_change *state_change) OLD_TO_NEW(resource_state_change->role); OLD_TO_NEW(resource_state_change->susp); OLD_TO_NEW(resource_state_change->susp_nod); - OLD_TO_NEW(resource_state_change->susp_fen); + OLD_TO_NEW(resource_state_change->susp_uuid); + OLD_TO_NEW(resource_state_change->fail_io); for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { struct drbd_connection_state_change *connection_state_change = @@ -227,6 +443,7 @@ void copy_old_to_new_state_change(struct drbd_state_change *state_change) OLD_TO_NEW(connection_state_change->peer_role); OLD_TO_NEW(connection_state_change->cstate); + OLD_TO_NEW(connection_state_change->susp_fen); } for (n_device = 0; n_device < state_change->n_devices; n_device++) { @@ -234,6 +451,7 @@ void 
copy_old_to_new_state_change(struct drbd_state_change *state_change) &state_change->devices[n_device]; OLD_TO_NEW(device_state_change->disk_state); + OLD_TO_NEW(device_state_change->have_quorum); } n_peer_devices = state_change->n_devices * state_change->n_connections; @@ -246,6 +464,10 @@ void copy_old_to_new_state_change(struct drbd_state_change *state_change) OLD_TO_NEW(p->resync_susp_user); OLD_TO_NEW(p->resync_susp_peer); OLD_TO_NEW(p->resync_susp_dependency); + OLD_TO_NEW(p->resync_susp_other_c); + OLD_TO_NEW(p->resync_active); + OLD_TO_NEW(p->replication); + OLD_TO_NEW(p->peer_replication); } #undef OLD_TO_NEW @@ -258,2140 +480,5972 @@ void forget_state_change(struct drbd_state_change *state_change) if (!state_change) return; - if (state_change->resource->resource) + if (state_change->resource->resource) { kref_put(&state_change->resource->resource->kref, drbd_destroy_resource); + } for (n = 0; n < state_change->n_devices; n++) { struct drbd_device *device = state_change->devices[n].device; - if (device) + if (device) { kref_put(&device->kref, drbd_destroy_device); + } } for (n = 0; n < state_change->n_connections; n++) { struct drbd_connection *connection = state_change->connections[n].connection; - if (connection) + if (connection) { kref_put(&connection->kref, drbd_destroy_connection); + } + } + for (n = 0; n < state_change->n_paths; n++) { + struct drbd_path *path = state_change->paths[n].path; + if (path) { + kref_put(&path->kref, drbd_destroy_path); + } } kfree(state_change); } -static int w_after_state_ch(struct drbd_work *w, int unused); -static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags, - struct drbd_state_change *); -static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); -static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); -static enum drbd_state_rv is_valid_transition(union drbd_state 
os, union drbd_state ns); -static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum sanitize_state_warnings *warn); - -static inline bool is_susp(union drbd_state s) +static bool state_has_changed(struct drbd_resource *resource) { - return s.susp || s.susp_nod || s.susp_fen; + struct drbd_connection *connection; + struct drbd_device *device; + int vnr; + + if (resource->state_change_flags & CS_FORCE_RECALC) + return true; + + if (resource->role[OLD] != resource->role[NEW] || + resource->susp_user[OLD] != resource->susp_user[NEW] || + resource->susp_nod[OLD] != resource->susp_nod[NEW] || + resource->susp_quorum[OLD] != resource->susp_quorum[NEW] || + resource->susp_uuid[OLD] != resource->susp_uuid[NEW] || + resource->fail_io[OLD] != resource->fail_io[NEW]) + return true; + + for_each_connection(connection, resource) { + if (connection->cstate[OLD] != connection->cstate[NEW] || + connection->peer_role[OLD] != connection->peer_role[NEW] || + connection->susp_fen[OLD] != connection->susp_fen[NEW]) + return true; + } + + idr_for_each_entry(&resource->devices, device, vnr) { + struct drbd_peer_device *peer_device; + + if (device->disk_state[OLD] != device->disk_state[NEW] || + device->have_quorum[OLD] != device->have_quorum[NEW]) + return true; + + for_each_peer_device(peer_device, device) { + if (peer_device->disk_state[OLD] != peer_device->disk_state[NEW] || + peer_device->repl_state[OLD] != peer_device->repl_state[NEW] || + peer_device->resync_susp_user[OLD] != + peer_device->resync_susp_user[NEW] || + peer_device->resync_susp_peer[OLD] != + peer_device->resync_susp_peer[NEW] || + peer_device->resync_susp_dependency[OLD] != + peer_device->resync_susp_dependency[NEW] || + peer_device->resync_susp_other_c[OLD] != + peer_device->resync_susp_other_c[NEW] || + peer_device->resync_active[OLD] != + peer_device->resync_active[NEW] || + peer_device->replication[OLD] != + peer_device->replication[NEW] || + 
peer_device->peer_replication[OLD] != + peer_device->peer_replication[NEW] || + peer_device->uuid_flags & UUID_FLAG_GOT_STABLE) + return true; + } + } + return false; } -bool conn_all_vols_unconf(struct drbd_connection *connection) +static void ___begin_state_change(struct drbd_resource *resource) { - struct drbd_peer_device *peer_device; - bool rv = true; + struct drbd_connection *connection; + struct drbd_device *device; int vnr; - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - if (device->state.disk != D_DISKLESS || - device->state.conn != C_STANDALONE || - device->state.role != R_SECONDARY) { - rv = false; - break; - } + resource->role[NEW] = resource->role[NOW]; + resource->susp_user[NEW] = resource->susp_user[NOW]; + resource->susp_nod[NEW] = resource->susp_nod[NOW]; + resource->susp_quorum[NEW] = resource->susp_quorum[NOW]; + resource->susp_uuid[NEW] = resource->susp_uuid[NOW]; + resource->fail_io[NEW] = resource->fail_io[NOW]; + + for_each_connection_rcu(connection, resource) { + connection->cstate[NEW] = connection->cstate[NOW]; + connection->peer_role[NEW] = connection->peer_role[NOW]; + connection->susp_fen[NEW] = connection->susp_fen[NOW]; } - rcu_read_unlock(); - return rv; + idr_for_each_entry(&resource->devices, device, vnr) { + struct drbd_peer_device *peer_device; + + device->disk_state[NEW] = device->disk_state[NOW]; + device->have_quorum[NEW] = device->have_quorum[NOW]; + + for_each_peer_device_rcu(peer_device, device) { + peer_device->disk_state[NEW] = peer_device->disk_state[NOW]; + peer_device->repl_state[NEW] = peer_device->repl_state[NOW]; + peer_device->resync_susp_user[NEW] = + peer_device->resync_susp_user[NOW]; + peer_device->resync_susp_peer[NEW] = + peer_device->resync_susp_peer[NOW]; + peer_device->resync_susp_dependency[NEW] = + peer_device->resync_susp_dependency[NOW]; + peer_device->resync_susp_other_c[NEW] = + 
peer_device->resync_susp_other_c[NOW]; + peer_device->resync_active[NEW] = + peer_device->resync_active[NOW]; + peer_device->replication[NEW] = + peer_device->replication[NOW]; + peer_device->peer_replication[NEW] = + peer_device->peer_replication[NOW]; + } + } } -/* Unfortunately the states where not correctly ordered, when - they where defined. therefore can not use max_t() here. */ -static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) +static void __begin_state_change(struct drbd_resource *resource) { - if (role1 == R_PRIMARY || role2 == R_PRIMARY) - return R_PRIMARY; - if (role1 == R_SECONDARY || role2 == R_SECONDARY) - return R_SECONDARY; - return R_UNKNOWN; + rcu_read_lock(); + ___begin_state_change(resource); } -static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) +static enum drbd_state_rv try_state_change(struct drbd_resource *resource) { - if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) - return R_UNKNOWN; - if (role1 == R_SECONDARY || role2 == R_SECONDARY) - return R_SECONDARY; - return R_PRIMARY; + enum drbd_state_rv rv; + + if (!state_has_changed(resource)) + return SS_NOTHING_TO_DO; + sanitize_state(resource); + rv = is_valid_transition(resource); + if (rv >= SS_SUCCESS && !(resource->state_change_flags & CS_HARD)) + rv = is_valid_soft_transition(resource); + return rv; } -enum drbd_role conn_highest_role(struct drbd_connection *connection) +static void apply_update_to_exposed_data_uuid(struct drbd_resource *resource) { - enum drbd_role role = R_SECONDARY; - struct drbd_peer_device *peer_device; + struct drbd_device *device; int vnr; - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - role = max_role(role, device->state.role); - } - rcu_read_unlock(); + idr_for_each_entry(&resource->devices, device, vnr) { + u64 nedu = device->next_exposed_data_uuid; + int changed = 0; - return role; + if (!nedu) + continue; + if 
(device->disk_state[NOW] < D_INCONSISTENT) + changed = drbd_uuid_set_exposed(device, nedu, false); + + device->next_exposed_data_uuid = 0; + if (changed) + drbd_info(device, "Executing delayed exposed data uuid update: %016llX\n", + (unsigned long long)device->exposed_data_uuid); + else + drbd_info(device, "Canceling delayed exposed data uuid update\n"); + } } -enum drbd_role conn_highest_peer(struct drbd_connection *connection) +void __clear_remote_state_change(struct drbd_resource *resource) { - enum drbd_role peer = R_UNKNOWN; - struct drbd_peer_device *peer_device; - int vnr; + bool is_connect = resource->twopc_reply.is_connect; + int initiator_node_id = resource->twopc_reply.initiator_node_id; - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - peer = max_role(peer, device->state.peer); + resource->remote_state_change = false; + resource->twopc_reply.initiator_node_id = -1; + resource->twopc_reply.tid = 0; + + if (is_connect && resource->twopc_prepare_reply_cmd == 0) { + struct drbd_connection *connection; + + rcu_read_lock(); + connection = drbd_connection_by_node_id(resource, initiator_node_id); + if (connection) + abort_connect(connection); + rcu_read_unlock(); } - rcu_read_unlock(); - return peer; + wake_up_all(&resource->twopc_wait); + + /* Do things that where postponed to after two-phase commits finished */ + apply_update_to_exposed_data_uuid(resource); } -enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection) +static bool state_is_stable(struct drbd_device *device) { - enum drbd_disk_state disk_state = D_DISKLESS; struct drbd_peer_device *peer_device; - int vnr; + bool stable = true; + + /* DO NOT add a default clause, we want the compiler to warn us + * for any newly introduced state we may have forgotten to add here */ rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = 
peer_device->device; - disk_state = max_t(enum drbd_disk_state, disk_state, device->state.disk); + for_each_peer_device_rcu(peer_device, device) { + switch (peer_device->repl_state[NOW]) { + /* New io is only accepted when the peer device is unknown or there is + * a well-established connection. */ + case L_OFF: + case L_ESTABLISHED: + case L_SYNC_SOURCE: + case L_SYNC_TARGET: + case L_VERIFY_S: + case L_VERIFY_T: + case L_PAUSED_SYNC_S: + case L_PAUSED_SYNC_T: + case L_AHEAD: + case L_BEHIND: + case L_STARTING_SYNC_S: + case L_STARTING_SYNC_T: + break; + + /* Allow IO in BM exchange states with new protocols */ + case L_WF_BITMAP_S: + if (peer_device->connection->agreed_pro_version < 96) + stable = false; + break; + + /* no new io accepted in these states */ + case L_WF_BITMAP_T: + case L_WF_SYNC_UUID: + stable = false; + break; + } + if (!stable) + break; } rcu_read_unlock(); - return disk_state; + switch (device->disk_state[NOW]) { + case D_DISKLESS: + case D_INCONSISTENT: + case D_OUTDATED: + case D_CONSISTENT: + case D_UP_TO_DATE: + case D_FAILED: + case D_DETACHING: + /* disk state is stable as well. 
*/ + break; + + /* no new io accepted during transitional states */ + case D_ATTACHING: + case D_NEGOTIATING: + case D_UNKNOWN: + case D_MASK: + stable = false; + } + + return stable; } -enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection) +/* Return true if any connection of @resource is C_CONNECTING in [NOW] and C_CONNECTED in [NEW]. */ static bool drbd_state_change_is_connect(struct drbd_resource *resource) { - enum drbd_disk_state disk_state = D_MASK; - struct drbd_peer_device *peer_device; - int vnr; + struct drbd_connection *connection; - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk); + for_each_connection(connection, resource) { + if (connection->cstate[NOW] == C_CONNECTING && + connection->cstate[NEW] == C_CONNECTED) + return true; } - rcu_read_unlock(); - return disk_state; + return false; } -enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection) +/* Allocate the work item for the after state change handler and attach the result of remember_state_change(); both allocations use GFP_ATOMIC. Returns NULL (and logs, except in the unregistered case) when no work item is available. Caller must hold state_rwlock. */ static struct after_state_change_work *alloc_after_state_change_work(struct drbd_resource *resource) { - enum drbd_disk_state disk_state = D_DISKLESS; - struct drbd_peer_device *peer_device; - int vnr; + struct after_state_change_work *work; - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - disk_state = max_t(enum drbd_disk_state, disk_state, device->state.pdsk); + lockdep_assert_held(&resource->state_rwlock); + + /* If the resource is already "unregistered", the worker thread + * is gone, there is no-one to consume the work item and release + * the associated refcounts. Just don't even create it.
+ */ + if (test_bit(R_UNREGISTERED, &resource->flags)) + return NULL; + + work = kmalloc_obj(*work, GFP_ATOMIC); + if (work) { + work->state_change = remember_state_change(resource, GFP_ATOMIC); + if (!work->state_change) { + kfree(work); + work = NULL; + } } - rcu_read_unlock(); + if (!work) + drbd_err(resource, "Could not allocate after state change work\n"); - return disk_state; + return work; } -enum drbd_conns conn_lowest_conn(struct drbd_connection *connection) +/* Hand @work, with @done attached, to the resource work queue. Without a work item, complete @done right away if one was passed. */ static void queue_after_state_change_work(struct drbd_resource *resource, + struct completion *done, + struct after_state_change_work *work) { - enum drbd_conns conn = C_MASK; - struct drbd_peer_device *peer_device; - int vnr; + if (work) { + work->w.cb = w_after_state_change; + work->done = done; + drbd_queue_work(&resource->work, &work->w); + } else if (done) { + complete(done); + } +} - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - conn = min_t(enum drbd_conns, conn, device->state.conn); +/* Core of every end/abort path: unless CS_ABORT is set, run try_state_change() (only if @rv is still a success) and, if that succeeds and this is not CS_PREPARE, copy all [NEW] values to [NOW], refresh the cached flags, wake waiters and queue the after state change work. Every path ends the RCU read section at the out label. Returns the resulting state change code. */ static enum drbd_state_rv ___end_state_change(struct drbd_resource *resource, struct completion *done, + enum drbd_state_rv rv, const char *tag) +{ + enum chg_state_flags flags = resource->state_change_flags; + struct drbd_connection *connection; + struct drbd_device *device; + bool is_connect; + unsigned int pro_ver; + int vnr; + bool all_devs_have_quorum = true; + struct after_state_change_work *work; + + if (flags & CS_ABORT) + goto out; + if (rv >= SS_SUCCESS) + rv = try_state_change(resource); + if (rv < SS_SUCCESS) { + if (flags & CS_VERBOSE) { + drbd_err(resource, "State change failed: %s (%d)\n", + drbd_set_st_err_str(rv), rv); + print_state_change(resource, "Failed: ", tag); + } + goto out; } - rcu_read_unlock(); + if (flags & CS_PREPARE) + goto out; - return conn; -} + update_members(resource); + finish_state_change(resource, tag); -static bool no_peer_wf_report_params(struct drbd_connection *connection) -{ -
struct drbd_peer_device *peer_device; - int vnr; - bool rv = true; + /* Check whether we are establishing a connection before applying the change. */ + is_connect = drbd_state_change_is_connect(resource); - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) - if (peer_device->device->state.conn == C_WF_REPORT_PARAMS) { - rv = false; - break; + /* This remembers the state change, so call before applying the change. */ + work = alloc_after_state_change_work(resource); + + /* changes to local_cnt and device flags should be visible before + * changes to state, which again should be visible before anything else + * depending on that change happens. */ + smp_wmb(); + resource->role[NOW] = resource->role[NEW]; + resource->susp_user[NOW] = resource->susp_user[NEW]; + resource->susp_nod[NOW] = resource->susp_nod[NEW]; + resource->susp_quorum[NOW] = resource->susp_quorum[NEW]; + resource->susp_uuid[NOW] = resource->susp_uuid[NEW]; + resource->fail_io[NOW] = resource->fail_io[NEW]; + resource->cached_susp = resource_is_suspended(resource, NEW); + + /* find the lowest agreed protocol version over all connections */ pro_ver = PRO_VERSION_MAX; + for_each_connection(connection, resource) { + connection->cstate[NOW] = connection->cstate[NEW]; + connection->peer_role[NOW] = connection->peer_role[NEW]; + connection->susp_fen[NOW] = connection->susp_fen[NEW]; + + pro_ver = min_t(unsigned int, pro_ver, + connection->agreed_pro_version); + + wake_up(&connection->ee_wait); + } + resource->cached_min_aggreed_protocol_version = pro_ver; + + idr_for_each_entry(&resource->devices, device, vnr) { + struct res_opts *o = &resource->res_opts; + struct drbd_peer_device *peer_device; + + device->disk_state[NOW] = device->disk_state[NEW]; + device->have_quorum[NOW] = device->have_quorum[NEW]; + + if (!device->have_quorum[NOW]) + all_devs_have_quorum = false; + + for_each_peer_device(peer_device, device) { + peer_device->disk_state[NOW] = peer_device->disk_state[NEW]; + peer_device->repl_state[NOW] = peer_device->repl_state[NEW]; +
peer_device->resync_susp_user[NOW] = + peer_device->resync_susp_user[NEW]; + peer_device->resync_susp_peer[NOW] = + peer_device->resync_susp_peer[NEW]; + peer_device->resync_susp_dependency[NOW] = + peer_device->resync_susp_dependency[NEW]; + peer_device->resync_susp_other_c[NOW] = + peer_device->resync_susp_other_c[NEW]; + peer_device->resync_active[NOW] = + peer_device->resync_active[NEW]; + peer_device->replication[NOW] = + peer_device->replication[NEW]; + peer_device->peer_replication[NOW] = + peer_device->peer_replication[NEW]; } - rcu_read_unlock(); + device->cached_state_unstable = !state_is_stable(device); + /* cache whether I/O must be failed: io-error policy hit (no quorum or no data) or fail_io set */ device->cached_err_io = + (o->on_no_quorum == ONQ_IO_ERROR && !device->have_quorum[NOW]) || + (o->on_no_data == OND_IO_ERROR && !drbd_data_accessible(device, NOW)) || + resource->fail_io[NEW]; + } + resource->cached_all_devices_have_quorum = all_devs_have_quorum; + smp_wmb(); /* Make the NEW_CUR_UUID bit visible after the state change! */ - return rv; -} + idr_for_each_entry(&resource->devices, device, vnr) { + struct drbd_peer_device *peer_device; + if (test_bit(__NEW_CUR_UUID, &device->flags)) { + clear_bit(__NEW_CUR_UUID, &device->flags); + set_bit(NEW_CUR_UUID, &device->flags); + } + ensure_exposed_data_uuid(device); + + wake_up(&device->al_wait); + wake_up(&device->misc_wait); + + /* Due to the exclusivity of two-phase commits, there can only + * be one connection being established at once. Hence it is OK + * to release uuid_sem for all connections if the state change + * is establishing any connection.
*/ + if (is_connect) { + for_each_peer_device(peer_device, device) { + if (test_and_clear_bit(HOLDING_UUID_READ_LOCK, &peer_device->flags)) + up_read_non_owner(&device->uuid_sem); + } + } + } -static void wake_up_all_devices(struct drbd_connection *connection) -{ - struct drbd_peer_device *peer_device; - int vnr; + wake_up_all(&resource->state_wait); - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) - wake_up(&peer_device->device->state_wait); + /* Call this after applying the state change from NEW to NOW. */ + queue_after_state_change_work(resource, done, work); +out: rcu_read_unlock(); -} + if ((flags & CS_TWOPC) && !(flags & CS_PREPARE)) + __clear_remote_state_change(resource); + resource->state_change_err_str = NULL; + return rv; +} -/** - * cl_wide_st_chg() - true if the state change is a cluster wide one - * @device: DRBD device. - * @os: old (current) state. - * @ns: new (wanted) state. - */ -static int cl_wide_st_chg(struct drbd_device *device, - union drbd_state os, union drbd_state ns) +/* Take state_sem when CS_SERIALIZE is set (unless CS_ALREADY_SERIALIZED or CS_PREPARED is set too), then take the state_rwlock write lock with interrupts saved in @irq_flags, and record @flags for this change. */ void state_change_lock(struct drbd_resource *resource, unsigned long *irq_flags, enum chg_state_flags flags) { - return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && - ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || - (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || - (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || - (os.disk != D_FAILED && ns.disk == D_FAILED))) || - (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || - (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) || - (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS); + if ((flags & CS_SERIALIZE) && !(flags & (CS_ALREADY_SERIALIZED | CS_PREPARED))) { + WARN_ONCE(current == resource->worker.task, + "worker should not initiate state changes with CS_SERIALIZE\n"); + down(&resource->state_sem); + } + write_lock_irqsave(&resource->state_rwlock, *irq_flags); + resource->state_change_flags = flags; } -static union drbd_state
-apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val) +/* Clear the recorded flags and drop the state_rwlock write lock; then wait for @done if one was passed and we are not running on the resource worker task; finally release state_sem if CS_SERIALIZE is set and neither CS_ALREADY_SERIALIZED nor CS_PREPARE is. */ static void __state_change_unlock(struct drbd_resource *resource, unsigned long *irq_flags, struct completion *done) { - union drbd_state ns; - ns.i = (os.i & ~mask.i) | val.i; - return ns; + enum chg_state_flags flags = resource->state_change_flags; + + resource->state_change_flags = 0; + write_unlock_irqrestore(&resource->state_rwlock, *irq_flags); + if (done && expect(resource, current != resource->worker.task)) + wait_for_completion(done); + if ((flags & CS_SERIALIZE) && !(flags & (CS_ALREADY_SERIALIZED | CS_PREPARE))) + up(&resource->state_sem); } -enum drbd_state_rv -drbd_change_state(struct drbd_device *device, enum chg_state_flags f, - union drbd_state mask, union drbd_state val) +/* Unlock without waiting for any completion. */ void state_change_unlock(struct drbd_resource *resource, unsigned long *irq_flags) { - unsigned long flags; - union drbd_state ns; - enum drbd_state_rv rv; - - spin_lock_irqsave(&device->resource->req_lock, flags); - ns = apply_mask_val(drbd_read_state(device), mask, val); - rv = _drbd_set_state(device, ns, f, NULL); - spin_unlock_irqrestore(&device->resource->req_lock, flags); - - return rv; + __state_change_unlock(resource, irq_flags, NULL); } -/** - * drbd_force_state() - Impose a change which happens outside our control on our state - * @device: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits.
- */ -void drbd_force_state(struct drbd_device *device, - union drbd_state mask, union drbd_state val) +/* Start a state change without taking any lock here; the _locked suffix suggests the caller already holds state_rwlock (confirm at call sites). Flags that ask for serializing, waiting, prepare or abort are a bug in this variant. */ void begin_state_change_locked(struct drbd_resource *resource, enum chg_state_flags flags) { - drbd_change_state(device, CS_HARD, mask, val); + BUG_ON(flags & (CS_SERIALIZE | CS_WAIT_COMPLETE | CS_PREPARE | CS_ABORT)); + resource->state_change_flags = flags; + __begin_state_change(resource); } -static enum drbd_state_rv -_req_st_cond(struct drbd_device *device, union drbd_state mask, - union drbd_state val) +/* Finish a state change through ___end_state_change() with no completion; no unlocking happens here. */ enum drbd_state_rv end_state_change_locked(struct drbd_resource *resource, const char *tag) { - union drbd_state os, ns; - unsigned long flags; - enum drbd_state_rv rv; - - if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &device->flags)) - return SS_CW_SUCCESS; + return ___end_state_change(resource, NULL, SS_SUCCESS, tag); +} - if (test_and_clear_bit(CL_ST_CHG_FAIL, &device->flags)) - return SS_CW_FAILED_BY_PEER; +/* Lock with state_change_lock() and start a state change. */ void begin_state_change(struct drbd_resource *resource, unsigned long *irq_flags, enum chg_state_flags flags) +{ + state_change_lock(resource, irq_flags, flags); + __begin_state_change(resource); +} - spin_lock_irqsave(&device->resource->req_lock, flags); - os = drbd_read_state(device); - ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); - rv = is_valid_transition(os, ns); - if (rv >= SS_SUCCESS) - rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ +/* Finish a state change and unlock. With CS_WAIT_COMPLETE (and neither CS_PREPARE nor CS_ABORT) an on-stack completion is handed to ___end_state_change(); it is passed on to the unlock helper only when the change succeeded. */ static enum drbd_state_rv __end_state_change(struct drbd_resource *resource, + unsigned long *irq_flags, + enum drbd_state_rv rv, + const char *tag) +{ + enum chg_state_flags flags = resource->state_change_flags; + struct completion __done, *done = NULL; - if (!cl_wide_st_chg(device, os, ns)) - rv = SS_CW_NO_NEED; - if (rv == SS_UNKNOWN_ERROR) { - rv = is_valid_state(device, ns); - if (rv >= SS_SUCCESS) { - rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); - if (rv >= SS_SUCCESS) - rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail.
- */ - } + if ((flags & CS_WAIT_COMPLETE) && !(flags & (CS_PREPARE | CS_ABORT))) { + done = &__done; + init_completion(done); } - spin_unlock_irqrestore(&device->resource->req_lock, flags); - + rv = ___end_state_change(resource, done, rv, tag); + __state_change_unlock(resource, irq_flags, rv >= SS_SUCCESS ? done : NULL); return rv; } -/** - * drbd_req_state() - Perform an eventually cluster wide state change - * @device: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - * @f: flags - * - * Should not be called directly, use drbd_request_state() or - * _drbd_request_state(). - */ -static enum drbd_state_rv -drbd_req_state(struct drbd_device *device, union drbd_state mask, - union drbd_state val, enum chg_state_flags f) +/* Commit the pending state change (starting from SS_SUCCESS) and release the state lock. */ enum drbd_state_rv end_state_change(struct drbd_resource *resource, unsigned long *irq_flags, + const char *tag) { - struct completion done; - unsigned long flags; - union drbd_state os, ns; - enum drbd_state_rv rv; - void *buffer = NULL; - - init_completion(&done); - - if (f & CS_SERIALIZE) - mutex_lock(device->state_mutex); - if (f & CS_INHIBIT_MD_IO) - buffer = drbd_md_get_buffer(device, __func__); - - spin_lock_irqsave(&device->resource->req_lock, flags); - os = drbd_read_state(device); - ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); - rv = is_valid_transition(os, ns); - if (rv < SS_SUCCESS) { - spin_unlock_irqrestore(&device->resource->req_lock, flags); - goto abort; - } + return __end_state_change(resource, irq_flags, SS_SUCCESS, tag); +} - if (cl_wide_st_chg(device, os, ns)) { - rv = is_valid_state(device, ns); - if (rv == SS_SUCCESS) - rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); - spin_unlock_irqrestore(&device->resource->req_lock, flags); +/* Abort the pending change: run the end path with SS_UNKNOWN_ERROR as the result and CS_VERBOSE cleared, then unlock. */ void abort_state_change(struct drbd_resource *resource, unsigned long *irq_flags) +{ + resource->state_change_flags &= ~CS_VERBOSE; + __end_state_change(resource, irq_flags, SS_UNKNOWN_ERROR, NULL); +} - if (rv
< SS_SUCCESS) { - if (f & CS_VERBOSE) - print_st_err(device, os, ns, rv); - goto abort; - } +/* Abort the pending change with CS_VERBOSE cleared; no unlocking happens here. */ void abort_state_change_locked(struct drbd_resource *resource) +{ + resource->state_change_flags &= ~CS_VERBOSE; + ___end_state_change(resource, NULL, SS_UNKNOWN_ERROR, NULL); +} - if (drbd_send_state_req(first_peer_device(device), mask, val)) { - rv = SS_CW_FAILED_BY_PEER; - if (f & CS_VERBOSE) - print_st_err(device, os, ns, rv); - goto abort; - } +/* Leave the RCU read section first, then drop the state_rwlock write lock. The name suggests this brackets the remote part of a state change (confirm at call sites). */ static void begin_remote_state_change(struct drbd_resource *resource, unsigned long *irq_flags) +{ + rcu_read_unlock(); + write_unlock_irqrestore(&resource->state_rwlock, *irq_flags); +} - wait_event(device->state_wait, - (rv = _req_st_cond(device, mask, val))); +/* Enter the RCU read section again, record @flags and call ___begin_state_change(). Does not take state_rwlock itself. */ static void __end_remote_state_change(struct drbd_resource *resource, enum chg_state_flags flags) +{ + rcu_read_lock(); + resource->state_change_flags = flags; + ___begin_state_change(resource); +} - if (rv < SS_SUCCESS) { - if (f & CS_VERBOSE) - print_st_err(device, os, ns, rv); - goto abort; - } - spin_lock_irqsave(&device->resource->req_lock, flags); - ns = apply_mask_val(drbd_read_state(device), mask, val); - rv = _drbd_set_state(device, ns, f, &done); - } else { - rv = _drbd_set_state(device, ns, f, &done); - } +/* Take the state_rwlock write lock, then continue with __end_remote_state_change(). */ static void end_remote_state_change(struct drbd_resource *resource, unsigned long *irq_flags, enum chg_state_flags flags) +{ + write_lock_irqsave(&resource->state_rwlock, *irq_flags); + __end_remote_state_change(resource, flags); +} - spin_unlock_irqrestore(&device->resource->req_lock, flags); +/* Call __clear_remote_state_change() under the state_rwlock write lock. */ void clear_remote_state_change(struct drbd_resource *resource) +{ + unsigned long irq_flags; - if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { - D_ASSERT(device, current != first_peer_device(device)->connection->worker.task); - wait_for_completion(&done); - } + write_lock_irqsave(&resource->state_rwlock, irq_flags); + __clear_remote_state_change(resource); + write_unlock_irqrestore(&resource->state_rwlock, irq_flags); +} -abort: - if (buffer) -
drbd_md_put_buffer(device); - if (f & CS_SERIALIZE) - mutex_unlock(device->state_mutex); +/* Build a union drbd_state that carries only the resource level values for @which (role and the suspend flags); the connection, disk and peer fields are filled with placeholders. */ static union drbd_state drbd_get_resource_state(struct drbd_resource *resource, enum which_state which) +{ + union drbd_state rv = { { + .conn = C_STANDALONE, /* really: undefined */ + /* (user_isp, peer_isp, and aftr_isp are undefined as well.) */ + .disk = D_UNKNOWN, /* really: undefined */ + .role = resource->role[which], + .peer = R_UNKNOWN, /* really: undefined */ + .susp = resource->susp_user[which] || resource->susp_quorum[which] || resource->susp_uuid[which], + .susp_nod = resource->susp_nod[which], + .susp_fen = is_suspended_fen(resource, which), + .pdsk = D_UNKNOWN, /* really: undefined */ + } }; return rv; } -/** - * _drbd_request_state() - Request a state change (with flags) - * @device: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - * @f: flags - * - * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE - * flag, or when logging of failed state change requests is not desired. - */ -enum drbd_state_rv -_drbd_request_state(struct drbd_device *device, union drbd_state mask, - union drbd_state val, enum chg_state_flags f) +/* Resource state for @which, extended with the disk state and quorum flag of @device. */ union drbd_state drbd_get_device_state(struct drbd_device *device, enum which_state which) { - enum drbd_state_rv rv; + union drbd_state rv = drbd_get_resource_state(device->resource, which); - wait_event(device->state_wait, - (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE); + rv.disk = device->disk_state[which]; + rv.quorum = device->have_quorum[which]; return rv; } -/* - * We grab drbd_md_get_buffer(), because we don't want to "fail" the disk while - * there is IO in-flight: the transition into D_FAILED for detach purposes - * may get misinterpreted as actual IO error in a confused endio function. - * - * We wrap it all into wait_event(), to retry in case the drbd_req_state() - * returns SS_IN_TRANSIENT_STATE. - * - * To avoid potential deadlock with e.g.
the receiver thread trying to grab - * drbd_md_get_buffer() while trying to get out of the "transient state", we - * need to grab and release the meta data buffer inside of that wait_event loop. - */ -static enum drbd_state_rv -request_detach(struct drbd_device *device) -{ - return drbd_req_state(device, NS(disk, D_FAILED), - CS_VERBOSE | CS_ORDERED | CS_INHIBIT_MD_IO); -} - -int drbd_request_detach_interruptible(struct drbd_device *device) +/* Device state for @which, extended with the resync suspend flags, the combined connection state, the peer role and the peer disk state of @peer_device. */ +union drbd_state drbd_get_peer_device_state(struct drbd_peer_device *peer_device, enum which_state which) { - int ret, rv; + struct drbd_connection *connection = peer_device->connection; + union drbd_state rv; - drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ - wait_event_interruptible(device->state_wait, - (rv = request_detach(device)) != SS_IN_TRANSIENT_STATE); - drbd_resume_io(device); - - ret = wait_event_interruptible(device->misc_wait, - device->state.disk != D_FAILED); - - if (rv == SS_IS_DISKLESS) - rv = SS_NOTHING_TO_DO; - if (ret) - rv = ERR_INTR; + rv = drbd_get_device_state(peer_device->device, which); + rv.user_isp = peer_device->resync_susp_user[which]; + rv.peer_isp = peer_device->resync_susp_peer[which]; + rv.aftr_isp = resync_susp_comb_dep(peer_device, which); + rv.conn = combined_conn_state(peer_device, which); + rv.peer = connection->peer_role[which]; + rv.pdsk = peer_device->disk_state[which]; return rv; } -enum drbd_state_rv -_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask, - union drbd_state val, enum chg_state_flags f) +/* Highest local disk state [NOW] among the devices reachable through @connection; D_DISKLESS if there are none. */ +enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection) { - enum drbd_state_rv rv; - - BUG_ON(f & CS_SERIALIZE); + enum drbd_disk_state disk_state = D_DISKLESS; + struct drbd_peer_device *peer_device; + int vnr; - wait_event_cmd(device->state_wait, - (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE, - mutex_unlock(device->state_mutex), - mutex_lock(device->state_mutex)); +
rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = max_t(enum drbd_disk_state, disk_state, device->disk_state[NOW]); + } + rcu_read_unlock(); - return rv; + return disk_state; } -static void print_st(struct drbd_device *device, const char *name, union drbd_state ns) +/* Highest peer disk state [NOW] among the peer devices of @connection; D_DISKLESS if there are none. */ +enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection) { - drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", - name, - drbd_conn_str(ns.conn), - drbd_role_str(ns.role), - drbd_role_str(ns.peer), - drbd_disk_str(ns.disk), - drbd_disk_str(ns.pdsk), - is_susp(ns) ? 's' : 'r', - ns.aftr_isp ? 'a' : '-', - ns.peer_isp ? 'p' : '-', - ns.user_isp ? 'u' : '-', - ns.susp_fen ? 'F' : '-', - ns.susp_nod ? 'N' : '-' - ); + enum drbd_disk_state disk_state = D_DISKLESS; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + disk_state = max_t(enum drbd_disk_state, disk_state, peer_device->disk_state[NOW]); + rcu_read_unlock(); + + return disk_state; } -void print_st_err(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum drbd_state_rv err) +/* True if any of the I/O suspend reasons differs between [OLD] and [NEW]. */ +static bool suspend_reason_changed(struct drbd_resource *resource) { - if (err == SS_IN_TRANSIENT_STATE) - return; - drbd_err(device, "State change failed: %s\n", drbd_set_st_err_str(err)); - print_st(device, " state", os); - print_st(device, "wanted", ns); + return resource->susp_user[OLD] != resource->susp_user[NEW] || + resource->susp_nod[OLD] != resource->susp_nod[NEW] || + resource->susp_quorum[OLD] != resource->susp_quorum[NEW] || + resource->susp_uuid[OLD] != resource->susp_uuid[NEW] || + is_suspended_fen(resource, OLD) != is_suspended_fen(resource, NEW); } -static long print_state_change(char *pb, union drbd_state os, union drbd_state ns, - enum chg_state_flags flags) +/* True if the resync with @peer_device is suspended in state @which: by the user, by the peer, or as reported by resync_susp_comb_dep(). */ +static bool resync_suspended(struct drbd_peer_device
*peer_device, enum which_state which) { - char *pbp; - pbp = pb; - *pbp = 0; - - if (ns.role != os.role && flags & CS_DC_ROLE) - pbp += sprintf(pbp, "role( %s -> %s ) ", - drbd_role_str(os.role), - drbd_role_str(ns.role)); - if (ns.peer != os.peer && flags & CS_DC_PEER) - pbp += sprintf(pbp, "peer( %s -> %s ) ", - drbd_role_str(os.peer), - drbd_role_str(ns.peer)); - if (ns.conn != os.conn && flags & CS_DC_CONN) - pbp += sprintf(pbp, "conn( %s -> %s ) ", - drbd_conn_str(os.conn), - drbd_conn_str(ns.conn)); - if (ns.disk != os.disk && flags & CS_DC_DISK) - pbp += sprintf(pbp, "disk( %s -> %s ) ", - drbd_disk_str(os.disk), - drbd_disk_str(ns.disk)); - if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK) - pbp += sprintf(pbp, "pdsk( %s -> %s ) ", - drbd_disk_str(os.pdsk), - drbd_disk_str(ns.pdsk)); - - return pbp - pb; + return peer_device->resync_susp_user[which] || + peer_device->resync_susp_peer[which] || + resync_susp_comb_dep(peer_device, which); } -static void drbd_pr_state_change(struct drbd_device *device, union drbd_state os, union drbd_state ns, - enum chg_state_flags flags) +/* Write the resync suspend reasons of @peer_device for state @which into @buffer as a comma separated list, or "no" if resync is not suspended. At most @size bytes are used. Returns the number of characters written, not counting the terminating NUL. */ +static int scnprintf_resync_suspend_flags(char *buffer, size_t size, + struct drbd_peer_device *peer_device, + enum which_state which) { - char pb[300]; - char *pbp = pb; + struct drbd_device *device = peer_device->device; + char *b = buffer, *end = buffer + size; + + if (!resync_suspended(peer_device, which)) + return scnprintf(buffer, size, "no"); - pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK); + if (peer_device->resync_susp_user[which]) + b += scnprintf(b, end - b, "user,"); + if (peer_device->resync_susp_peer[which]) + b += scnprintf(b, end - b, "peer,"); + if (peer_device->resync_susp_dependency[which]) + b += scnprintf(b, end - b, "after dependency,"); + if (peer_device->resync_susp_other_c[which]) + b += scnprintf(b, end - b, "connection dependency,"); + if (is_sync_source_state(peer_device, which) && device->disk_state[which] <= D_INCONSISTENT) + b += scnprintf(b, end - b,
"disk inconsistent,"); - if (ns.aftr_isp != os.aftr_isp) - pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", - os.aftr_isp, - ns.aftr_isp); - if (ns.peer_isp != os.peer_isp) - pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", - os.peer_isp, - ns.peer_isp); - if (ns.user_isp != os.user_isp) - pbp += sprintf(pbp, "user_isp( %d -> %d ) ", - os.user_isp, - ns.user_isp); + /* Strip the trailing comma. If nothing was written (no space left in + * @buffer), b still equals buffer and stepping back would write one + * byte in front of it. */ + if (b > buffer) + *(--b) = 0; - if (pbp != pb) - drbd_info(device, "%s\n", pb); + return b - buffer; } -static void conn_pr_state_change(struct drbd_connection *connection, union drbd_state os, union drbd_state ns, - enum chg_state_flags flags) +/* Write the I/O suspend reasons of @resource for state @which into @buffer as a comma separated list, or "no" if I/O is not suspended. At most @size bytes are used. Returns the number of characters written, not counting the terminating NUL. */ +static int scnprintf_io_suspend_flags(char *buffer, size_t size, + struct drbd_resource *resource, + enum which_state which) { - char pb[300]; - char *pbp = pb; - - pbp += print_state_change(pbp, os, ns, flags); - - if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP) - pbp += sprintf(pbp, "susp( %d -> %d ) ", - is_susp(os), - is_susp(ns)); - - if (pbp != pb) - drbd_info(connection, "%s\n", pb); + char *b = buffer, *end = buffer + size; + + if (!resource_is_suspended(resource, which)) + return scnprintf(buffer, size, "no"); + + if (resource->susp_user[which]) + b += scnprintf(b, end - b, "user,"); + if (resource->susp_nod[which]) + b += scnprintf(b, end - b, "no-disk,"); + if (is_suspended_fen(resource, which)) + b += scnprintf(b, end - b, "fencing,"); + if (resource->susp_quorum[which]) + b += scnprintf(b, end - b, "quorum,"); + if (resource->susp_uuid[which]) + b += scnprintf(b, end - b, "uuid,"); + /* Strip the trailing comma, but never step in front of @buffer. */ + if (b > buffer) + *(--b) = 0; + + return b - buffer; } - -/** - * is_valid_state() - Returns an SS_ error code if ns is not valid - * @device: DRBD device. - * @ns: State to consider.
- */ -static enum drbd_state_rv -is_valid_state(struct drbd_device *device, union drbd_state ns) +/* Log every value that differs between [OLD] and [NEW]: one drbd_info() line for the resource and one for each connection, device and peer device that has changes. Each line starts with @prefix and ends with the optional @tag in square brackets. */ static void print_state_change(struct drbd_resource *resource, const char *prefix, const char *tag) { - /* See drbd_state_sw_errors in drbd_strings.c */ - - enum drbd_fencing_p fp; - enum drbd_state_rv rv = SS_SUCCESS; - struct net_conf *nc; + char buffer[150], *b, *end = buffer + sizeof(buffer); + struct drbd_connection *connection; + struct drbd_device *device; + enum drbd_role *role = resource->role; + bool *fail_io = resource->fail_io; + int vnr; - rcu_read_lock(); - fp = FP_DONT_CARE; - if (get_ldev(device)) { - fp = rcu_dereference(device->ldev->disk_conf)->fencing; - put_ldev(device); + b = buffer; + if (role[OLD] != role[NEW]) + b += scnprintf(b, end - b, "role( %s -> %s ) ", + drbd_role_str(role[OLD]), + drbd_role_str(role[NEW])); + if (suspend_reason_changed(resource)) { + b += scnprintf(b, end - b, "susp-io( "); + b += scnprintf_io_suspend_flags(b, end - b, resource, OLD); + b += scnprintf(b, end - b, " -> "); + b += scnprintf_io_suspend_flags(b, end - b, resource, NEW); + b += scnprintf(b, end - b, " ) "); + } + if (fail_io[OLD] != fail_io[NEW]) + b += scnprintf(b, end - b, "force-io-failures( %s -> %s ) ", + fail_io[OLD] ? "yes" : "no", + fail_io[NEW] ? "yes" : "no"); + if (b != buffer) { + /* overwrite the last character written, normally the trailing space */ *(b-1) = 0; + drbd_info(resource, "%s%s%s%s%s\n", prefix, buffer, + tag ? " [" : "", tag ?: "", tag ?
"]" : ""); } - nc = rcu_dereference(first_peer_device(device)->connection->net_conf); - if (nc) { - if (!nc->two_primaries && ns.role == R_PRIMARY) { - if (ns.peer == R_PRIMARY) - rv = SS_TWO_PRIMARIES; - else if (conn_highest_peer(first_peer_device(device)->connection) == R_PRIMARY) - rv = SS_O_VOL_PEER_PRI; + for_each_connection(connection, resource) { + enum drbd_conn_state *cstate = connection->cstate; + enum drbd_role *peer_role = connection->peer_role; + + b = buffer; + if (cstate[OLD] != cstate[NEW]) + b += scnprintf(b, end - b, "conn( %s -> %s ) ", + drbd_conn_str(cstate[OLD]), + drbd_conn_str(cstate[NEW])); + if (peer_role[OLD] != peer_role[NEW]) + b += scnprintf(b, end - b, "peer( %s -> %s ) ", + drbd_role_str(peer_role[OLD]), + drbd_role_str(peer_role[NEW])); + + if (b != buffer) { + *(b-1) = 0; + drbd_info(connection, "%s%s%s%s%s\n", prefix, buffer, + tag ? " [" : "", tag ?: "", tag ? "]" : ""); } } - if (rv <= 0) - goto out; /* already found a reason to abort */ - else if (ns.role == R_SECONDARY && device->open_cnt) - rv = SS_DEVICE_IN_USE; + idr_for_each_entry(&resource->devices, device, vnr) { + struct drbd_peer_device *peer_device; + enum drbd_disk_state *disk_state = device->disk_state; + bool *have_quorum = device->have_quorum; + + b = buffer; + if (disk_state[OLD] != disk_state[NEW]) + b += scnprintf(b, end - b, "disk( %s -> %s ) ", + drbd_disk_str(disk_state[OLD]), + drbd_disk_str(disk_state[NEW])); + if (have_quorum[OLD] != have_quorum[NEW]) + b += scnprintf(b, end - b, "quorum( %s -> %s ) ", + have_quorum[OLD] ? "yes" : "no", + have_quorum[NEW] ? "yes" : "no"); + if (b != buffer) { + *(b-1) = 0; + drbd_info(device, "%s%s%s%s%s\n", prefix, buffer, + tag ? " [" : "", tag ?: "", tag ?
"]" : ""); + } - else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) - rv = SS_NO_UP_TO_DATE_DISK; + for_each_peer_device(peer_device, device) { + enum drbd_disk_state *peer_disk_state = peer_device->disk_state; + enum drbd_repl_state *repl_state = peer_device->repl_state; + bool *replication = peer_device->replication; + bool *peer_replication = peer_device->peer_replication; + + b = buffer; + if (peer_disk_state[OLD] != peer_disk_state[NEW]) + b += scnprintf(b, end - b, "pdsk( %s -> %s ) ", + drbd_disk_str(peer_disk_state[OLD]), + drbd_disk_str(peer_disk_state[NEW])); + if (repl_state[OLD] != repl_state[NEW]) + b += scnprintf(b, end - b, "repl( %s -> %s ) ", + drbd_repl_str(repl_state[OLD]), + drbd_repl_str(repl_state[NEW])); + + if (resync_suspended(peer_device, OLD) != + resync_suspended(peer_device, NEW)) { + b += scnprintf(b, end - b, "resync-susp( "); + b += scnprintf_resync_suspend_flags(b, end - b, peer_device, OLD); + b += scnprintf(b, end - b, " -> "); + b += scnprintf_resync_suspend_flags(b, end - b, peer_device, NEW); + b += scnprintf(b, end - b, " ) "); + } - else if (fp >= FP_RESOURCE && - ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) - rv = SS_PRIMARY_NOP; + if (replication[OLD] != replication[NEW]) + b += scnprintf(b, end - b, "replication( %s -> %s ) ", + replication[OLD] ? "yes" : "no", + replication[NEW] ? "yes" : "no"); - else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) - rv = SS_NO_UP_TO_DATE_DISK; + if (peer_replication[OLD] != peer_replication[NEW]) + b += scnprintf(b, end - b, "peer_replication( %s -> %s ) ", + peer_replication[OLD] ? "yes" : "no", + peer_replication[NEW] ? "yes" : "no"); - else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) - rv = SS_NO_LOCAL_DISK; + if (b != buffer) { + *(b-1) = 0; + drbd_info(peer_device, "%s%s%s%s%s\n", prefix, buffer, + tag ? " [" : "", tag ?: "", tag ?
"]" : ""); + } + } + } +} - else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) - rv = SS_NO_REMOTE_DISK; +/* Evaluated on the [NEW] state. As Primary: true only while some peer is D_UP_TO_DATE and in L_WF_BITMAP_T. Otherwise: true if no Primary peer has a replication state above L_OFF, true in the diskless primary case below, and else true only if some peer is in one of the target side replication states listed in the switch. */ static bool local_disk_may_be_outdated(struct drbd_device *device) +{ + struct drbd_peer_device *peer_device; - else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) - rv = SS_NO_UP_TO_DATE_DISK; + if (device->resource->role[NEW] == R_PRIMARY) { + for_each_peer_device(peer_device, device) { + if (peer_device->disk_state[NEW] == D_UP_TO_DATE && + peer_device->repl_state[NEW] == L_WF_BITMAP_T) + return true; + } + return false; + } - else if ((ns.conn == C_CONNECTED || - ns.conn == C_WF_BITMAP_S || - ns.conn == C_SYNC_SOURCE || - ns.conn == C_PAUSED_SYNC_S) && - ns.disk == D_OUTDATED) - rv = SS_CONNECTED_OUTDATES; + for_each_peer_device(peer_device, device) { + if (peer_device->connection->peer_role[NEW] == R_PRIMARY && + peer_device->repl_state[NEW] > L_OFF) + goto have_primary_neighbor; + } - else if (nc && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - (nc->verify_alg[0] == 0)) - rv = SS_NO_VERIFY_ALG; + return true; /* No neighbor primary, I might be outdated */ - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - first_peer_device(device)->connection->agreed_pro_version < 88) - rv = SS_NOT_SUPPORTED; +have_primary_neighbor: + /* Allow self outdating while connecting to a diskless primary.
*/ + if (peer_device->disk_state[NEW] == D_DISKLESS && + peer_device->repl_state[OLD] == L_OFF && peer_device->repl_state[NEW] == L_ESTABLISHED) + return true; - else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) - rv = SS_NO_UP_TO_DATE_DISK; + for_each_peer_device(peer_device, device) { + enum drbd_repl_state repl_state = peer_device->repl_state[NEW]; + switch (repl_state) { + case L_WF_BITMAP_S: + case L_STARTING_SYNC_S: + case L_SYNC_SOURCE: + case L_PAUSED_SYNC_S: + case L_AHEAD: + case L_ESTABLISHED: + case L_VERIFY_S: + case L_VERIFY_T: + case L_OFF: + continue; + case L_WF_SYNC_UUID: + case L_WF_BITMAP_T: + case L_STARTING_SYNC_T: + case L_SYNC_TARGET: + case L_PAUSED_SYNC_T: + case L_BEHIND: + return true; + } + } - else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && - ns.pdsk == D_UNKNOWN) - rv = SS_NEED_CONNECTION; + return false; +} - else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) - rv = SS_CONNECTED_OUTDATES; +/* Turn the quorum @setting into a node count: QOU_MAJORITY yields more than half of @voters, QOU_ALL yields @voters, any other value is returned unchanged. */ static int calc_quorum_at(s32 setting, int voters) +{ + int quorum_at; -out: - rcu_read_unlock(); + switch (setting) { + case QOU_MAJORITY: + quorum_at = voters / 2 + 1; + break; + case QOU_ALL: + quorum_at = voters; + break; + default: + quorum_at = setting; + } - return rv; + return quorum_at; } -/** - * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible - * This function limits state transitions that may be declined by DRBD. I.e. - * user requests (aka soft transitions). - * @os: old state. - * @ns: new state. - * @connection: DRBD connection.
- */ -static enum drbd_state_rv -is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_connection *connection) +/* Tally quorum details into @qd by walking all node ids and the per peer flags in the local meta data, using the [NEW] disk and replication states. Dereferences device->ldev without taking a reference here, so the caller presumably guarantees an attached disk (confirm at call sites). */ static void __calc_quorum_with_disk(struct drbd_device *device, struct quorum_detail *qd) { - enum drbd_state_rv rv = SS_SUCCESS; - - if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && - os.conn > C_CONNECTED) - rv = SS_RESYNC_RUNNING; + struct drbd_resource *resource = device->resource; + const u64 quorumless_nodes = device->have_quorum[NOW] ? ~resource->members : 0; + const int my_node_id = resource->res_opts.node_id; + int node_id; - if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) - rv = SS_ALREADY_STANDALONE; + check_wrongly_set_mdf_exists(device); - if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) - rv = SS_IS_DISKLESS; + rcu_read_lock(); + for (node_id = 0; node_id < DRBD_NODE_ID_MAX; node_id++) { + struct drbd_peer_md *peer_md = &device->ldev->md.peers[node_id]; + struct drbd_peer_device *peer_device; + enum drbd_disk_state disk_state; + enum drbd_repl_state repl_state; + bool is_intentional_diskless, is_tiebreaker; + struct net_conf *nc; + + /* the local node: counted by its own disk state only */ if (node_id == my_node_id) { + disk_state = device->disk_state[NEW]; + if (disk_state > D_DISKLESS) { + if (disk_state == D_UP_TO_DATE) + qd->up_to_date++; + else + qd->present++; + } + continue; + } - if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) - rv = SS_NO_NET_CONFIG; + /* Ignore non existing nodes. + Note: a fresh (before connected once), intentional diskless peer + gets ignored as well by this. + A fresh diskful peer counts!
(since it has MDF_HAVE_BITMAP) */ + if (!(peer_md->flags & (MDF_HAVE_BITMAP | MDF_NODE_EXISTS | MDF_PEER_DEVICE_SEEN))) + continue; - if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) - rv = SS_LOWER_THAN_OUTDATED; + peer_device = peer_device_by_node_id(device, node_id); - if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) - rv = SS_IN_TRANSIENT_STATE; + if (peer_device) { + is_intentional_diskless = !want_bitmap(peer_device); + nc = rcu_dereference(peer_device->connection->transport.net_conf); + is_tiebreaker = rcu_dereference(peer_device->conf)->peer_tiebreaker; + if (nc && !nc->allow_remote_read) { + dynamic_drbd_dbg(peer_device, + "Excluding from quorum calculation because allow-remote-read = no\n"); + continue; + } + } else { + is_intentional_diskless = !(peer_md->flags & MDF_PEER_DEVICE_SEEN); + is_tiebreaker = true; + } - /* While establishing a connection only allow cstate to change. - Delay/refuse role changes, detach attach etc... (they do not touch cstate) */ - if (test_bit(STATE_SENT, &connection->flags) && - !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) || - (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS))) - rv = SS_IN_TRANSIENT_STATE; + /* skip intentionally diskless peers unless they act as tiebreaker */ if (is_intentional_diskless && !is_tiebreaker) + continue; - /* Do not promote during resync handshake triggered by "force primary". - * This is a hack. It should really be rejected by the peer during the - * cluster wide state change request. */ - if (os.role != R_PRIMARY && ns.role == R_PRIMARY - && ns.pdsk == D_UP_TO_DATE - && ns.disk != D_UP_TO_DATE && ns.disk != D_DISKLESS - && (ns.conn <= C_WF_SYNC_UUID || ns.conn != os.conn)) - rv = SS_IN_TRANSIENT_STATE; + /* without a peer_device object fall back to L_OFF and D_UNKNOWN */ repl_state = peer_device ? peer_device->repl_state[NEW] : L_OFF; + disk_state = peer_device ?
peer_device->disk_state[NEW] : D_UNKNOWN; + + if (repl_state == L_OFF) { + if (is_intentional_diskless) + /* device should be diskless but is absent */ + qd->missing_diskless++; + else if (disk_state <= D_OUTDATED || peer_md->flags & MDF_PEER_OUTDATED) + qd->outdated++; + else if (NODE_MASK(node_id) & quorumless_nodes) + qd->quorumless++; + else + qd->unknown++; + } else { + if (disk_state == D_DISKLESS && is_intentional_diskless) + qd->diskless++; + else if (disk_state == D_UP_TO_DATE) + qd->up_to_date++; + else + qd->present++; + } + } + rcu_read_unlock(); +} - if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) - rv = SS_NEED_CONNECTION; +static void __calc_quorum_no_disk(struct drbd_device *device, struct quorum_detail *qd) +{ + struct drbd_resource *resource = device->resource; + const u64 quorumless_nodes = device->have_quorum[NOW] ? ~resource->members : 0; + struct drbd_peer_device *peer_device; + bool is_intentional_diskless; + + if (device->disk_state[NEW] == D_DISKLESS) { + /* We only want to consider ourselves as a diskless node when + * we actually intended to be diskless in the config. Otherwise, + * we shouldn't get a vote in the quorum process, so count + * ourselves as unknown.
*/ + if (device->device_conf.intentional_diskless) + qd->diskless++; + else + qd->unknown++; + } - if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - ns.conn != os.conn && os.conn > C_CONNECTED) - rv = SS_RESYNC_RUNNING; + rcu_read_lock(); + for_each_peer_device_rcu(peer_device, device) { + enum drbd_disk_state disk_state; + enum drbd_repl_state repl_state; + struct net_conf *nc; + bool is_tiebreaker; + + repl_state = peer_device->repl_state[NEW]; + disk_state = peer_device->disk_state[NEW]; + + is_intentional_diskless = !want_bitmap(peer_device); + nc = rcu_dereference(peer_device->connection->transport.net_conf); + is_tiebreaker = rcu_dereference(peer_device->conf)->peer_tiebreaker; + if (nc && !nc->allow_remote_read) { + dynamic_drbd_dbg(peer_device, + "Excluding from quorum calculation because allow-remote-read = no\n"); + continue; + } + if (is_intentional_diskless && !is_tiebreaker) + continue; - if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && - os.conn < C_CONNECTED) - rv = SS_NEED_CONNECTION; + if (repl_state == L_OFF) { + if (is_intentional_diskless) + /* device should be diskless but is absent */ + qd->missing_diskless++; + else if (disk_state <= D_OUTDATED) + qd->outdated++; + else if (NODE_MASK(peer_device->node_id) & quorumless_nodes) + qd->quorumless++; + else + qd->unknown++; + } else { + if (disk_state == D_DISKLESS && is_intentional_diskless) + qd->diskless++; + else if (disk_state == D_UP_TO_DATE) + qd->up_to_date++; + else + qd->present++; + } - if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) - && os.conn < C_WF_REPORT_PARAMS) - rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... 
*/ + if (disk_state == D_UP_TO_DATE && test_bit(PEER_QUORATE, &peer_device->flags)) + qd->quorate_peers++; - if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED && - os.conn < C_CONNECTED && os.pdsk > D_OUTDATED) - rv = SS_OUTDATE_WO_CONN; - return rv; + } + rcu_read_unlock(); } -static enum drbd_state_rv -is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc) +static bool calc_quorum(struct drbd_device *device, struct quorum_info *qi) { - /* no change -> nothing to do, at least for the connection part */ - if (oc == nc) - return SS_NOTHING_TO_DO; - - /* disconnect of an unconfigured connection does not make sense */ - if (oc == C_STANDALONE && nc == C_DISCONNECTING) - return SS_ALREADY_STANDALONE; + struct drbd_resource *resource = device->resource; + int voters, quorum_at, diskless_majority_at, min_redundancy_at; + struct quorum_detail qd = {}; + bool have_quorum; - /* from C_STANDALONE, we start with C_UNCONNECTED */ - if (oc == C_STANDALONE && nc != C_UNCONNECTED) - return SS_NEED_CONNECTION; + if (device->disk_state[NEW] > D_ATTACHING && get_ldev_if_state(device, D_ATTACHING)) { + __calc_quorum_with_disk(device, &qd); + put_ldev(device); + } else { + __calc_quorum_no_disk(device, &qd); + } - /* When establishing a connection we need to go through WF_REPORT_PARAMS! - Necessary to do the right thing upon invalidate-remote on a disconnected resource */ - if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED) - return SS_NEED_CONNECTION; + /* Check if a partition containing all missing nodes might have quorum */ + voters = qd.outdated + qd.quorumless + qd.unknown + qd.up_to_date + qd.present; + quorum_at = calc_quorum_at(resource->res_opts.quorum, voters); + if (qd.outdated + qd.quorumless + qd.unknown >= quorum_at) { + /* when the missing nodes have the quorum, give up the quorumless */ + qd.unknown += qd.quorumless; + qd.quorumless = 0; + } - /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. 
*/ - if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING) - return SS_IN_TRANSIENT_STATE; + /* When all the absent nodes are D_OUTDATED (no one D_UNKNOWN), we can be + sure that the other partition is not able to promote. -> + We remove them from the voters. -> We have quorum */ + if (qd.unknown) + voters = qd.outdated + qd.quorumless + qd.unknown + qd.up_to_date + qd.present; + else + voters = qd.up_to_date + qd.present; + + quorum_at = calc_quorum_at(resource->res_opts.quorum, voters); + diskless_majority_at = calc_quorum_at(QOU_MAJORITY, qd.diskless + qd.missing_diskless); + min_redundancy_at = calc_quorum_at(resource->res_opts.quorum_min_redundancy, voters); + + if (qi) { + qi->voters = voters; + qi->up_to_date = qd.up_to_date; + qi->present = qd.present; + qi->quorum_at = quorum_at; + qi->min_redundancy_at = min_redundancy_at; + } - /* After C_DISCONNECTING only C_STANDALONE may follow */ - if (oc == C_DISCONNECTING && nc != C_STANDALONE) - return SS_IN_TRANSIENT_STATE; + have_quorum = qd.quorate_peers || + ((qd.up_to_date + qd.present) >= quorum_at && qd.up_to_date >= min_redundancy_at); + + if (!have_quorum && voters != 0 && voters % 2 == 0 && qd.up_to_date + qd.present == quorum_at - 1 && + /* It is an even number of nodes (think 2) and we failed by one vote. + Check if we have a majority of the diskless nodes connected. + Use the diskless nodes as a tie-breaker; this only keeps a quorum we already have (have_quorum[NOW]). */ + qd.diskless >= diskless_majority_at && device->have_quorum[NOW]) { + have_quorum = true; + if (!test_bit(TIEBREAKER_QUORUM, &device->flags)) { + set_bit(TIEBREAKER_QUORUM, &device->flags); + drbd_info(device, "Would lose quorum, but using tiebreaker logic to keep\n"); + } + } else { + clear_bit(TIEBREAKER_QUORUM, &device->flags); + } - return SS_SUCCESS; + return have_quorum; } +static __printf(2, 3) void _drbd_state_err(struct change_context *context, const char *fmt, ...)
+{ + struct drbd_resource *resource = context->resource; + const char *err_str; + va_list args; + + va_start(args, fmt); + err_str = kvasprintf(GFP_ATOMIC, fmt, args); + va_end(args); + if (!err_str) + return; + if (context->flags & CS_VERBOSE) + drbd_err(resource, "%s\n", err_str); + + if (context->err_str) + *context->err_str = err_str; + else + kfree(err_str); +} + +static __printf(2, 3) void drbd_state_err(struct drbd_resource *resource, const char *fmt, ...) +{ + const char *err_str; + va_list args; + + va_start(args, fmt); + err_str = kvasprintf(GFP_ATOMIC, fmt, args); + va_end(args); + if (!err_str) + return; + if (resource->state_change_flags & CS_VERBOSE) + drbd_err(resource, "%s\n", err_str); + + if (resource->state_change_err_str) + *resource->state_change_err_str = err_str; + else + kfree(err_str); +} + +static enum drbd_state_rv __is_valid_soft_transition(struct drbd_resource *resource) +{ + enum drbd_role *role = resource->role; + bool *fail_io = resource->fail_io; + struct drbd_connection *connection; + struct drbd_device *device; + bool in_handshake = false; + int vnr; + + /* See drbd_state_sw_errors in drbd_strings.c */ + + if (role[OLD] != R_PRIMARY && role[NEW] == R_PRIMARY) { + for_each_connection_rcu(connection, resource) { + struct net_conf *nc; + + nc = rcu_dereference(connection->transport.net_conf); + if (!nc || nc->two_primaries) + continue; + if (connection->peer_role[NEW] == R_PRIMARY) + return SS_TWO_PRIMARIES; + } + } + + for_each_connection_rcu(connection, resource) { + struct drbd_peer_device *peer_device; + + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + if (test_bit(HOLDING_UUID_READ_LOCK, &peer_device->flags) && + peer_device->repl_state[NOW] == L_OFF) { + in_handshake = true; + goto handshake_found; + } + } + } +handshake_found: + + if (in_handshake && role[OLD] != role[NEW]) + return SS_IN_TRANSIENT_STATE; + + if (role[OLD] == R_SECONDARY && role[NEW] == R_PRIMARY && fail_io[NEW]) + return SS_DEVICE_IN_USE; 
+ + for_each_connection_rcu(connection, resource) { + enum drbd_conn_state *cstate = connection->cstate; + enum drbd_role *peer_role = connection->peer_role; + struct net_conf *nc; + bool two_primaries; + + if (cstate[NEW] == C_DISCONNECTING && cstate[OLD] == C_STANDALONE) + return SS_ALREADY_STANDALONE; + + if (cstate[NEW] == C_CONNECTING && cstate[OLD] < C_UNCONNECTED) + return SS_NO_NET_CONFIG; + + if (cstate[NEW] == C_DISCONNECTING && cstate[OLD] == C_UNCONNECTED) + return SS_IN_TRANSIENT_STATE; + + nc = rcu_dereference(connection->transport.net_conf); + two_primaries = nc ? nc->two_primaries : false; + if (peer_role[NEW] == R_PRIMARY && peer_role[OLD] != R_PRIMARY && !two_primaries) { + if (role[NOW] == R_PRIMARY) + return SS_TWO_PRIMARIES; + if (!fail_io[NEW]) { + idr_for_each_entry(&resource->devices, device, vnr) { + if (!device->writable && device->open_cnt) + return SS_PRIMARY_READER; + /* + * One might be tempted to add "|| open_rw_cont" here. + * That is wrong. The promotion of a rw opener will be + * handled in its own two-phase commit. + * Returning SS_PRIMARY_READER for a rw_opener might + * cause confusion for the caller, if that then waits + * for the read-only openers to go away.
+ */ + } + } + } + } + + idr_for_each_entry(&resource->devices, device, vnr) { + enum drbd_disk_state *disk_state = device->disk_state; + struct drbd_peer_device *peer_device; + bool any_disk_up_to_date[2]; + enum which_state which; + int nr_negotiating = 0; + + if (in_handshake && + ((disk_state[OLD] < D_ATTACHING && disk_state[NEW] == D_ATTACHING) || + (disk_state[OLD] > D_DETACHING && disk_state[NEW] == D_DETACHING))) + return SS_IN_TRANSIENT_STATE; + + if (role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY && device->writable && + !(resource->state_change_flags & CS_FS_IGN_OPENERS)) + return SS_DEVICE_IN_USE; + + if (disk_state[NEW] > D_ATTACHING && disk_state[OLD] == D_DISKLESS) + return SS_IS_DISKLESS; + + if (disk_state[NEW] == D_OUTDATED && disk_state[OLD] < D_OUTDATED && + disk_state[OLD] != D_ATTACHING && disk_state[OLD] != D_NEGOTIATING) { + /* Do not allow outdate of inconsistent or diskless. + But we have to allow Inconsistent -> Outdated if a resync + finishes over one connection, and is paused on other connections */ + + for_each_peer_device_rcu(peer_device, device) { + enum drbd_repl_state *repl_state = peer_device->repl_state; + if (repl_state[OLD] == L_SYNC_TARGET && repl_state[NEW] == L_ESTABLISHED) + goto allow; + } + return SS_LOWER_THAN_OUTDATED; + } + allow: + + for (which = OLD; which <= NEW; which++) + any_disk_up_to_date[which] = drbd_data_accessible(device, which); + + /* Prevent becoming primary while there is no data accessible + and prevent detach or disconnect while primary */ + if (!(role[OLD] == R_PRIMARY && !any_disk_up_to_date[OLD]) && + (role[NEW] == R_PRIMARY && !any_disk_up_to_date[NEW])) + return SS_NO_UP_TO_DATE_DISK; + + /* Prevent detach or disconnect while held open read only */ + if (!device->writable && device->open_cnt && + any_disk_up_to_date[OLD] && !any_disk_up_to_date[NEW]) + return SS_NO_UP_TO_DATE_DISK; + + if (disk_state[NEW] == D_NEGOTIATING) + nr_negotiating++; + + /* Prevent promote when there is no quorum
and + * prevent graceful disconnect/detach that would kill quorum + */ + if ((role[OLD] == R_SECONDARY || device->have_quorum[OLD]) && + role[NEW] == R_PRIMARY && !device->have_quorum[NEW]) { + struct quorum_info qi; + + calc_quorum(device, &qi); + + if (disk_state[NEW] <= D_ATTACHING) + drbd_state_err(resource, "no UpToDate peer with quorum"); + else if (qi.up_to_date + qi.present < qi.quorum_at) + drbd_state_err(resource, "%d of %d nodes visible, need %d for quorum", + qi.up_to_date + qi.present, qi.voters, qi.quorum_at); + else if (qi.up_to_date < qi.min_redundancy_at) + drbd_state_err(resource, "%d of %d nodes up_to_date, need %d for " + "quorum-minimum-redundancy", + qi.up_to_date, qi.voters, qi.min_redundancy_at); + return SS_NO_QUORUM; + } + + for_each_peer_device_rcu(peer_device, device) { + enum drbd_disk_state *peer_disk_state = peer_device->disk_state; + enum drbd_repl_state *repl_state = peer_device->repl_state; + + if (peer_disk_state[NEW] == D_NEGOTIATING) + nr_negotiating++; + + if (nr_negotiating > 1) + return SS_IN_TRANSIENT_STATE; + + if (peer_device->connection->fencing_policy >= FP_RESOURCE && + !(role[OLD] == R_PRIMARY && repl_state[OLD] < L_ESTABLISHED && !(peer_disk_state[OLD] <= D_OUTDATED)) && + (role[NEW] == R_PRIMARY && repl_state[NEW] < L_ESTABLISHED && !(peer_disk_state[NEW] <= D_OUTDATED))) + return SS_PRIMARY_NOP; + + if (!(repl_state[OLD] > L_ESTABLISHED && disk_state[OLD] < D_INCONSISTENT) && + (repl_state[NEW] > L_ESTABLISHED && disk_state[NEW] < D_INCONSISTENT)) + return SS_NO_LOCAL_DISK; + + if (!(repl_state[OLD] > L_ESTABLISHED && peer_disk_state[OLD] < D_INCONSISTENT) && + (repl_state[NEW] > L_ESTABLISHED && peer_disk_state[NEW] < D_INCONSISTENT)) + return SS_NO_REMOTE_DISK; + + if (disk_state[OLD] > D_OUTDATED && disk_state[NEW] == D_OUTDATED && + !local_disk_may_be_outdated(device)) + return SS_CONNECTED_OUTDATES; + + if (!(repl_state[OLD] == L_VERIFY_S || repl_state[OLD] == L_VERIFY_T) && + (repl_state[NEW] == L_VERIFY_S || 
repl_state[NEW] == L_VERIFY_T)) { + struct net_conf *nc = rcu_dereference(peer_device->connection->transport.net_conf); + + if (!nc || nc->verify_alg[0] == 0) + return SS_NO_VERIFY_ALG; + } + + if (!(repl_state[OLD] == L_VERIFY_S || repl_state[OLD] == L_VERIFY_T) && + (repl_state[NEW] == L_VERIFY_S || repl_state[NEW] == L_VERIFY_T) && + peer_device->connection->agreed_pro_version < 88) + return SS_NOT_SUPPORTED; + + if (repl_is_sync_source(repl_state[OLD]) && + repl_state[NEW] == L_WF_BITMAP_S) + return SS_RESYNC_RUNNING; + + if (repl_is_sync_target(repl_state[OLD]) && + repl_state[NEW] == L_WF_BITMAP_T) + return SS_RESYNC_RUNNING; + + if (repl_state[NEW] != repl_state[OLD] && + (repl_state[NEW] == L_STARTING_SYNC_T || repl_state[NEW] == L_STARTING_SYNC_S) && + repl_state[OLD] > L_ESTABLISHED) + return SS_RESYNC_RUNNING; + + if ((repl_state[NEW] == L_VERIFY_S || repl_state[NEW] == L_VERIFY_T) && repl_state[OLD] < L_ESTABLISHED) + return SS_NEED_CONNECTION; + + if ((repl_state[NEW] == L_VERIFY_S || repl_state[NEW] == L_VERIFY_T) && + repl_state[NEW] != repl_state[OLD] && repl_state[OLD] > L_ESTABLISHED) + return SS_RESYNC_RUNNING; + + if ((repl_state[NEW] == L_STARTING_SYNC_S || repl_state[NEW] == L_STARTING_SYNC_T) && + repl_state[OLD] < L_ESTABLISHED) + return SS_NEED_CONNECTION; + + if ((repl_state[NEW] == L_SYNC_TARGET || repl_state[NEW] == L_SYNC_SOURCE) + && repl_state[OLD] < L_OFF) + return SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ + + if ((peer_disk_state[NEW] > D_DISKLESS && peer_disk_state[NEW] != D_UNKNOWN) && + peer_disk_state[OLD] == D_DISKLESS && !want_bitmap(peer_device)) + return SS_ATTACH_NO_BITMAP; /* peer with --bitmap=no wants to attach ??? */ + } + } + + return SS_SUCCESS; +} + +/** + * is_valid_soft_transition() - Returns an SS_ error code if state[NEW] is not valid + * + * "Soft" transitions are voluntary state changes which drbd may decline, such + * as a user request to promote a resource to primary.
Opposed to that are + * involuntary or "hard" transitions like a network connection loss. + * + * When deciding if a "soft" transition should be allowed, "hard" transitions + * may already have forced the resource into a critical state. It may take + * several "soft" transitions to get the resource back to normal. To allow + * those, rather than checking if the desired new state is valid, we can only + * check if the desired new state is "at least as good" as the current state. + * + * @resource: DRBD resource + */ +static enum drbd_state_rv is_valid_soft_transition(struct drbd_resource *resource) +{ + enum drbd_state_rv rv; + + rcu_read_lock(); + rv = __is_valid_soft_transition(resource); + rcu_read_unlock(); + + return rv; +} + +static enum drbd_state_rv +is_valid_conn_transition(enum drbd_conn_state oc, enum drbd_conn_state nc) +{ + /* no change -> nothing to do, at least for the connection part */ + if (oc == nc) + return SS_NOTHING_TO_DO; + + /* disconnect of an unconfigured connection does not make sense */ + if (oc == C_STANDALONE && nc == C_DISCONNECTING) + return SS_ALREADY_STANDALONE; + + /* from C_STANDALONE, we start with C_UNCONNECTED */ + if (oc == C_STANDALONE && nc != C_UNCONNECTED) + return SS_NEED_CONNECTION; + + /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */ + if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING) + return SS_IN_TRANSIENT_STATE; + + /* After C_DISCONNECTING only C_STANDALONE may follow */ + if (oc == C_DISCONNECTING && nc != C_STANDALONE) + return SS_IN_TRANSIENT_STATE; + + return SS_SUCCESS; +} + + +/** + * is_valid_transition() - Returns an SS_ error code if the state transition is not possible + * This limits hard state transitions. Hard state transitions are facts that are + * imposed on DRBD by the environment. E.g. disk broke or network broke down. + * But those hard state transitions are still not allowed to do everything. + * @resource: DRBD resource.
+ */ +static enum drbd_state_rv is_valid_transition(struct drbd_resource *resource) +{ + enum drbd_state_rv rv; + struct drbd_connection *connection; + struct drbd_device *device; + int vnr; + + for_each_connection(connection, resource) { + rv = is_valid_conn_transition(connection->cstate[OLD], connection->cstate[NEW]); + if (rv < SS_SUCCESS) + return rv; + } + + idr_for_each_entry(&resource->devices, device, vnr) { + /* we cannot fail (again) if we already detached */ + if ((device->disk_state[NEW] == D_FAILED || device->disk_state[NEW] == D_DETACHING) && + device->disk_state[OLD] == D_DISKLESS) { + return SS_IS_DISKLESS; + } + } + + return SS_SUCCESS; +} + +static bool is_sync_target_other_c(struct drbd_peer_device *ign_peer_device) +{ + struct drbd_device *device = ign_peer_device->device; + struct drbd_peer_device *peer_device; + + for_each_peer_device(peer_device, device) { + enum drbd_repl_state r; + + if (peer_device == ign_peer_device) + continue; + + r = peer_device->repl_state[NEW]; + if (r == L_SYNC_TARGET || r == L_PAUSED_SYNC_T) + return true; + } + + return false; +} + +static void drbd_start_other_targets_paused(struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + struct drbd_peer_device *p; + + for_each_peer_device(p, device) { + if (p == peer_device) + continue; + + if (p->disk_state[NEW] >= D_INCONSISTENT && p->repl_state[NEW] == L_ESTABLISHED) + p->repl_state[NEW] = L_PAUSED_SYNC_T; + } +} + +static bool drbd_is_sync_target_candidate(struct drbd_peer_device *peer_device) +{ + if (!repl_is_sync_target(peer_device->repl_state[NEW])) + return false; + + if (peer_device->resync_susp_dependency[NEW] || + peer_device->resync_susp_peer[NEW] || + peer_device->resync_susp_user[NEW]) + return false; + + if (peer_device->disk_state[NEW] < D_OUTDATED) + return false; + + return true; + +} + +static void drbd_select_sync_target(struct drbd_device *device) +{ + struct drbd_peer_device *peer_device; + struct
drbd_peer_device *target_current = NULL; + struct drbd_peer_device *target_active = NULL; + struct drbd_peer_device *target_desired = NULL; + + /* Find current and active resync peers. */ + for_each_peer_device_rcu(peer_device, device) { + if (peer_device->repl_state[OLD] == L_SYNC_TARGET && drbd_is_sync_target_candidate(peer_device)) + target_current = peer_device; + + if (peer_device->resync_active[NEW]) + target_active = peer_device; + } + + /* Choose desired resync peer. */ + for_each_peer_device_rcu(peer_device, device) { + if (!drbd_is_sync_target_candidate(peer_device)) + continue; + + if (target_desired && drbd_bm_total_weight(peer_device) > drbd_bm_total_weight(target_desired)) + continue; + + target_desired = peer_device; + } + + /* Keep current resync target if the alternative has less than 1MiB + * storage (256 bits) less to resync. */ + if (target_current && target_desired && + drbd_bm_total_weight(target_current) < drbd_bm_total_weight(target_desired) + 256UL) + target_desired = target_current; + + /* Do not activate/unpause a resync if some other is still active. */ + if (target_desired && target_active && target_desired != target_active) + target_desired = NULL; + + /* Activate resync (if not already active). */ + if (target_desired) + target_desired->resync_active[NEW] = true; + + /* Make sure that the targets are correctly paused/unpaused. */ + for_each_peer_device_rcu(peer_device, device) { + enum drbd_repl_state *repl_state = peer_device->repl_state; + + peer_device->resync_susp_other_c[NEW] = target_desired && peer_device != target_desired; + + if (!repl_is_sync_target(repl_state[NEW])) + continue; + + peer_device->repl_state[NEW] = peer_device == target_desired ? 
L_SYNC_TARGET : L_PAUSED_SYNC_T; + } +} + +static bool drbd_change_to_inconsistent(enum drbd_disk_state *disk_state, + enum drbd_conn_state *cstate) +{ + return !(disk_state[OLD] == D_INCONSISTENT && cstate[OLD] == C_CONNECTED) && + (disk_state[NEW] == D_INCONSISTENT && cstate[NEW] == C_CONNECTED); +} + +static void sanitize_state(struct drbd_resource *resource) +{ + enum drbd_role *role = resource->role; + struct drbd_connection *connection; + struct drbd_device *device; + bool maybe_crashed_primary = false; + bool volume_lost_data_access = false; + bool volumes_have_data_access = true; + bool resource_has_quorum = true; + int connected_primaries = 0; + int vnr; + + rcu_read_lock(); + for_each_connection_rcu(connection, resource) { + enum drbd_conn_state *cstate = connection->cstate; + + if (cstate[NEW] < C_CONNECTED) + connection->peer_role[NEW] = R_UNKNOWN; + + if (connection->peer_role[OLD] == R_PRIMARY && cstate[OLD] == C_CONNECTED && + ((cstate[NEW] >= C_TIMEOUT && cstate[NEW] <= C_PROTOCOL_ERROR) || + (cstate[NEW] == C_DISCONNECTING && resource->state_change_flags & CS_HARD))) + /* implies also C_BROKEN_PIPE and C_NETWORK_FAILURE */ + maybe_crashed_primary = true; + + if (connection->peer_role[NEW] == R_PRIMARY) + connected_primaries++; + } + + idr_for_each_entry(&resource->devices, device, vnr) { + struct drbd_peer_device *peer_device; + enum drbd_disk_state *disk_state = device->disk_state; + bool lost_connection = false; + bool have_good_peer = false; + + if (disk_state[OLD] == D_DISKLESS && disk_state[NEW] == D_DETACHING) + disk_state[NEW] = D_DISKLESS; + + if ((resource->state_change_flags & CS_IGN_OUTD_FAIL) && + disk_state[OLD] < D_OUTDATED && disk_state[NEW] == D_OUTDATED) + disk_state[NEW] = disk_state[OLD]; + + if (disk_state[NEW] == D_NEGOTIATING) { + int all = 0, target = 0, no_result = 0; + bool up_to_date_neighbor = false; + + if (disk_state[OLD] != D_NEGOTIATING) { + for_each_peer_device_rcu(peer_device, device) + 
peer_device->negotiation_result = L_NEGOTIATING; + } + + for_each_peer_device_rcu(peer_device, device) { + enum drbd_repl_state repl_state = peer_device->repl_state[NEW]; + enum drbd_repl_state nr = peer_device->negotiation_result; + enum drbd_disk_state pdsk = peer_device->disk_state[NEW]; + + if (pdsk < D_NEGOTIATING || repl_state == L_OFF) + continue; + + if (pdsk == D_UP_TO_DATE) + up_to_date_neighbor = true; + + all++; + if (nr == L_NEG_NO_RESULT) + no_result++; + else if (nr == L_NEGOTIATING) + goto stay_negotiating; + else if (nr == L_WF_BITMAP_T) + target++; + else if (nr != L_ESTABLISHED && nr != L_WF_BITMAP_S) + drbd_err(peer_device, "Unexpected nr = %s\n", drbd_repl_str(nr)); + } + + /* negotiation finished */ + if (no_result > 0 && no_result == all) + disk_state[NEW] = D_DETACHING; + else if (target) + disk_state[NEW] = D_INCONSISTENT; + else + disk_state[NEW] = up_to_date_neighbor ? D_UP_TO_DATE : + /* ldev_safe: dstate */ disk_state_from_md(device); + + for_each_peer_device_rcu(peer_device, device) { + enum drbd_repl_state nr = peer_device->negotiation_result; + + if (peer_device->connection->cstate[NEW] < C_CONNECTED || + nr == L_NEGOTIATING) + continue; + + if (nr == L_NEG_NO_RESULT) + nr = L_ESTABLISHED; + + if (nr == L_WF_BITMAP_S && disk_state[NEW] == D_INCONSISTENT) { + /* Should be sync source for one peer and sync + target for an other peer. 
Delay the sync source + role */ + nr = L_PAUSED_SYNC_S; + peer_device->resync_susp_other_c[NEW] = true; + drbd_warn(peer_device, "Finish me\n"); + } + peer_device->repl_state[NEW] = nr; + } + } + stay_negotiating: + + for_each_peer_device_rcu(peer_device, device) { + enum drbd_repl_state *repl_state = peer_device->repl_state; + enum drbd_disk_state *peer_disk_state = peer_device->disk_state; + struct drbd_connection *connection = peer_device->connection; + enum drbd_conn_state *cstate = connection->cstate; + + if (peer_disk_state[NEW] == D_UP_TO_DATE && + (device->exposed_data_uuid & ~UUID_PRIMARY) == + (peer_device->current_uuid & ~UUID_PRIMARY)) + have_good_peer = true; + + if (repl_state[NEW] < L_ESTABLISHED) { + peer_device->resync_susp_peer[NEW] = false; + if (peer_disk_state[NEW] > D_UNKNOWN || + peer_disk_state[NEW] < D_INCONSISTENT) + peer_disk_state[NEW] = D_UNKNOWN; + } + if (repl_state[OLD] >= L_ESTABLISHED && repl_state[NEW] < L_ESTABLISHED) { + lost_connection = true; + peer_device->resync_active[NEW] = false; + } + + /* Clear the aftr_isp when becoming unconfigured */ + if (cstate[NEW] == C_STANDALONE && + disk_state[NEW] == D_DISKLESS && + role[NEW] == R_SECONDARY) + peer_device->resync_susp_dependency[NEW] = false; + + /* Abort resync if a disk fails/detaches */ + if (repl_state[NEW] > L_ESTABLISHED && + (disk_state[NEW] <= D_FAILED || + peer_disk_state[NEW] <= D_FAILED)) { + repl_state[NEW] = L_ESTABLISHED; + clear_bit(RECONCILIATION_RESYNC, &peer_device->flags); + peer_device->resync_active[NEW] = false; + } + + /* Suspend IO while fence-peer handler runs (peer lost) */ + if (connection->fencing_policy == FP_STONITH && + (role[NEW] == R_PRIMARY && + repl_state[NEW] < L_ESTABLISHED && + peer_disk_state[NEW] == D_UNKNOWN) && + (role[OLD] != R_PRIMARY || + peer_disk_state[OLD] != D_UNKNOWN)) + connection->susp_fen[NEW] = true; + } + + drbd_select_sync_target(device); + + for_each_peer_device_rcu(peer_device, device) { + enum drbd_repl_state 
*repl_state = peer_device->repl_state; + enum drbd_disk_state *peer_disk_state = peer_device->disk_state; + struct drbd_connection *connection = peer_device->connection; + enum drbd_conn_state *cstate = connection->cstate; + enum drbd_disk_state min_disk_state, max_disk_state; + enum drbd_disk_state min_peer_disk_state, max_peer_disk_state; + enum drbd_role *peer_role = connection->peer_role; + bool uuids_match, cond; + + /* Pause a SyncSource until it finishes resync as target on other connections */ + if (repl_state[OLD] != L_SYNC_SOURCE && repl_state[NEW] == L_SYNC_SOURCE && + is_sync_target_other_c(peer_device)) + peer_device->resync_susp_other_c[NEW] = true; + + if (resync_suspended(peer_device, NEW)) { + if (repl_state[NEW] == L_SYNC_SOURCE) + repl_state[NEW] = L_PAUSED_SYNC_S; + } else { + if (repl_state[NEW] == L_PAUSED_SYNC_S) + repl_state[NEW] = L_SYNC_SOURCE; + } + + /* Implication of the repl state on other peer's repl state */ + if (repl_state[OLD] != L_STARTING_SYNC_T && repl_state[NEW] == L_STARTING_SYNC_T) + drbd_start_other_targets_paused(peer_device); + + /* D_CONSISTENT vanish when we get connected (pre 9.0) */ + if (connection->agreed_pro_version < 110 && + repl_state[NEW] >= L_ESTABLISHED && repl_state[NEW] < L_AHEAD) { + if (disk_state[NEW] == D_CONSISTENT) + disk_state[NEW] = D_UP_TO_DATE; + if (peer_disk_state[NEW] == D_CONSISTENT) + peer_disk_state[NEW] = D_UP_TO_DATE; + } + + /* Implications of the repl state on the disk states */ + min_disk_state = D_DISKLESS; + max_disk_state = D_UP_TO_DATE; + min_peer_disk_state = D_INCONSISTENT; + max_peer_disk_state = D_UNKNOWN; + switch (repl_state[NEW]) { + case L_OFF: + /* values from above */ + break; + case L_WF_BITMAP_T: + case L_STARTING_SYNC_T: + case L_WF_SYNC_UUID: + case L_BEHIND: + min_disk_state = D_INCONSISTENT; + max_disk_state = D_OUTDATED; + min_peer_disk_state = D_INCONSISTENT; + max_peer_disk_state = D_UP_TO_DATE; + break; + case L_VERIFY_S: + case L_VERIFY_T: + min_disk_state = 
D_INCONSISTENT; + max_disk_state = D_UP_TO_DATE; + min_peer_disk_state = D_INCONSISTENT; + max_peer_disk_state = D_UP_TO_DATE; + break; + case L_ESTABLISHED: + min_disk_state = D_DISKLESS; + max_disk_state = D_UP_TO_DATE; + min_peer_disk_state = D_DISKLESS; + max_peer_disk_state = D_UP_TO_DATE; + break; + case L_WF_BITMAP_S: + case L_PAUSED_SYNC_S: + case L_STARTING_SYNC_S: + case L_AHEAD: + min_disk_state = D_INCONSISTENT; + max_disk_state = D_UP_TO_DATE; + min_peer_disk_state = D_INCONSISTENT; + max_peer_disk_state = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/ + break; + case L_PAUSED_SYNC_T: + case L_SYNC_TARGET: + min_disk_state = D_INCONSISTENT; + max_disk_state = D_INCONSISTENT; + min_peer_disk_state = D_INCONSISTENT; + max_peer_disk_state = D_UP_TO_DATE; + break; + case L_SYNC_SOURCE: + min_disk_state = D_INCONSISTENT; + max_disk_state = D_UP_TO_DATE; + min_peer_disk_state = D_INCONSISTENT; + max_peer_disk_state = D_INCONSISTENT; + break; + } + + /* Implications of the repl state on the disk states */ + if (disk_state[NEW] > max_disk_state) + disk_state[NEW] = max_disk_state; + + if (disk_state[NEW] < min_disk_state) + disk_state[NEW] = min_disk_state; + + if (peer_disk_state[NEW] > max_peer_disk_state) + peer_disk_state[NEW] = max_peer_disk_state; + + if (peer_disk_state[NEW] < min_peer_disk_state) + peer_disk_state[NEW] = min_peer_disk_state; + + /* A detach is a cluster wide transaction. The peer_disk_state updates + are coming in while we have it prepared. When the cluster wide + state change gets committed prevent D_DISKLESS -> D_FAILED */ + if (peer_disk_state[OLD] == D_DISKLESS && + (peer_disk_state[NEW] == D_FAILED || peer_disk_state[NEW] == D_DETACHING)) + peer_disk_state[NEW] = D_DISKLESS; + + /* Upgrade myself from D_OUTDATED if.. 
+ 1) We connect to stable D_UP_TO_DATE(or D_CONSISTENT) peer without resync + 2) The peer just became stable + 3) the peer was stable and just became D_UP_TO_DATE */ + if (repl_state[NEW] == L_ESTABLISHED && disk_state[NEW] == D_OUTDATED && + peer_disk_state[NEW] >= D_CONSISTENT && test_bit(UUIDS_RECEIVED, &peer_device->flags) && + peer_device->uuid_flags & UUID_FLAG_STABLE && + (repl_state[OLD] < L_ESTABLISHED || + peer_device->uuid_flags & UUID_FLAG_GOT_STABLE || + peer_disk_state[OLD] == D_OUTDATED)) + disk_state[NEW] = peer_disk_state[NEW]; + + /* The attempted resync made us D_OUTDATED, roll that back in case */ + if (repl_state[OLD] == L_WF_BITMAP_T && repl_state[NEW] == L_OFF && + disk_state[NEW] == D_OUTDATED && stable_up_to_date_neighbor(device) && + /* ldev_safe: repl_state[OLD] */ may_be_up_to_date(device, NEW)) + disk_state[NEW] = D_UP_TO_DATE; + + /* clause intentional here, the D_CONSISTENT form above might trigger this */ + if (repl_state[OLD] < L_ESTABLISHED && repl_state[NEW] >= L_ESTABLISHED && + disk_state[NEW] == D_CONSISTENT && + /* ldev_safe: repl_state[NEW] */ may_be_up_to_date(device, NEW)) + disk_state[NEW] = D_UP_TO_DATE; + + /* Follow a neighbor that goes from D_CONSISTENT TO D_UP_TO_DATE */ + if (disk_state[NEW] == D_CONSISTENT && + peer_disk_state[OLD] == D_CONSISTENT && + peer_disk_state[NEW] == D_UP_TO_DATE && + peer_device->uuid_flags & UUID_FLAG_STABLE) + disk_state[NEW] = D_UP_TO_DATE; + + peer_device->uuid_flags &= ~UUID_FLAG_GOT_STABLE; + + uuids_match = + (peer_device->current_uuid & ~UUID_PRIMARY) == + (drbd_current_uuid(device) & ~UUID_PRIMARY); + + if (peer_role[OLD] == R_UNKNOWN && peer_role[NEW] == R_PRIMARY && + peer_disk_state[NEW] == D_DISKLESS && disk_state[NEW] >= D_NEGOTIATING) { + /* Got connected to a diskless primary */ + if (uuids_match && !is_sync_target_other_c(peer_device)) { + if (device->disk_state[NOW] < D_UP_TO_DATE) { + drbd_info(peer_device, "Upgrading local disk to D_UP_TO_DATE since current UUID 
matches.\n"); + disk_state[NEW] = D_UP_TO_DATE; + } + } else { + set_bit(TRY_TO_GET_RESYNC, &device->flags); + if (disk_state[NEW] == D_UP_TO_DATE) { + drbd_info(peer_device, "Downgrading local disk to D_CONSISTENT since current UUID differs.\n"); + disk_state[NEW] = D_CONSISTENT; + /* This is a "safety net"; it can only happen if fencing and quorum + are both disabled. This alone would be racy, look for + "Do not trust this guy!" (see also may_return_to_up_to_date()) */ + } + } + } + + if (connection->agreed_features & DRBD_FF_RS_SKIP_UUID) + cond = have_good_peer && + (device->exposed_data_uuid & ~UUID_PRIMARY) != + (peer_device->current_uuid & ~UUID_PRIMARY); + else + cond = peer_disk_state[OLD] == D_UNKNOWN && + role[NEW] == R_PRIMARY && !uuids_match; + + if (disk_state[NEW] == D_DISKLESS && peer_disk_state[NEW] == D_UP_TO_DATE && + cond) { + /* Do not trust this guy! + He wants to be D_UP_TO_DATE, but has a different current + UUID. Do not accept him as D_UP_TO_DATE but downgrade that to + D_CONSISTENT here. + */ + peer_disk_state[NEW] = D_CONSISTENT; + } + + /* + * Determine whether peer will disable replication due to this transition. + * + * This matches the condition on the peer below. + */ + if (drbd_change_to_inconsistent(disk_state, cstate) || + (!repl_is_sync_target(repl_state[OLD]) && + repl_is_sync_target(repl_state[NEW]))) + peer_device->peer_replication[NEW] = + test_bit(PEER_REPLICATION_NEXT, &peer_device->flags); + + /* + * Decide whether to disable replication when the peer + * transitions to Inconsistent. Only consider the disk + * state when we are Connected because we want to wait + * until we know whether replication should be enabled + * on the next transition to Inconsistent. This is + * communicated with the P_ENABLE_REPLICATION_NEXT + * packet. + * + * Also re-evaluate whether to disable replication when + * we become SyncSource, even when the peer's disk was + * already Inconsistent. 
This is relevant when + * switching between Ahead-Behind+Inconsistent and + * SyncSource-SyncTarget. + * + * This matches the condition on the peer above. + */ + if (drbd_change_to_inconsistent(peer_disk_state, cstate) || + (!repl_is_sync_source(repl_state[OLD]) && + repl_is_sync_source(repl_state[NEW]))) + peer_device->replication[NEW] = + test_bit(REPLICATION_NEXT, &peer_device->flags); + + /* + * Not strictly necessary, since "replication" is only + * considered when the peer disk is Inconsistent, but + * it makes the logs clearer. + */ + if (peer_disk_state[OLD] == D_INCONSISTENT && + peer_disk_state[NEW] != D_INCONSISTENT) + peer_device->replication[NEW] = true; + } + + if (resource->res_opts.quorum != QOU_OFF) + device->have_quorum[NEW] = calc_quorum(device, NULL); + else + device->have_quorum[NEW] = true; + + if (!device->have_quorum[NEW] && disk_state[NEW] == D_UP_TO_DATE && + test_bit(RESTORE_QUORUM, &device->flags)) { + device->have_quorum[NEW] = true; + set_bit(RESTORING_QUORUM, &device->flags); + } + + if (!device->have_quorum[NEW]) + resource_has_quorum = false; + + /* Suspend IO if we have no accessible data available. + * Policy may be extended later to be able to suspend + * if redundancy falls below a certain level. 
*/ + if (role[NEW] == R_PRIMARY && !drbd_data_accessible(device, NEW)) { + volumes_have_data_access = false; + if (role[OLD] != R_PRIMARY || drbd_data_accessible(device, OLD)) + volume_lost_data_access = true; + } + + if (lost_connection && disk_state[NEW] == D_NEGOTIATING) + disk_state[NEW] = /* ldev_safe: disk_state */ disk_state_from_md(device); + + if (maybe_crashed_primary && !connected_primaries && + disk_state[NEW] == D_UP_TO_DATE && role[NOW] == R_SECONDARY) + disk_state[NEW] = D_CONSISTENT; + } + rcu_read_unlock(); + + if (volumes_have_data_access) + resource->susp_nod[NEW] = false; + if (volume_lost_data_access && resource->res_opts.on_no_data == OND_SUSPEND_IO) + resource->susp_nod[NEW] = true; + + resource->susp_quorum[NEW] = + resource->res_opts.on_no_quorum == ONQ_SUSPEND_IO ? !resource_has_quorum : false; + + if (!resource->susp_uuid[OLD] && + resource_is_suspended(resource, OLD) && !resource_is_suspended(resource, NEW)) { + idr_for_each_entry(&resource->devices, device, vnr) { + if (test_bit(NEW_CUR_UUID, &device->flags)) { + resource->susp_uuid[NEW] = true; + break; + } + } + } + + if (role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY && + (resource->state_change_flags & CS_FS_IGN_OPENERS)) { + int rw_count, ro_count; + drbd_open_counts(resource, &rw_count, &ro_count); + if (rw_count) + resource->fail_io[NEW] = true; + } +} + +void drbd_resume_al(struct drbd_device *device) +{ + if (test_and_clear_bit(AL_SUSPENDED, &device->flags)) + drbd_info(device, "Resumed AL updates\n"); +} + +static bool drbd_need_twopc_after_lost_peer(struct drbd_connection *connection) +{ + enum drbd_conn_state *cstate = connection->cstate; + + /* Is the state change a disconnect? */ + if (!(cstate[OLD] == C_CONNECTED && cstate[NEW] < C_CONNECTED)) + return false; + + /* + * The peer did not provide reachable_nodes when disconnecting, so + * trigger a twopc ourselves. 
+ */ + if (!(connection->agreed_features & DRBD_FF_2PC_V2)) + return true; + + /* Trigger a twopc if it was a non-graceful disconnect. */ + return cstate[NEW] != C_TEAR_DOWN; +} + +static void drbd_schedule_empty_twopc(struct drbd_resource *resource) +{ + kref_get(&resource->kref); + if (!schedule_work(&resource->empty_twopc)) { + kref_put(&resource->kref, drbd_destroy_resource); + } +} + +/* + * We cache a node mask of the online members of the cluster. It might + * be off because a node is still marked as online immediately after + * it crashes. That means it might have an online mark for an already + * offline node. On the other hand, we guarantee that it never has + * a zero for an online node. + */ +static void update_members(struct drbd_resource *resource) +{ + enum chg_state_flags flags = resource->state_change_flags; + struct twopc_reply *reply = &resource->twopc_reply; + const int my_node_id = resource->res_opts.node_id; + struct drbd_connection *connection; + + /* in case we initiated 2PC we know the reachable nodes */ + if (flags & CS_TWOPC && reply->initiator_node_id == my_node_id) { + resource->members = reply->reachable_nodes; + return; + } + + /* In case I am 2PC target of a connect or non-graceful disconnect */ + for_each_connection(connection, resource) { + enum drbd_conn_state *cstate = connection->cstate; + const u64 peer_node_mask = NODE_MASK(connection->peer_node_id); /* u64 like the other NODE_MASK() users; int could truncate the mask */ + + /* add a fresh connection to the members */ + if (cstate[OLD] < C_CONNECTED && cstate[NEW] == C_CONNECTED) + resource->members |= peer_node_mask; + + /* Connection to peer lost.
Check if we should remove it from the members */ + if (drbd_need_twopc_after_lost_peer(connection) && + resource->members & peer_node_mask) + drbd_schedule_empty_twopc(resource); + } +} + +static bool drbd_any_peer_device_up_to_date(struct drbd_connection *connection) +{ + int vnr; + struct drbd_peer_device *peer_device; + + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + if (peer_device->disk_state[NEW] == D_UP_TO_DATE) + return true; + } + + return false; +} + +/* Whether replication is enabled on all peers for this device */ +bool drbd_all_peer_replication(struct drbd_device *device, enum which_state which) +{ + struct drbd_peer_device *peer_device; + bool all_peer_replication = true; + + rcu_read_lock(); + for_each_peer_device_rcu(peer_device, device) { + if (!peer_device->peer_replication[which]) + all_peer_replication = false; + } + rcu_read_unlock(); + + return all_peer_replication; +} + +/* As drbd_all_peer_replication() but takes a state change object */ +static bool drbd_all_peer_replication_change(struct drbd_state_change *state_change, int n_device, + enum which_state which) +{ + int n_connection; + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[ + n_device * state_change->n_connections + n_connection]; + + if (!peer_device_state_change->peer_replication[which]) + return false; + } + + return true; +} + +static void drbd_determine_flush_pending(struct drbd_resource *resource) +{ + struct drbd_device *device; + struct drbd_connection *primary_connection; + struct drbd_connection *up_to_date_connection; + int vnr; + bool send_flush_requests = false; + + /* Clear any bits if we no longer expect or require a flush ack */ + spin_lock(&resource->initiator_flush_lock); + for_each_connection(primary_connection, resource) { + u64 *pending_flush_mask = &primary_connection->pending_flush_mask; + + /* + * Clear 
bits if we no longer expect or require a flush ack due + * to loss of connection to the Primary peer. + */ + if (primary_connection->cstate[NEW] != C_CONNECTED) { + if (*pending_flush_mask) + *pending_flush_mask = 0; + continue; + } + + /* + * Clear bits if we no longer expect or require a flush ack + * because the peer that was UpToDate is no longer UpToDate. + * For instance, if we lose the connection to that peer. + */ + for_each_connection(up_to_date_connection, resource) { + u64 up_to_date_mask = NODE_MASK(up_to_date_connection->peer_node_id); + + if (drbd_any_peer_device_up_to_date(up_to_date_connection)) + continue; + + if (*pending_flush_mask & up_to_date_mask) + *pending_flush_mask &= ~up_to_date_mask; + } + } + spin_unlock(&resource->initiator_flush_lock); + + /* Check if we need a new flush */ + idr_for_each_entry(&resource->devices, device, vnr) { + struct drbd_peer_device *peer_device; + + for_each_peer_device(peer_device, device) { + if (!(is_sync_target_state(peer_device, NOW) && + drbd_all_peer_replication(device, NOW)) && + is_sync_target_state(peer_device, NEW) && + drbd_all_peer_replication(device, NEW)) + send_flush_requests = true; + } + } + + if (!send_flush_requests) + return; + + /* We need a new flush. Mark which acks we are waiting for. 
*/ + spin_lock(&resource->initiator_flush_lock); + resource->current_flush_sequence++; + + for_each_connection(primary_connection, resource) { + primary_connection->pending_flush_mask = 0; + + if (primary_connection->peer_role[NEW] != R_PRIMARY) + continue; + + if (primary_connection->agreed_pro_version < 123) + continue; + + for_each_connection(up_to_date_connection, resource) { + u64 up_to_date_mask = NODE_MASK(up_to_date_connection->peer_node_id); + + if (!drbd_any_peer_device_up_to_date(up_to_date_connection)) + continue; + + if (up_to_date_connection->agreed_pro_version < 123) + continue; + + primary_connection->pending_flush_mask |= up_to_date_mask; + } + } + spin_unlock(&resource->initiator_flush_lock); +} + +static void set_ov_position(struct drbd_peer_device *peer_device, + enum drbd_repl_state repl_state) +{ + struct drbd_device *device = peer_device->device; + struct drbd_bitmap *bm = device->bitmap; + + if (peer_device->connection->agreed_pro_version < 90) + peer_device->ov_start_sector = 0; + peer_device->rs_total = drbd_bm_bits(device); + peer_device->ov_position = 0; + if (repl_state == L_VERIFY_T) { + /* starting online verify from an arbitrary position + * does not fit well into the existing protocol. 
+ * on L_VERIFY_T, we initialize ov_left and friends + * implicitly in receive_common_data_request once the + * first P_OV_REQUEST is received */ + peer_device->ov_start_sector = ~(sector_t)0; + } else { + unsigned long bit = bm_sect_to_bit(bm, peer_device->ov_start_sector); + if (bit >= peer_device->rs_total) { + peer_device->ov_start_sector = + bm_bit_to_sect(bm, peer_device->rs_total - 1); + peer_device->rs_total = 1; + } else + peer_device->rs_total -= bit; + peer_device->ov_position = peer_device->ov_start_sector; + } + atomic64_set(&peer_device->ov_left, peer_device->rs_total); + peer_device->ov_skipped = 0; +} + +static void initialize_resync_progress_marks(struct drbd_peer_device *peer_device) +{ + unsigned long tw = drbd_bm_total_weight(peer_device); + unsigned long now = jiffies; + int i; + + peer_device->rs_last_progress_report_ts = now; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + peer_device->rs_mark_left[i] = tw; + peer_device->rs_mark_time[i] = now; + } +} + +static void initialize_resync(struct drbd_peer_device *peer_device) +{ + unsigned long tw = drbd_bm_total_weight(peer_device); + unsigned long now = jiffies; + + peer_device->last_in_sync_end = 0; + peer_device->resync_next_bit = 0; + peer_device->last_resync_pass_bits = tw; + peer_device->rs_failed = 0; + peer_device->rs_paused = 0; + peer_device->rs_same_csum = 0; + peer_device->rs_total = tw; + peer_device->rs_start = now; + peer_device->rs_last_writeout = now; + initialize_resync_progress_marks(peer_device); + drbd_rs_controller_reset(peer_device); +} + +/* Is there a primary with access to up to date data known */ +static bool primary_and_data_present(struct drbd_device *device) +{ + bool up_to_date_data = device->disk_state[NEW] == D_UP_TO_DATE; + struct drbd_resource *resource = device->resource; + bool primary = resource->role[NEW] == R_PRIMARY; + struct drbd_peer_device *peer_device; + + for_each_peer_device(peer_device, device) { + struct drbd_connection *connection = 
peer_device->connection; + + /* Do not consider the peer if we are disconnecting. */ + if (resource->remote_state_change && + drbd_twopc_between_peer_and_me(connection) && + resource->twopc_reply.is_disconnect) + continue; + + if (connection->peer_role[NEW] == R_PRIMARY) + primary = true; + + if (peer_device->disk_state[NEW] == D_UP_TO_DATE) + up_to_date_data = true; + } + + return primary && up_to_date_data; +} + +static bool extra_ldev_ref_for_after_state_chg(enum drbd_disk_state *disk_state) +{ + return (disk_state[OLD] != D_FAILED && disk_state[NEW] == D_FAILED) || + (disk_state[OLD] != D_DETACHING && disk_state[NEW] == D_DETACHING) || + (disk_state[OLD] != D_DISKLESS && disk_state[NEW] == D_DISKLESS); +} + +static bool has_starting_resyncs(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + if (peer_device->repl_state[NEW] > L_ESTABLISHED) + return true; + } + return false; +} + +static bool should_try_become_up_to_date(struct drbd_device *device, enum drbd_disk_state *disk_state, + enum which_state which) +{ + return disk_state[OLD] == D_UP_TO_DATE && disk_state[NEW] == D_CONSISTENT && + may_return_to_up_to_date(device, which); +} + +/** + * finish_state_change - carry out actions triggered by a state change + * @resource: DRBD resource. + * @tag: State change tag to print in status messages.
+ */ +static void finish_state_change(struct drbd_resource *resource, const char *tag) +{ + enum drbd_role *role = resource->role; + bool *susp_uuid = resource->susp_uuid; + struct drbd_device *device; + struct drbd_connection *connection; + bool starting_resync = false; + bool start_new_epoch = false; + bool lost_a_primary_peer = false; + bool some_peer_is_primary = false; + bool some_peer_request_in_flight = false; + bool resource_suspended[2]; + bool unfreeze_io = false; + int vnr; + + print_state_change(resource, "", tag); + + resource_suspended[OLD] = resource_is_suspended(resource, OLD); + resource_suspended[NEW] = resource_is_suspended(resource, NEW); + + idr_for_each_entry(&resource->devices, device, vnr) { + bool *have_quorum = device->have_quorum; + struct drbd_peer_device *peer_device; + + for_each_peer_device(peer_device, device) { + struct drbd_connection *connection = peer_device->connection; + bool did, should; + + did = drbd_should_do_remote(peer_device, NOW); + should = drbd_should_do_remote(peer_device, NEW); + + if (!did && should) { + /* Since "did" is false, the request with this + * dagtag and prior requests were not marked + * to be sent to this peer. Hence this will not + * send a dagtag packet before the + * corresponding data packet. + * + * It is possible that this peer does not + * actually have the data corresponding to this + * dagtag. However in that case, the disk state + * of that peer will not be D_UP_TO_DATE, so it + * will not be relevant what dagtag we have sent it.
*/ + connection->send_dagtag = resource->dagtag_sector; + drbd_queue_work_if_unqueued( + &connection->sender_work, + &connection->send_dagtag_work); + } + + if (did != should) + start_new_epoch = true; + + if (peer_device->repl_state[OLD] != L_WF_BITMAP_S && + peer_device->repl_state[NEW] == L_WF_BITMAP_S) + clear_bit(B_RS_H_DONE, &peer_device->flags); + + if (peer_device->repl_state[OLD] != L_WF_BITMAP_T && + peer_device->repl_state[NEW] == L_WF_BITMAP_T) + clear_bit(B_RS_H_DONE, &peer_device->flags); + + if (!is_sync_state(peer_device, NOW) && + is_sync_state(peer_device, NEW)) { + clear_bit(RS_DONE, &peer_device->flags); + clear_bit(B_RS_H_DONE, &peer_device->flags); + clear_bit(SYNC_TARGET_TO_BEHIND, &peer_device->flags); + } + } + + if (role[NEW] == R_PRIMARY && !have_quorum[NEW]) + set_bit(PRIMARY_LOST_QUORUM, &device->flags); + } + if (start_new_epoch) + start_new_tl_epoch(resource); + + spin_lock(&resource->peer_ack_lock); + if (role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY && resource->peer_ack_req) { + resource->last_peer_acked_dagtag = resource->peer_ack_req->dagtag_sector; + drbd_queue_peer_ack(resource, resource->peer_ack_req); + resource->peer_ack_req = NULL; + } + spin_unlock(&resource->peer_ack_lock); + + drbd_determine_flush_pending(resource); + + if (!resource->fail_io[OLD] && resource->fail_io[NEW]) + drbd_warn(resource, "Failing IOs\n"); + + for_each_connection(connection, resource) { + enum drbd_role *peer_role = connection->peer_role; + enum drbd_conn_state *cstate = connection->cstate; + + if (peer_role[NEW] == R_PRIMARY) + some_peer_is_primary = true; + + switch (cstate[NEW]) { + case C_CONNECTED: + if (atomic_read(&connection->active_ee_cnt) + || atomic_read(&connection->done_ee_cnt)) + some_peer_request_in_flight = true; + break; + case C_STANDALONE: + case C_UNCONNECTED: + case C_CONNECTING: + /* maybe others are safe as well? which ones? */ + break; + default: + /* if we just disconnected, there may still be some request in flight. 
*/ + some_peer_request_in_flight = true; + } + + if (some_peer_is_primary && some_peer_request_in_flight) + break; + } + + idr_for_each_entry(&resource->devices, device, vnr) { + struct drbd_peer_device *peer_device; + enum drbd_disk_state *disk_state = device->disk_state; + bool create_new_uuid = false; + + if (test_bit(RESTORING_QUORUM, &device->flags) && + !device->have_quorum[OLD] && device->have_quorum[NEW]) { + clear_bit(RESTORING_QUORUM, &device->flags); + drbd_info(resource, "Restored quorum from before reboot\n"); + } + + if (test_bit(RESTORE_QUORUM, &device->flags) && + (device->have_quorum[NEW] || disk_state[NEW] < D_UP_TO_DATE)) + clear_bit(RESTORE_QUORUM, &device->flags); + + /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference + * on the ldev here, to be sure the transition -> D_DISKLESS resp. + * drbd_ldev_destroy() won't happen before our corresponding + * w_after_state_change works run, where we put_ldev again. */ + if (extra_ldev_ref_for_after_state_chg(disk_state)) + atomic_inc(&device->local_cnt); + + if (disk_state[OLD] != D_DISKLESS && disk_state[NEW] == D_DISKLESS) { + /* who knows if we are ever going to be attached again, + * and whether that will be the same device, or a newly + * initialized one. 
*/ + for_each_peer_device(peer_device, device) + peer_device->bitmap_index = -1; + } + + /* ldev_safe: transitioning from D_ATTACHING, ldev just established */ + if (disk_state[OLD] == D_ATTACHING && disk_state[NEW] >= D_NEGOTIATING) + drbd_info(device, "attached to current UUID: %016llX\n", device->ldev->md.current_uuid); + + for_each_peer_device(peer_device, device) { + enum drbd_repl_state *repl_state = peer_device->repl_state; + enum drbd_disk_state *peer_disk_state = peer_device->disk_state; + struct drbd_connection *connection = peer_device->connection; + enum drbd_role *peer_role = connection->peer_role; + + if (repl_state[OLD] <= L_ESTABLISHED && repl_state[NEW] == L_WF_BITMAP_S) + starting_resync = true; + + if ((disk_state[OLD] != D_UP_TO_DATE || peer_disk_state[OLD] != D_UP_TO_DATE) && + (disk_state[NEW] == D_UP_TO_DATE && peer_disk_state[NEW] == D_UP_TO_DATE)) { + clear_bit(CRASHED_PRIMARY, &device->flags); + if (test_bit(UUIDS_RECEIVED, &peer_device->flags)) + peer_device->uuid_flags &= ~((u64)UUID_FLAG_CRASHED_PRIMARY); + } + + /* Aborted verify run, or we reached the stop sector. + * Log the last position, unless end-of-device. 
*/ + if ((repl_state[OLD] == L_VERIFY_S || repl_state[OLD] == L_VERIFY_T) && + repl_state[NEW] <= L_ESTABLISHED) { + /* ldev_safe: repl_state[OLD] */ + struct drbd_bitmap *bm = device->bitmap; + unsigned long ov_left = atomic64_read(&peer_device->ov_left); + + /* ldev_safe: repl_state[OLD] */ + peer_device->ov_start_sector = + bm_bit_to_sect(bm, drbd_bm_bits(device) - ov_left); + if (ov_left) + drbd_info(peer_device, "Online Verify reached sector %llu\n", + (unsigned long long)peer_device->ov_start_sector); + } + + if ((repl_state[OLD] == L_PAUSED_SYNC_T || repl_state[OLD] == L_PAUSED_SYNC_S) && + (repl_state[NEW] == L_SYNC_TARGET || repl_state[NEW] == L_SYNC_SOURCE)) { + drbd_info(peer_device, "Syncer continues.\n"); + peer_device->rs_paused += (long)jiffies + -(long)peer_device->rs_mark_time[peer_device->rs_last_mark]; + initialize_resync_progress_marks(peer_device); + peer_device->resync_next_bit = 0; + } + + if ((repl_state[OLD] == L_SYNC_TARGET || repl_state[OLD] == L_SYNC_SOURCE) && + (repl_state[NEW] == L_PAUSED_SYNC_T || repl_state[NEW] == L_PAUSED_SYNC_S)) { + drbd_info(peer_device, "Resync suspended\n"); + peer_device->rs_mark_time[peer_device->rs_last_mark] = jiffies; + } + + + if (repl_state[OLD] > L_ESTABLISHED && repl_state[NEW] <= L_ESTABLISHED) + clear_bit(RECONCILIATION_RESYNC, &peer_device->flags); + + if (repl_state[OLD] >= L_ESTABLISHED && repl_state[NEW] < L_ESTABLISHED) + clear_bit(AHEAD_TO_SYNC_SOURCE, &peer_device->flags); + + if (repl_state[OLD] == L_ESTABLISHED && + (repl_state[NEW] == L_VERIFY_S || repl_state[NEW] == L_VERIFY_T)) { + unsigned long now = jiffies; + int i; + + /* ldev_safe: repl_state[NEW] */ + set_ov_position(peer_device, repl_state[NEW]); + peer_device->rs_start = now; + peer_device->ov_last_oos_size = 0; + peer_device->ov_last_oos_start = 0; + peer_device->ov_last_skipped_size = 0; + peer_device->ov_last_skipped_start = 0; + peer_device->rs_last_writeout = now; + peer_device->rs_last_progress_report_ts = now; + for (i = 
0; i < DRBD_SYNC_MARKS; i++) { + peer_device->rs_mark_left[i] = peer_device->rs_total; + peer_device->rs_mark_time[i] = now; + } + + drbd_rs_controller_reset(peer_device); + } else if (!(repl_state[OLD] >= L_SYNC_SOURCE && repl_state[OLD] <= L_PAUSED_SYNC_T) && + (repl_state[NEW] >= L_SYNC_SOURCE && repl_state[NEW] <= L_PAUSED_SYNC_T)) { + initialize_resync(peer_device); + } + + if (disk_state[NEW] != D_NEGOTIATING && get_ldev(device)) { + if (peer_device->bitmap_index != -1) { + enum drbd_disk_state pdsk = peer_device->disk_state[NEW]; + u32 mdf = device->ldev->md.peers[peer_device->node_id].flags; + /* Do NOT clear MDF_PEER_DEVICE_SEEN here. + * We want to be able to refuse a resize beyond "last agreed" size, + * even if the peer is currently detached. + */ + mdf &= ~(MDF_PEER_CONNECTED | MDF_PEER_OUTDATED | MDF_PEER_FENCING); + if (repl_state[NEW] > L_OFF) + mdf |= MDF_PEER_CONNECTED; + if (pdsk >= D_INCONSISTENT) { + if (pdsk <= D_OUTDATED) + mdf |= MDF_PEER_OUTDATED; + if (pdsk != D_UNKNOWN) + mdf |= MDF_PEER_DEVICE_SEEN; + } + if (pdsk == D_DISKLESS && !want_bitmap(peer_device)) + mdf &= ~MDF_PEER_DEVICE_SEEN; + if (peer_device->connection->fencing_policy != FP_DONT_CARE) + mdf |= MDF_PEER_FENCING; + if (mdf != device->ldev->md.peers[peer_device->node_id].flags) { + device->ldev->md.peers[peer_device->node_id].flags = mdf; + drbd_md_mark_dirty(device); + } + } + + /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ + if (disk_state[OLD] == D_INCONSISTENT && + peer_disk_state[OLD] == D_INCONSISTENT && peer_disk_state[NEW] == D_UP_TO_DATE && + peer_role[OLD] == R_SECONDARY && peer_role[NEW] == R_PRIMARY) + set_bit(CONSIDER_RESYNC, &peer_device->flags); + + /* Resume AL writing if we get a connection */ + if (repl_state[OLD] < L_ESTABLISHED && repl_state[NEW] >= L_ESTABLISHED) + drbd_resume_al(device); + put_ldev(device); + } + + if (repl_state[OLD] == L_AHEAD && repl_state[NEW] == L_SYNC_SOURCE) { + set_bit(SEND_STATE_AFTER_AHEAD, 
&peer_device->flags); + set_bit(SEND_STATE_AFTER_AHEAD_C, &connection->flags); + + clear_bit(CONN_CONGESTED, &connection->flags); + wake_up(&connection->sender_work.q_wait); + } + + /* We start writing locally without replicating the changes, + * better start a new data generation */ + if (repl_state[OLD] != L_AHEAD && repl_state[NEW] == L_AHEAD) + create_new_uuid = true; + + if (lost_contact_to_peer_data(peer_disk_state)) { + if (role[NEW] == R_PRIMARY && !test_bit(UNREGISTERED, &device->flags) && + drbd_data_accessible(device, NEW)) + create_new_uuid = true; + + if (connection->agreed_pro_version < 110 && + peer_role[NEW] == R_PRIMARY && + disk_state[NEW] >= D_UP_TO_DATE) + create_new_uuid = true; + } + if (peer_returns_diskless(peer_device, peer_disk_state[OLD], peer_disk_state[NEW])) { + if (role[NEW] == R_PRIMARY && !test_bit(UNREGISTERED, &device->flags) && + disk_state[NEW] == D_UP_TO_DATE) + create_new_uuid = true; + } + + if (disk_state[OLD] > D_FAILED && disk_state[NEW] == D_FAILED && + role[NEW] == R_PRIMARY && drbd_data_accessible(device, NEW)) + create_new_uuid = true; + + if (peer_disk_state[NEW] < D_UP_TO_DATE && test_bit(GOT_NEG_ACK, &peer_device->flags)) + clear_bit(GOT_NEG_ACK, &peer_device->flags); + + if (repl_state[OLD] > L_ESTABLISHED && repl_state[NEW] <= L_ESTABLISHED) + clear_bit(SYNC_SRC_CRASHED_PRI, &peer_device->flags); + + if (peer_role[OLD] != peer_role[NEW] || role[OLD] != role[NEW] || + peer_disk_state[OLD] != peer_disk_state[NEW]) + drbd_update_mdf_al_disabled(device, NEW); + } + + if (disk_state[OLD] >= D_INCONSISTENT && disk_state[NEW] < D_INCONSISTENT && + role[NEW] == R_PRIMARY && drbd_data_accessible(device, NEW)) + create_new_uuid = true; + + if (role[OLD] == R_SECONDARY && role[NEW] == R_PRIMARY) + create_new_uuid = true; + + /* Only a single new current uuid when susp_uuid becomes true */ + if (create_new_uuid && !susp_uuid[OLD]) + set_bit(__NEW_CUR_UUID, &device->flags); + + if (disk_state[NEW] != D_NEGOTIATING && 
get_ldev_if_state(device, D_DETACHING)) { + u32 mdf = device->ldev->md.flags; + bool graceful_detach = disk_state[NEW] == D_DETACHING && !test_bit(FORCE_DETACH, &device->flags); + + /* For now, always require a drbdmeta apply-al run, + * even if that ends up only re-initializing the AL */ + mdf &= ~MDF_AL_CLEAN; + /* reset some flags to what we know now */ + mdf &= ~MDF_CRASHED_PRIMARY; + if (test_bit(CRASHED_PRIMARY, &device->flags) || + (role[NEW] == R_PRIMARY && !graceful_detach)) + mdf |= MDF_CRASHED_PRIMARY; + mdf &= ~MDF_PRIMARY_LOST_QUORUM; + if (test_bit(PRIMARY_LOST_QUORUM, &device->flags)) + mdf |= MDF_PRIMARY_LOST_QUORUM; + /* Do not touch MDF_CONSISTENT if we are D_FAILED */ + if (disk_state[NEW] >= D_INCONSISTENT) { + mdf &= ~(MDF_CONSISTENT | MDF_WAS_UP_TO_DATE); + + if (disk_state[NEW] > D_INCONSISTENT) + mdf |= MDF_CONSISTENT; + if (disk_state[NEW] > D_OUTDATED) + mdf |= MDF_WAS_UP_TO_DATE; + } else if ((disk_state[NEW] == D_FAILED || disk_state[NEW] == D_DETACHING) && + mdf & MDF_WAS_UP_TO_DATE && + primary_and_data_present(device)) { + /* There are cases when we still can update meta-data even if disk + state is failed.... Clear MDF_WAS_UP_TO_DATE if appropriate */ + mdf &= ~MDF_WAS_UP_TO_DATE; + } + +/* + * MDF_PRIMARY_IND IS set: apply activity log after crash + * MDF_PRIMARY_IND NOT set: do not apply, forget and re-initialize activity log after crash. + * We want the MDF_PRIMARY_IND set *always* before our backend could possibly + * be target of write requests, whether we are Secondary or Primary ourselves. + * + * We want to avoid to clear that flag just because we lost the connection to a + * detached Primary, but before all in-flight IO was drained, because we may + * have some dirty bits not yet persisted. 
+ * + * We want it cleared only once we are *certain* that we no longer see any Primary, + * are not Primary ourselves, AND all previously received WRITE (peer-) requests + * have been processed, NOTHING is in flight against our backend anymore, + * AND we have successfully written out any dirty bitmap pages. + * + * + * MDF_PEER_DEVICE_SEEN ... The peer had a backing device at some point + * MDF_NODE_EXISTS ... We have seen evidence that this node exists in the cluster. + * Note: This bit does **not** get set when a new peer/connection is created with + * `drbdsetup new-peer ...`. The bit gets set when we establish a connection + * successfully for the first time or we learn via other nodes about the + * existence. + */ + + /* set, if someone is/becomes primary */ + if (role[NEW] == R_PRIMARY || some_peer_is_primary) + mdf |= MDF_PRIMARY_IND; + /* clear, if */ + else if (/* NO peer requests in flight, AND */ + !some_peer_request_in_flight && + (graceful_detach || + /* or everyone secondary ... */ + (role[NEW] == R_SECONDARY && !some_peer_is_primary && + /* ... and not detaching because of IO error. 
*/ + disk_state[NEW] >= D_INCONSISTENT))) + mdf &= ~MDF_PRIMARY_IND; + + if (device->have_quorum[NEW]) + mdf |= MDF_HAVE_QUORUM; + else + mdf &= ~MDF_HAVE_QUORUM; + /* apply changed flags to md.flags, + * and "schedule" for write-out */ + if (mdf != device->ldev->md.flags || + device->ldev->md.members != resource->members) { + device->ldev->md.flags = mdf; + device->ldev->md.members = resource->members; + drbd_md_mark_dirty(device); + } + if (disk_state[OLD] < D_CONSISTENT && disk_state[NEW] >= D_CONSISTENT) + drbd_uuid_set_exposed(device, device->ldev->md.current_uuid, true); + put_ldev(device); + } + + /* remember last attach time so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if ((disk_state[OLD] == D_ATTACHING || disk_state[OLD] == D_NEGOTIATING) && + disk_state[NEW] > D_NEGOTIATING) + device->last_reattach_jif = jiffies; + + if (!device->have_quorum[OLD] && device->have_quorum[NEW]) + clear_bit(PRIMARY_LOST_QUORUM, &device->flags); + + if (resource_suspended[NEW] && + !(role[OLD] == R_PRIMARY && !drbd_data_accessible(device, OLD)) && + (role[NEW] == R_PRIMARY && !drbd_data_accessible(device, NEW)) && + resource->res_opts.on_no_data == OND_IO_ERROR) + unfreeze_io = true; + + if (!resource->fail_io[OLD] && resource->fail_io[NEW]) + unfreeze_io = true; + + if (role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY) + clear_bit(NEW_CUR_UUID, &device->flags); + + if (should_try_become_up_to_date(device, disk_state, NEW)) + set_bit(TRY_BECOME_UP_TO_DATE_PENDING, &resource->flags); + } + + for_each_connection(connection, resource) { + enum drbd_conn_state *cstate = connection->cstate; + enum drbd_role *peer_role = connection->peer_role; + + /* + * If we lose connection to a Primary node then we need to + * inform our peers so that we can potentially do a + * reconciliation resync. The function conn_disconnect() + * informs the peers. So we must set the flag before stopping + * the receiver. 
+ */ + if (cstate[OLD] == C_CONNECTED && cstate[NEW] < C_CONNECTED && + peer_role[OLD] == R_PRIMARY) + set_bit(NOTIFY_PEERS_LOST_PRIMARY, &connection->flags); + + /* Receiver should clean up itself */ + if (cstate[OLD] != C_DISCONNECTING && cstate[NEW] == C_DISCONNECTING) + drbd_thread_stop_nowait(&connection->receiver); + + /* Now the receiver finished cleaning up itself, it should die */ + if (cstate[OLD] != C_STANDALONE && cstate[NEW] == C_STANDALONE) + drbd_thread_stop_nowait(&connection->receiver); + + /* Upon network failure, we need to restart the receiver. */ + if (cstate[OLD] >= C_CONNECTING && + cstate[NEW] <= C_TEAR_DOWN && cstate[NEW] >= C_TIMEOUT) + drbd_thread_restart_nowait(&connection->receiver); + + if (cstate[OLD] == C_CONNECTED && cstate[NEW] < C_CONNECTED) + twopc_connection_down(connection); + + /* remember last connect time so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if (cstate[OLD] < C_CONNECTED && cstate[NEW] == C_CONNECTED) + connection->last_reconnect_jif = jiffies; + + if (resource_suspended[OLD]) { + enum drbd_req_event walk_event = -1; + + /* If we resume IO without this connection, then we + * need to cancel suspended requests. */ + if ((!resource_suspended[NEW] || unfreeze_io) && cstate[NEW] < C_CONNECTED) + walk_event = CANCEL_SUSPENDED_IO; + /* On reconnection when we have been suspended we need + * to process suspended requests. If there are resyncs, + * that means that it was not a simple disconnect and + * reconnect, so we cannot resend. We must cancel + * instead. */ + else if (cstate[OLD] < C_CONNECTED && cstate[NEW] == C_CONNECTED) + walk_event = has_starting_resyncs(connection) ? 
CANCEL_SUSPENDED_IO : RESEND; + + if (walk_event != -1) + __tl_walk(resource, connection, &connection->req_not_net_done, walk_event); + + /* Since we are in finish_state_change(), and the state + * was previously not C_CONNECTED, the sender cannot + * have received any requests yet. So it will find any + * requests to resend when it rescans the transfer log. */ + if (walk_event == RESEND) + wake_up(&connection->sender_work.q_wait); + } + + if (cstate[OLD] == C_CONNECTED && cstate[NEW] < C_CONNECTED) + set_bit(RECONNECT, &connection->flags); + + if (starting_resync && peer_role[NEW] == R_PRIMARY) + apply_unacked_peer_requests(connection); + + if (peer_role[OLD] == R_PRIMARY && peer_role[NEW] == R_UNKNOWN) + lost_a_primary_peer = true; + } + + if (lost_a_primary_peer) { + idr_for_each_entry(&resource->devices, device, vnr) { + struct drbd_peer_device *peer_device; + + for_each_peer_device(peer_device, device) { + enum drbd_repl_state repl_state = peer_device->repl_state[NEW]; + + if (!test_bit(UNSTABLE_RESYNC, &peer_device->flags) && + (repl_state == L_SYNC_TARGET || repl_state == L_PAUSED_SYNC_T) && + !(peer_device->uuid_flags & UUID_FLAG_STABLE) && + !drbd_stable_sync_source_present(peer_device, NEW)) + set_bit(UNSTABLE_RESYNC, &peer_device->flags); + } + } + } + + if (resource_suspended[OLD] && !resource_suspended[NEW]) + drbd_restart_suspended_reqs(resource); + + if ((resource_suspended[OLD] && !resource_suspended[NEW]) || unfreeze_io) + __tl_walk(resource, NULL, NULL, COMPLETION_RESUMED); +} + +static void abw_start_sync(struct drbd_device *device, + struct drbd_peer_device *peer_device, int rv) +{ + struct drbd_peer_device *pd; + + if (rv) { + drbd_err(device, "Writing the bitmap failed not starting resync.\n"); + stable_change_repl_state(peer_device, L_ESTABLISHED, CS_VERBOSE, "start-sync"); + return; + } + + switch (peer_device->repl_state[NOW]) { + case L_STARTING_SYNC_T: + /* Since the number of set bits changed and the other peer_devices are + lready in 
L_PAUSED_SYNC_T state, we need to set rs_total here */ + rcu_read_lock(); + for_each_peer_device_rcu(pd, device) + initialize_resync(pd); + rcu_read_unlock(); + + if (peer_device->connection->agreed_pro_version < 110) + stable_change_repl_state(peer_device, L_WF_SYNC_UUID, CS_VERBOSE, + "start-sync"); + else + drbd_start_resync(peer_device, L_SYNC_TARGET, "start-sync"); + break; + case L_STARTING_SYNC_S: + drbd_start_resync(peer_device, L_SYNC_SOURCE, "start-sync"); + break; + default: + break; + } +} + +int drbd_bitmap_io_from_worker(struct drbd_device *device, + int (*io_fn)(struct drbd_device *, struct drbd_peer_device *), + char *why, enum bm_flag flags, + struct drbd_peer_device *peer_device) +{ + int rv; + + D_ASSERT(device, current == device->resource->worker.task); + + if (!device->bitmap) + return 0; + + /* open coded non-blocking drbd_suspend_io(device); */ + atomic_inc(&device->suspend_cnt); + + if (flags & BM_LOCK_SINGLE_SLOT) + drbd_bm_slot_lock(peer_device, why, flags); + else + drbd_bm_lock(device, why, flags); + rv = io_fn(device, peer_device); + if (flags & BM_LOCK_SINGLE_SLOT) + drbd_bm_slot_unlock(peer_device); + else + drbd_bm_unlock(device); + + drbd_resume_io(device); + + return rv; +} + +static bool state_change_is_susp_fen(struct drbd_state_change *state_change, + enum which_state which) +{ + int n_connection; + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + + if (connection_state_change->susp_fen[which]) + return true; + } + + return false; +} + +static bool state_change_is_susp_quorum(struct drbd_state_change *state_change, + enum which_state which) +{ + struct drbd_resource *resource = state_change->resource[0].resource; + int n_device; + + if (resource->res_opts.on_no_quorum != ONQ_SUSPEND_IO) + return false; + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct 
drbd_device_state_change *device_state_change = + &state_change->devices[n_device]; + + if (!device_state_change->have_quorum[which]) + return true; + } + + return false; +} + +static bool resync_susp_comb_dep_sc(struct drbd_state_change *state_change, + unsigned int n_device, int n_connection, + enum which_state which) +{ + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[n_device * state_change->n_connections + n_connection]; + struct drbd_device_state_change *device_state_change = &state_change->devices[n_device]; + bool resync_susp_dependency = peer_device_state_change->resync_susp_dependency[which]; + bool resync_susp_other_c = peer_device_state_change->resync_susp_other_c[which]; + enum drbd_repl_state repl_state = peer_device_state_change->repl_state[which]; + enum drbd_disk_state disk_state = device_state_change->disk_state[which]; + + return resync_susp_dependency || resync_susp_other_c || + ((repl_state == L_SYNC_SOURCE || repl_state == L_PAUSED_SYNC_S) + && disk_state <= D_INCONSISTENT); +} + +static union drbd_state state_change_word(struct drbd_state_change *state_change, + unsigned int n_device, int n_connection, + enum which_state which) +{ + struct drbd_resource_state_change *resource_state_change = + &state_change->resource[0]; + struct drbd_device_state_change *device_state_change = + &state_change->devices[n_device]; + union drbd_state state = { { + .role = R_UNKNOWN, + .peer = R_UNKNOWN, + .conn = C_STANDALONE, + .disk = D_UNKNOWN, + .pdsk = D_UNKNOWN, + } }; + + state.role = resource_state_change->role[which]; + state.susp = resource_state_change->susp[which] || state_change_is_susp_quorum(state_change, which) || + resource_state_change->susp_uuid[which]; + state.susp_nod = resource_state_change->susp_nod[which]; + state.susp_fen = state_change_is_susp_fen(state_change, which); + state.quorum = device_state_change->have_quorum[which]; + state.disk = device_state_change->disk_state[which]; + if 
(n_connection != -1) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[n_device * state_change->n_connections + n_connection]; + + state.peer = connection_state_change->peer_role[which]; + state.conn = peer_device_state_change->repl_state[which]; + if (state.conn <= L_OFF) + state.conn = connection_state_change->cstate[which]; + state.pdsk = peer_device_state_change->disk_state[which]; + state.aftr_isp = resync_susp_comb_dep_sc(state_change, n_device, n_connection, which); + state.peer_isp = peer_device_state_change->resync_susp_peer[which]; + state.user_isp = peer_device_state_change->resync_susp_user[which]; + } + return state; +} + +int notify_resource_state_change(struct sk_buff *skb, + unsigned int seq, + void *state_change, + enum drbd_notification_type type) +{ + struct drbd_resource_state_change *resource_state_change = + ((struct drbd_state_change *)state_change)->resource; + struct drbd_resource *resource = resource_state_change->resource; + struct resource_info resource_info = { + .res_role = resource_state_change->role[NEW], + .res_susp = resource_state_change->susp[NEW], + .res_susp_nod = resource_state_change->susp_nod[NEW], + .res_susp_fen = state_change_is_susp_fen(state_change, NEW), + .res_susp_quorum = state_change_is_susp_quorum(state_change, NEW) || + resource_state_change->susp_uuid[NEW], + .res_fail_io = resource_state_change->fail_io[NEW], + }; + + return notify_resource_state(skb, seq, resource, &resource_info, NULL, type); +} + +int notify_connection_state_change(struct sk_buff *skb, + unsigned int seq, + void *state_change, + enum drbd_notification_type type) +{ + struct drbd_connection_state_change *connection_state_change = state_change; + struct drbd_connection *connection = connection_state_change->connection; + struct connection_info connection_info = { + .conn_connection_state 
= connection_state_change->cstate[NEW],
+		.conn_role = connection_state_change->peer_role[NEW],
+	};
+
+	return notify_connection_state(skb, seq, connection, &connection_info, type);
+}
+
+/*
+ * Notification callback for one device.  @state_change is used as a
+ * struct drbd_device_state_change pointer; it is converted with
+ * device_state_change_to_info() and handed to notify_device_state().
+ */
+int notify_device_state_change(struct sk_buff *skb,
+			       unsigned int seq,
+			       void *state_change,
+			       enum drbd_notification_type type)
+{
+	struct drbd_device_state_change *device_state_change = state_change;
+	struct drbd_device *device = device_state_change->device;
+	struct device_info device_info;
+	device_state_change_to_info(&device_info, device_state_change);
+
+	return notify_device_state(skb, seq, device, &device_info, type);
+}
+
+/*
+ * Notification callback for one peer device.  @state_change is used as a
+ * struct drbd_peer_device_state_change pointer; it is converted with
+ * peer_device_state_change_to_info() and handed to
+ * notify_peer_device_state().
+ */
+int notify_peer_device_state_change(struct sk_buff *skb,
+				    unsigned int seq,
+				    void *state_change,
+				    enum drbd_notification_type type)
+{
+	struct drbd_peer_device_state_change *peer_device_state_change = state_change;
+	struct drbd_peer_device *peer_device = peer_device_state_change->peer_device;
+	struct peer_device_info peer_device_info;
+	peer_device_state_change_to_info(&peer_device_info, state_change);
+
+	return notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type);
+}
+
+/*
+ * Emit one notification for every object whose state differs between OLD
+ * and NEW: the resource, then each connection, each device and each peer
+ * device, in that order.
+ *
+ * Notifications are sent one step delayed: REMEMBER_STATE_CHANGE() first
+ * sends the previously remembered notification with NOTIFY_CONTINUES or-ed
+ * into its type and then remembers the current one; FINAL_STATE_CHANGE()
+ * at the end sends the last remembered notification with plain
+ * NOTIFY_CHANGE.  The callbacks are invoked with a NULL skb and seq 0.
+ * The whole sequence runs with notification_mutex held.
+ */
+static void notify_state_change(struct drbd_state_change *state_change)
+{
+	struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
+	bool resource_state_has_changed;
+	unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
+	int (*last_func)(struct sk_buff *, unsigned int, void *,
+			 enum drbd_notification_type) = NULL;
+	void *last_arg = NULL;
+
+#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
+#define FINAL_STATE_CHANGE(type) \
+	({ if (last_func) \
+		last_func(NULL, 0, last_arg, type); \
+	})
+#define REMEMBER_STATE_CHANGE(func, arg, type) \
+	({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
+	   last_func = (typeof(last_func))func; \
+	   last_arg = arg; \
+	 })
+
+	mutex_lock(&notification_mutex);
+
+	/* The fencing and quorum suspend flags are derived from the
+	 * connections and devices, so compare the derived values. */
+	resource_state_has_changed =
+	    HAS_CHANGED(resource_state_change->role) ||
+	    HAS_CHANGED(resource_state_change->susp) ||
+	    HAS_CHANGED(resource_state_change->susp_nod) ||
+	    HAS_CHANGED(resource_state_change->susp_uuid) ||
+	    state_change_is_susp_fen(state_change, OLD) !=
+	    state_change_is_susp_fen(state_change, NEW) ||
+	    state_change_is_susp_quorum(state_change, OLD) !=
+	    state_change_is_susp_quorum(state_change, NEW) ||
+	    HAS_CHANGED(resource_state_change->fail_io);
+
+	if (resource_state_has_changed)
+		REMEMBER_STATE_CHANGE(notify_resource_state_change,
+				      state_change, NOTIFY_CHANGE);
+
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change =
+				&state_change->connections[n_connection];
+
+		if (HAS_CHANGED(connection_state_change->peer_role) ||
+		    HAS_CHANGED(connection_state_change->cstate))
+			REMEMBER_STATE_CHANGE(notify_connection_state_change,
+					      connection_state_change, NOTIFY_CHANGE);
+	}
+
+	for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+		struct drbd_device_state_change *device_state_change =
+			&state_change->devices[n_device];
+
+		if (HAS_CHANGED(device_state_change->disk_state) ||
+		    HAS_CHANGED(device_state_change->have_quorum))
+			REMEMBER_STATE_CHANGE(notify_device_state_change,
+					      device_state_change, NOTIFY_CHANGE);
+	}
+
+	/* peer_devices[] holds n_devices * n_connections entries; walk it flat. */
+	n_peer_devices = state_change->n_devices * state_change->n_connections;
+	for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
+		struct drbd_peer_device_state_change *p =
+			&state_change->peer_devices[n_peer_device];
+
+		if (HAS_CHANGED(p->disk_state) ||
+		    HAS_CHANGED(p->repl_state) ||
+		    HAS_CHANGED(p->resync_susp_user) ||
+		    HAS_CHANGED(p->resync_susp_peer) ||
+		    HAS_CHANGED(p->resync_susp_dependency) ||
+		    HAS_CHANGED(p->resync_susp_other_c))
+			REMEMBER_STATE_CHANGE(notify_peer_device_state_change,
+					      p, NOTIFY_CHANGE);
+	}
+
+	FINAL_STATE_CHANGE(NOTIFY_CHANGE);
+	mutex_unlock(&notification_mutex);
+
+#undef HAS_CHANGED
+#undef FINAL_STATE_CHANGE
+#undef REMEMBER_STATE_CHANGE
+}
+
+/*
+ * Send our NEW role to every connection whose NEW cstate is at least
+ * C_CONNECTED.  Peers with agreed_pro_version < 110 get one
+ * drbd_send_state() per device; all others get a single
+ * conn_send_state() that carries only the resource role.
+ */
+static void send_role_to_all_peers(struct drbd_state_change *state_change)
+{
+	unsigned int n_connection;
+
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change =
+			&state_change->connections[n_connection];
+		struct drbd_connection *connection = connection_state_change->connection;
+		enum drbd_conn_state new_cstate = connection_state_change->cstate[NEW];
+
+		if (new_cstate < C_CONNECTED)
+			continue;
+
+		if (connection->agreed_pro_version < 110) {
+			unsigned int n_device;
+
+			/* Before DRBD 9, the role is a device attribute
+			 * instead of a resource attribute. */
+			for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+				/* Pick the peer device of this device on this
+				 * connection, using the same
+				 * n_device * n_connections + n_connection index as
+				 * state_change_word() and
+				 * send_new_state_to_all_peer_devices(), so that the
+				 * state word and the peer device it is sent through
+				 * belong to the same volume. */
+				struct drbd_peer_device *peer_device =
+					state_change->peer_devices[
+						n_device * state_change->n_connections + n_connection].peer_device;
+				union drbd_state state =
+					state_change_word(state_change, n_device, n_connection, NEW);
+
+				drbd_send_state(peer_device, state);
+			}
+		} else {
+			union drbd_state state = { {
+				.role = state_change->resource[0].role[NEW],
+			} };
+
+			conn_send_state(connection, state);
+		}
+	}
+}
+
+/*
+ * Send the NEW state word of device @n_device to each of its peer devices
+ * whose NEW state word has conn >= C_CONNECTED.
+ */
+static void send_new_state_to_all_peer_devices(struct drbd_state_change *state_change, int n_device)
+{
+	unsigned int n_connection;
+
+	BUG_ON(state_change->n_devices <= n_device);
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_peer_device_state_change *peer_device_state_change =
+			&state_change->peer_devices[n_device * state_change->n_connections + n_connection];
+		struct drbd_peer_device *peer_device = peer_device_state_change->peer_device;
+		union drbd_state new_state = state_change_word(state_change, n_device, n_connection, NEW);
+
+		if (new_state.conn >= C_CONNECTED)
+			drbd_send_state(peer_device, new_state);
+	}
+}
+
+/* This function is supposed to have the same semantics as drbd_device_stable() in drbd_main.c
+   A primary is stable since it is authoritative.
+   Unstable are neighbors of a primary and resync target nodes.
+   Nodes further away from a primary are stable! Do not confuse with "weak".
+   As implemented here: returns true if our own role[which] is R_PRIMARY,
+   false if any connection's peer_role[which] is R_PRIMARY, and true
+   otherwise. n_device is not consulted; whether the device is a resync
+   target is computed separately by calc_resync_target(). */
+static bool calc_device_stable(struct drbd_state_change *state_change, int n_device, enum which_state which)
+{
+	int n_connection;
+
+	if (state_change->resource->role[which] == R_PRIMARY)
+		return true;
+
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change =
+			&state_change->connections[n_connection];
+		enum drbd_role *peer_role = connection_state_change->peer_role;
+
+		if (peer_role[which] == R_PRIMARY)
+			return false;
+	}
+
+	return true;
+}
+
+static bool calc_resync_target(struct drbd_state_change *state_change, int n_device, enum which_state which)
+{
+	int n_connection;
+
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_peer_device_state_change *peer_device_state_change =
+			&state_change->peer_devices[n_device * state_change->n_connections + n_connection];
+		enum drbd_repl_state *repl_state = peer_device_state_change->repl_state;
+
+		switch (repl_state[which]) {
+		case L_WF_BITMAP_T:
+		case L_SYNC_TARGET:
+		case L_PAUSED_SYNC_T:
+			return true;
+		default:
+			continue;
+		}
+	}
+
+	return false;
+}
+
+/* takes old and new peer disk state
+ * Returns true when the OLD state was D_INCONSISTENT or better (but neither
+ * D_UNKNOWN nor D_OUTDATED) and the NEW state is below D_INCONSISTENT, or is
+ * D_UNKNOWN or D_OUTDATED. */
+static bool lost_contact_to_peer_data(enum drbd_disk_state *peer_disk_state)
+{
+	enum drbd_disk_state os = peer_disk_state[OLD];
+	enum drbd_disk_state ns = peer_disk_state[NEW];
+
+	return (os >= D_INCONSISTENT && os != D_UNKNOWN && os != D_OUTDATED)
+		&& (ns < D_INCONSISTENT || ns == D_UNKNOWN || ns == D_OUTDATED);
+}
+
+static bool peer_returns_diskless(struct drbd_peer_device *peer_device,
+				  enum drbd_disk_state os, enum drbd_disk_state ns)
+{
+	struct drbd_device *device = peer_device->device;
+	bool rv = false;
+
+	/* Scenario, starting with normal operation
+	 * Connected Primary/Secondary UpToDate/UpToDate
+	 * NetworkFailure
Primary/Unknown UpToDate/DUnknown (frozen) + * ... + * Connected Primary/Secondary UpToDate/Diskless (resumed; needs to bump uuid!) + */ + + if (get_ldev(device)) { + if (os == D_UNKNOWN && (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED) && + drbd_bitmap_uuid(peer_device) == 0) + rv = true; + put_ldev(device); + } + return rv; +} + +static void check_may_resume_io_after_fencing(struct drbd_state_change *state_change, int n_connection) +{ + struct drbd_connection_state_change *connection_state_change = &state_change->connections[n_connection]; + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + struct drbd_connection *connection = connection_state_change->connection; + struct drbd_resource *resource = resource_state_change->resource; + bool all_peer_disks_outdated = true; + bool all_peer_disks_connected = true; + struct drbd_peer_device *peer_device; + unsigned long irq_flags; + int vnr, n_device; + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[n_device * state_change->n_connections + n_connection]; + enum drbd_repl_state *repl_state = peer_device_state_change->repl_state; + enum drbd_disk_state *peer_disk_state = peer_device_state_change->disk_state; + + if (peer_disk_state[NEW] > D_OUTDATED) + all_peer_disks_outdated = false; + if (repl_state[NEW] < L_ESTABLISHED) + all_peer_disks_connected = false; + } + + /* case1: The outdate peer handler is successful: */ + if (all_peer_disks_outdated) { + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (test_and_clear_bit(NEW_CUR_UUID, &device->flags)) { + kref_get(&device->kref); + rcu_read_unlock(); + drbd_uuid_new_current(device, false); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + } + rcu_read_unlock(); + begin_state_change(resource, 
&irq_flags, CS_VERBOSE); + __change_io_susp_fencing(connection, false); + end_state_change(resource, &irq_flags, "after-fencing"); + } + /* case2: The connection was established again: */ + if (all_peer_disks_connected) { + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + clear_bit(NEW_CUR_UUID, &device->flags); + } + rcu_read_unlock(); + begin_state_change(resource, &irq_flags, CS_VERBOSE); + __change_io_susp_fencing(connection, false); + end_state_change(resource, &irq_flags, "after-fencing"); + } +} + +static bool drbd_should_unfence(struct drbd_state_change *state_change, int n_connection) +{ + bool some_peer_was_not_up_to_date = false; + int n_device; + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n_device]; + enum drbd_disk_state *disk_state = device_state_change->disk_state; + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[ + n_device * state_change->n_connections + n_connection]; + enum drbd_disk_state *peer_disk_state = peer_device_state_change->disk_state; + + /* Do not unfence if some volume is not yet up-to-date. */ + if (disk_state[NEW] != D_UP_TO_DATE || peer_disk_state[NEW] != D_UP_TO_DATE) + return false; + + /* Only unfence when the final volume becomes up-to-date. */ + if (peer_disk_state[OLD] != D_UP_TO_DATE) + some_peer_was_not_up_to_date = true; + } + + return some_peer_was_not_up_to_date; +} + +static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device) +{ + bool csums_after_crash_only; + rcu_read_lock(); + csums_after_crash_only = rcu_dereference(connection->transport.net_conf)->csums_after_crash_only; + rcu_read_unlock(); + return connection->agreed_pro_version >= 89 && /* supported? */ + connection->csums_tfm && /* configured? 
*/ + (csums_after_crash_only == false /* use for each resync? */ + || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */ +} + +static void drbd_run_resync(struct drbd_peer_device *peer_device, enum drbd_repl_state repl_state) +{ + struct drbd_device *device = peer_device->device; + struct drbd_bitmap *bm = device->bitmap; + struct drbd_connection *connection = peer_device->connection; + enum drbd_repl_state side = repl_is_sync_target(repl_state) ? L_SYNC_TARGET : L_SYNC_SOURCE; + + drbd_info(peer_device, "Began resync as %s (will sync %llu KB [%lu bits set]).\n", + drbd_repl_str(repl_state), + bm_bit_to_kb(bm, peer_device->rs_total), + (unsigned long) peer_device->rs_total); + + if (side == L_SYNC_TARGET) + drbd_uuid_set_exposed(device, peer_device->current_uuid, false); + + peer_device->use_csums = side == L_SYNC_TARGET ? + use_checksum_based_resync(connection, device) : false; + + if (side == L_SYNC_TARGET && + !(peer_device->uuid_flags & UUID_FLAG_STABLE) && + !drbd_stable_sync_source_present(peer_device, NOW)) + set_bit(UNSTABLE_RESYNC, &peer_device->flags); + + /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid + * with w_send_oos, or the sync target will get confused as to + * how much bits to resync. We cannot do that always, because for an + * empty resync and protocol < 95, we need to do it here, as we call + * drbd_resync_finished from here in that case. + * We drbd_gen_and_send_sync_uuid here for protocol < 96, + * and from after_state_ch otherwise. */ + if (side == L_SYNC_SOURCE && connection->agreed_pro_version < 96) + drbd_gen_and_send_sync_uuid(peer_device); + + if (connection->agreed_pro_version < 95 && peer_device->rs_total == 0) { + /* This still has a race (about when exactly the peers + * detect connection loss) that can lead to a full sync + * on next handshake. In 8.3.9 we fixed this with explicit + * resync-finished notifications, but the fix + * introduces a protocol change. 
Sleeping for some + * time longer than the ping interval + timeout on the + * SyncSource, to give the SyncTarget the chance to + * detect connection loss, then waiting for a ping + * response (implicit in drbd_resync_finished) reduces + * the race considerably, but does not solve it. */ + if (side == L_SYNC_SOURCE) { + struct net_conf *nc; + int timeo; + + rcu_read_lock(); + nc = rcu_dereference(connection->transport.net_conf); + timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; + rcu_read_unlock(); + schedule_timeout_interruptible(timeo); + } + drbd_resync_finished(peer_device, D_MASK); + } + + /* ns.conn may already be != peer_device->repl_state[NOW], + * we may have been paused in between, or become paused until + * the timer triggers. + * No matter, that is handled in resync_timer_fn() */ + if (repl_state == L_SYNC_TARGET || repl_state == L_PAUSED_SYNC_T) + drbd_uuid_resync_starting(peer_device); + + drbd_md_sync_if_dirty(device); +} + + +/* + * Perform after state change actions that may sleep. 
+ */ +static int w_after_state_change(struct drbd_work *w, int unused) +{ + struct after_state_change_work *work = + container_of(w, struct after_state_change_work, w); + struct drbd_state_change *state_change = work->state_change; + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + struct drbd_resource *resource = resource_state_change->resource; + enum drbd_role *role = resource_state_change->role; + bool *susp_uuid = resource_state_change->susp_uuid; + struct drbd_peer_device *send_state_others = NULL; + int n_device, n_connection; + bool still_connected = false; + bool try_become_up_to_date = false; + bool healed_primary = false; + bool send_flush_requests = false; + + notify_state_change(state_change); + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = &state_change->devices[n_device]; + struct drbd_device *device = device_state_change->device; + enum drbd_disk_state *disk_state = device_state_change->disk_state; + bool have_ldev = extra_ldev_ref_for_after_state_chg(disk_state); + bool *have_quorum = device_state_change->have_quorum; + bool effective_disk_size_determined = false; + bool device_stable[2], resync_target[2]; + bool data_accessible[2]; + bool all_peer_replication[2]; + bool resync_finished = false; + bool some_peer_demoted = false; + bool new_current_uuid = false; + enum which_state which; + + for (which = OLD; which <= NEW; which++) { + device_stable[which] = calc_device_stable(state_change, n_device, which); + resync_target[which] = calc_resync_target(state_change, n_device, which); + data_accessible[which] = + calc_data_accessible(state_change, n_device, which); + all_peer_replication[which] = + drbd_all_peer_replication_change(state_change, n_device, which); + + } + + if (disk_state[NEW] == D_UP_TO_DATE) + effective_disk_size_determined = true; + + for (n_connection = 0; n_connection < state_change->n_connections; 
n_connection++) { + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[ + n_device * state_change->n_connections + n_connection]; + struct drbd_peer_device *peer_device = peer_device_state_change->peer_device; + enum drbd_disk_state *peer_disk_state = peer_device_state_change->disk_state; + enum drbd_repl_state *repl_state = peer_device_state_change->repl_state; + + if ((repl_state[OLD] == L_SYNC_TARGET || repl_state[OLD] == L_PAUSED_SYNC_T) && + repl_state[NEW] == L_ESTABLISHED) + resync_finished = true; + + if (disk_state[OLD] == D_INCONSISTENT && disk_state[NEW] == D_UP_TO_DATE && + peer_disk_state[OLD] == D_INCONSISTENT && peer_disk_state[NEW] == D_UP_TO_DATE) + send_state_others = peer_device; + + /* connect without resync or remote attach without resync */ + if (disk_state[NOW] >= D_OUTDATED && repl_state[NEW] == L_ESTABLISHED && + ((repl_state[OLD] == L_OFF && peer_disk_state[NEW] >= D_OUTDATED) || + (peer_disk_state[OLD] == D_DISKLESS && peer_disk_state[NEW] >= D_OUTDATED))) { + u64 peer_current_uuid = peer_device->current_uuid & ~UUID_PRIMARY; + u64 my_current_uuid = drbd_current_uuid(device) & ~UUID_PRIMARY; + + if (peer_current_uuid == my_current_uuid && get_ldev(device)) { + down_write(&device->uuid_sem); + drbd_uuid_set_bitmap(peer_device, 0); + up_write(&device->uuid_sem); + drbd_print_uuids(peer_device, "cleared bm UUID and bitmap"); + drbd_bitmap_io_from_worker(device, &drbd_bmio_clear_one_peer, + "clearing bm one peer", BM_LOCK_CLEAR | BM_LOCK_BULK, + peer_device); + put_ldev(device); + } + } + } + + if (role[NEW] == R_PRIMARY && !data_accessible[OLD] && data_accessible[NEW]) + healed_primary = true; + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = &state_change->connections[n_connection]; + struct drbd_connection *connection = connection_state_change->connection; + enum drbd_conn_state *cstate = 
connection_state_change->cstate; + enum drbd_role *peer_role = connection_state_change->peer_role; + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[ + n_device * state_change->n_connections + n_connection]; + struct drbd_peer_device *peer_device = peer_device_state_change->peer_device; + enum drbd_repl_state *repl_state = peer_device_state_change->repl_state; + enum drbd_disk_state *peer_disk_state = peer_device_state_change->disk_state; + bool *resync_susp_user = peer_device_state_change->resync_susp_user; + bool *resync_susp_peer = peer_device_state_change->resync_susp_peer; + bool *resync_susp_dependency = peer_device_state_change->resync_susp_dependency; + union drbd_state new_state = + state_change_word(state_change, n_device, n_connection, NEW); + bool send_uuids, send_state = false; + + /* In case we finished a resync as resync-target update all neighbors + * about having a bitmap_uuid of 0 towards the previous sync-source. + * That needs to go out before sending the new disk state + * to avoid a race where the other node might downgrade our disk + * state due to old UUID values. + * + * Also check the replication state to ensure that we + * do not send these extra UUIDs before the initial + * handshake. 
*/ + send_uuids = resync_finished && + peer_disk_state[NEW] != D_UNKNOWN && + repl_state[NEW] > L_OFF; + + /* Send UUIDs again if they changed while establishing the connection */ + if (repl_state[OLD] == L_OFF && repl_state[NEW] > L_OFF && + peer_device->comm_current_uuid != drbd_resolved_uuid(peer_device, NULL)) + send_uuids = true; + + if (repl_state[NEW] > L_OFF && device_stable[OLD] != device_stable[NEW]) + send_uuids = true; + + if (send_uuids) + drbd_send_uuids(peer_device, 0, 0); + + if (peer_disk_state[NEW] == D_UP_TO_DATE) + effective_disk_size_determined = true; + + if (!(role[OLD] == R_PRIMARY && !data_accessible[OLD]) && + (role[NEW] == R_PRIMARY && !data_accessible[NEW]) && + !test_bit(UNREGISTERED, &device->flags)) + drbd_maybe_khelper(device, connection, "pri-on-incon-degr"); + + /* Became sync source. With protocol >= 96, we still need to send out + * the sync uuid now. Need to do that before any drbd_send_state, or + * the other side may go "paused sync" before receiving the sync uuids, + * which is unexpected. */ + if (!(repl_state[OLD] == L_SYNC_SOURCE || repl_state[OLD] == L_PAUSED_SYNC_S) && + (repl_state[NEW] == L_SYNC_SOURCE || repl_state[NEW] == L_PAUSED_SYNC_S) && + connection->agreed_pro_version >= 96 && connection->agreed_pro_version < 110 && + get_ldev(device)) { + drbd_gen_and_send_sync_uuid(peer_device); + put_ldev(device); + } + + /* Do not change the order of the if above and the two below... */ + if (peer_disk_state[OLD] < D_NEGOTIATING && + peer_disk_state[NEW] == D_NEGOTIATING) { /* attach on the peer */ + /* we probably will start a resync soon. + * make sure those things are properly reset. */ + peer_device->rs_total = 0; + peer_device->rs_failed = 0; + + drbd_send_uuids(peer_device, 0, 0); + drbd_send_state(peer_device, new_state); + } + /* No point in queuing send_bitmap if we don't have a connection + * anymore, so check also the _current_ state, not only the new state + * at the time this work was queued. 
*/ + if (repl_state[OLD] != L_WF_BITMAP_S && repl_state[NEW] == L_WF_BITMAP_S && + peer_device->repl_state[NOW] == L_WF_BITMAP_S) { + /* Now that the connection is L_WF_BITMAP_S, + * new requests will be sent to the peer as + * P_OUT_OF_SYNC packets. However, active + * requests may not have been communicated to + * the peer and may not yet be marked in the + * local bitmap. Mark these requests in the + * bitmap before reading and sending that + * bitmap. This may set bits unnecessarily, but + * it does no harm to resync a small amount of + * additional data. */ + drbd_set_pending_out_of_sync(peer_device); + /* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */ + drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL, + "send_bitmap (WFBitMapS)", + BM_LOCK_SET | BM_LOCK_CLEAR | BM_LOCK_BULK | BM_LOCK_SINGLE_SLOT, + peer_device); + } + + if (peer_role[OLD] == R_PRIMARY && peer_role[NEW] == R_SECONDARY) + some_peer_demoted = true; + + /* Last part of the attaching process ... */ + if (cstate[NEW] == C_CONNECTED && /* repl_state[NEW] might still be L_OFF */ + disk_state[OLD] == D_ATTACHING && disk_state[NEW] >= D_NEGOTIATING) { + drbd_send_sizes(peer_device, 0, 0); /* to start sync... */ + drbd_send_uuids(peer_device, 0, 0); + drbd_send_state(peer_device, new_state); + } + + /* Started resync, tell peer if drbd9 */ + if (repl_state[NEW] >= L_SYNC_SOURCE && repl_state[NEW] <= L_PAUSED_SYNC_T && + (repl_state[OLD] < L_SYNC_SOURCE || repl_state[OLD] > L_PAUSED_SYNC_T)) + send_state = true; + + /* We want to pause/continue resync, tell peer. 
*/ + if (repl_state[NEW] >= L_ESTABLISHED && + ((resync_susp_comb_dep_sc(state_change, n_device, n_connection, OLD) != + resync_susp_comb_dep_sc(state_change, n_device, n_connection, NEW)) || + (resync_susp_user[OLD] != resync_susp_user[NEW]))) + send_state = true; + + /* finished resync, tell sync source */ + if ((repl_state[OLD] == L_SYNC_TARGET || repl_state[OLD] == L_PAUSED_SYNC_T) && + repl_state[NEW] == L_ESTABLISHED) + send_state = true; + + /* In case one of the isp bits got set, suspend other devices. */ + if (!(resync_susp_dependency[OLD] || resync_susp_peer[OLD] || resync_susp_user[OLD]) && + (resync_susp_dependency[NEW] || resync_susp_peer[NEW] || resync_susp_user[NEW])) + /* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */ + suspend_other_sg(device); + + /* Make sure the peer gets informed about eventual state + changes (ISP bits) while we were in L_OFF. */ + if (repl_state[OLD] == L_OFF && repl_state[NEW] >= L_ESTABLISHED) + send_state = true; + + if (repl_state[OLD] != L_AHEAD && repl_state[NEW] == L_AHEAD) + send_state = true; + + /* We are in the progress to start a full sync. SyncTarget sets all slots. */ + if (repl_state[OLD] != L_STARTING_SYNC_T && repl_state[NEW] == L_STARTING_SYNC_T) + /* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */ + drbd_queue_bitmap_io(device, + &drbd_bmio_set_all_n_write, &abw_start_sync, + "set_n_write from StartingSync", + BM_LOCK_CLEAR | BM_LOCK_BULK, + peer_device); + + /* We are in the progress to start a full sync. SyncSource one slot. 
*/ + if (repl_state[OLD] != L_STARTING_SYNC_S && repl_state[NEW] == L_STARTING_SYNC_S) + /* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */ + drbd_queue_bitmap_io(device, + &drbd_bmio_set_n_write, &abw_start_sync, + "set_n_write from StartingSync", + BM_LOCK_CLEAR | BM_LOCK_BULK, + peer_device); + + /* Disks got bigger while they were detached */ + if (disk_state[NEW] > D_NEGOTIATING && peer_disk_state[NEW] > D_NEGOTIATING && + test_and_clear_bit(RESYNC_AFTER_NEG, &peer_device->flags)) { + if (repl_state[NEW] == L_ESTABLISHED) + resync_after_online_grow(peer_device); + } + + /* A resync finished or aborted, wake paused devices... */ + if ((repl_state[OLD] > L_ESTABLISHED && repl_state[NEW] <= L_ESTABLISHED) || + (resync_susp_peer[OLD] && !resync_susp_peer[NEW]) || + (resync_susp_user[OLD] && !resync_susp_user[NEW])) + /* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */ + resume_next_sg(device); + + /* sync target done with resync. Explicitly notify all peers. Our sync + source should even know by himself, but the others need that info. */ + if (disk_state[OLD] < D_UP_TO_DATE && repl_state[OLD] >= L_SYNC_SOURCE && repl_state[NEW] == L_ESTABLISHED) + send_new_state_to_all_peer_devices(state_change, n_device); + + /* Outdated myself, or became D_UP_TO_DATE tell peers + * Do not do it, when the local node was forced from R_SECONDARY to R_PRIMARY, + * because that is part of the 2-phase-commit and that is necessary to trigger + * the initial resync. */ + if ((disk_state[NEW] >= D_INCONSISTENT && disk_state[NEW] != disk_state[OLD] && + repl_state[OLD] >= L_ESTABLISHED && repl_state[NEW] >= L_ESTABLISHED) && + !(role[OLD] == R_SECONDARY && role[NEW] == R_PRIMARY)) + send_state = true; + + /* diskless peers need to be informed about quorum changes, since they consider + the quorum state of the diskfull nodes. 
*/ + if (have_quorum[OLD] != have_quorum[NEW] && disk_state[NEW] >= D_INCONSISTENT) + send_state = true; + + /* Skipped resync with peer_device, tell others... */ + if (send_state_others && send_state_others != peer_device) + send_state = true; + + /* This triggers bitmap writeout of potentially still unwritten pages + * if the resync finished cleanly, or aborted because of peer disk + * failure, or on transition from resync back to AHEAD/BEHIND. + * + * Connection loss is handled in conn_disconnect() by the receiver. + * + * For resync aborted because of local disk failure, we cannot do + * any bitmap writeout anymore. + * + * No harm done if some bits change during this phase. + */ + if ((repl_state[OLD] > L_ESTABLISHED && repl_state[OLD] < L_AHEAD) && + (repl_state[NEW] == L_ESTABLISHED || repl_state[NEW] >= L_AHEAD) && + get_ldev(device)) { + drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL, + "write from resync_finished", BM_LOCK_BULK, + NULL); + put_ldev(device); + } + + /* Verify finished, or reached stop sector. Peer did not know about + * the stop sector, and we may even have changed the stop sector during + * verify to interrupt/stop early. Send the new state. */ + if (repl_state[OLD] == L_VERIFY_S && repl_state[NEW] == L_ESTABLISHED + && verify_can_do_stop_sector(peer_device)) + send_new_state_to_all_peer_devices(state_change, n_device); + + if (disk_state[NEW] == D_DISKLESS && + cstate[NEW] == C_STANDALONE && + role[NEW] == R_SECONDARY) { + if (resync_susp_dependency[OLD] != resync_susp_dependency[NEW]) + /* ldev_safe: ref from extra_ldev_ref_for_after_state_chg */ + resume_next_sg(device); + } + + if (device_stable[OLD] && !device_stable[NEW] && + repl_state[NEW] >= L_ESTABLISHED && get_ldev(device)) { + /* Inform peers about being unstable... 
+ Maybe it would be a better idea to have the stable bit as + part of the state (and being sent with the state) */ + drbd_send_uuids(peer_device, 0, 0); + put_ldev(device); + } + + if (send_state && cstate[NEW] == C_CONNECTED) + drbd_send_state(peer_device, new_state); + + if (((!device_stable[OLD] && device_stable[NEW]) || + (resync_target[OLD] && !resync_target[NEW] && device_stable[NEW])) && + !(repl_state[OLD] == L_SYNC_TARGET || repl_state[OLD] == L_PAUSED_SYNC_T) && + !(peer_role[OLD] == R_PRIMARY) && disk_state[NEW] >= D_OUTDATED && + repl_state[NEW] >= L_ESTABLISHED && + get_ldev(device)) { + /* Offer all peers a resync, with the exception of ... + ... the node that made me up-to-date (with a resync) + ... I was primary + ... the peer that transitioned from primary to secondary + */ + drbd_send_uuids(peer_device, UUID_FLAG_GOT_STABLE, 0); + put_ldev(device); + } + + if (peer_disk_state[OLD] == D_UP_TO_DATE && + (peer_disk_state[NEW] == D_FAILED || peer_disk_state[NEW] == D_INCONSISTENT) && + test_and_clear_bit(NEW_CUR_UUID, &device->flags)) + /* When a peer disk goes from D_UP_TO_DATE to D_FAILED or D_INCONSISTENT + we know that a write failed on that node. 
Therefore we need to create + the new UUID right now (not wait for the next write to come in) */ + new_current_uuid = true; + + if (disk_state[OLD] > D_FAILED && disk_state[NEW] == D_FAILED && + role[NEW] == R_PRIMARY && test_and_clear_bit(NEW_CUR_UUID, &device->flags)) + new_current_uuid = true; + + if (repl_state[OLD] != L_VERIFY_S && repl_state[NEW] == L_VERIFY_S) { + drbd_info(peer_device, "Starting Online Verify from sector %llu\n", + (unsigned long long)peer_device->ov_position); + drbd_queue_work_if_unqueued( + &peer_device->connection->sender_work, + &peer_device->resync_work); + } + + if (!repl_is_sync(repl_state[OLD]) && repl_is_sync(repl_state[NEW])) + /* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */ + drbd_run_resync(peer_device, repl_state[NEW]); + + if (repl_is_sync(repl_state[OLD]) && !repl_is_sync(repl_state[NEW])) + drbd_last_resync_request(peer_device, false); + + if (peer_device_state_change->repl_state[OLD] != L_SYNC_TARGET && + peer_device_state_change->repl_state[NEW] == L_SYNC_TARGET) + drbd_queue_work_if_unqueued( + &peer_device->connection->sender_work, + &peer_device->resync_work); + + if (!(repl_is_sync_target(repl_state[OLD]) && + all_peer_replication[OLD]) && + repl_is_sync_target(repl_state[NEW]) && + all_peer_replication[NEW]) + send_flush_requests = true; + + if (!peer_device_state_change->peer_replication[OLD] && + peer_device_state_change->peer_replication[NEW]) + drbd_send_enable_replication(peer_device, true); + } + + if (((role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY) || some_peer_demoted) && + get_ldev(device)) { + /* The some_peer_demoted case is superseded by + * handle_neighbor_demotion(). We keep this call for + * compatibility until support for protocol version 121 + * is removed. + * + * No changes to the bitmap expected after this point, so write out any + * changes up to now to ensure that the metadata disk has the full + * bitmap content. Even if the bitmap changes (e.g. 
it was dual primary) + * no harm was done if it did change. */ + drbd_bitmap_io_from_worker(device, &drbd_bm_write, + "demote", BM_LOCK_SET | BM_LOCK_CLEAR | BM_LOCK_BULK, + NULL); + put_ldev(device); + } + + /* Make sure the effective disk size is stored in the metadata + * if a local disk is attached and either the local disk state + * or a peer disk state is D_UP_TO_DATE. */ + if (effective_disk_size_determined && get_ldev(device)) { + sector_t size = get_capacity(device->vdisk); + if (device->ldev->md.effective_size != size) { + char ppb[10]; + + drbd_info(device, "persisting effective size = %s (%llu KB)\n", + ppsize(ppb, size >> 1), + (unsigned long long)size >> 1); + device->ldev->md.effective_size = size; + drbd_md_mark_dirty(device); + } + put_ldev(device); + } + + /* first half of local IO error, failure to attach, + * or administrative detach */ + if ((disk_state[OLD] != D_FAILED && disk_state[NEW] == D_FAILED) || + (disk_state[OLD] != D_DETACHING && disk_state[NEW] == D_DETACHING)) { + enum drbd_io_error_p eh = EP_PASS_ON; + int was_io_error = 0; + + /* Our cleanup here with the transition to D_DISKLESS. + * It is still not safe to dereference ldev here, since + * we might come from an failed Attach before ldev was set. */ + /* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */ + if (have_ldev && device->ldev) { + rcu_read_lock(); + eh = rcu_dereference(device->ldev->disk_conf)->on_io_error; + rcu_read_unlock(); + + was_io_error = disk_state[NEW] == D_FAILED; + + /* Intentionally call this handler first, before drbd_send_state(). + * See: 2932204 drbd: call local-io-error handler early + * People may chose to hard-reset the box from this handler. + * It is useful if this looks like a "regular node crash". 
*/ + if (was_io_error && eh == EP_CALL_HELPER) + drbd_maybe_khelper(device, NULL, "local-io-error"); + + /* Immediately allow completion of all application IO, + * that waits for completion from the local disk, + * if this was a force-detach due to disk_timeout + * or administrator request (drbdsetup detach --force). + * Do NOT abort otherwise. + * Aborting local requests may cause serious problems, + * if requests are completed to upper layers already, + * and then later the already submitted local bio completes. + * This can cause DMA into former bio pages that meanwhile + * have been re-used for other things. + * So aborting local requests may cause crashes, + * or even worse, silent data corruption. + */ + if (test_and_clear_bit(FORCE_DETACH, &device->flags)) + tl_abort_disk_io(device); + + send_new_state_to_all_peer_devices(state_change, n_device); + + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_bitmap_io_from_worker(device, &drbd_bm_write, + "detach", BM_LOCK_SET | BM_LOCK_CLEAR | BM_LOCK_BULK, + NULL); + drbd_md_sync_if_dirty(device); + } + } + + /* second half of local IO error, failure to attach, + * or administrative detach, + * after local_cnt references have reached zero again */ + if (disk_state[OLD] != D_DISKLESS && disk_state[NEW] == D_DISKLESS) { + /* We must still be diskless, + * re-attach has to be serialized with this! */ + if (device->disk_state[NOW] != D_DISKLESS) + drbd_err(device, + "ASSERT FAILED: disk is %s while going diskless\n", + drbd_disk_str(device->disk_state[NOW])); + + /* we may need to cancel the md_sync timer */ + timer_delete_sync(&device->md_sync_timer); + + if (have_ldev) + send_new_state_to_all_peer_devices(state_change, n_device); + } + + if (have_ldev) + put_ldev(device); + + /* Notify peers that I had a local IO error and did not detach. 
*/ + if (disk_state[OLD] == D_UP_TO_DATE && disk_state[NEW] == D_INCONSISTENT) + send_new_state_to_all_peer_devices(state_change, n_device); + + /* Testing EMPTY_TWOPC_PENDING would cause more queuing than necessary */ + if (should_try_become_up_to_date(device, disk_state, NOW)) + try_become_up_to_date = true; + + if (test_bit(TRY_TO_GET_RESYNC, &device->flags)) { + /* Got connected to a diskless primary */ + clear_bit(TRY_TO_GET_RESYNC, &device->flags); + drbd_try_to_get_resynced(device); + } + + drbd_md_sync_if_dirty(device); + + if (role[NEW] == R_PRIMARY && have_quorum[OLD] && !have_quorum[NEW]) + drbd_maybe_khelper(device, NULL, "quorum-lost"); + + if (!susp_uuid[OLD] && susp_uuid[NEW] && + test_and_clear_bit(NEW_CUR_UUID, &device->flags)) + new_current_uuid = true; + + if (new_current_uuid) + drbd_uuid_new_current(device, false); + + if (disk_state[OLD] > D_DISKLESS && disk_state[NEW] == D_DISKLESS) + drbd_reconsider_queue_parameters(device, NULL); + } + + if (role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY) + send_role_to_all_peers(state_change); + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = &state_change->connections[n_connection]; + struct drbd_connection *connection = connection_state_change->connection; + enum drbd_conn_state *cstate = connection_state_change->cstate; + bool *susp_fen = connection_state_change->susp_fen; + enum drbd_fencing_policy fencing_policy; + + if (connection_state_change->peer_role[NEW] == R_PRIMARY && send_flush_requests && + connection->agreed_pro_version >= 123) { + u64 current_flush_sequence; + + spin_lock_irq(&resource->initiator_flush_lock); + /* Requirement: At least the value from the corresponding state change */ + current_flush_sequence = resource->current_flush_sequence; + spin_unlock_irq(&resource->initiator_flush_lock); + + drbd_send_flush_requests(connection, current_flush_sequence); + } + + /* Upon network 
configuration, we need to start the receiver */ + if (cstate[OLD] == C_STANDALONE && cstate[NEW] == C_UNCONNECTED) + drbd_thread_start(&connection->receiver); + + if (susp_fen[NEW]) + check_may_resume_io_after_fencing(state_change, n_connection); + + rcu_read_lock(); + fencing_policy = connection->fencing_policy; + rcu_read_unlock(); + if (fencing_policy != FP_DONT_CARE && + drbd_should_unfence(state_change, n_connection)) + drbd_maybe_khelper(NULL, connection, "unfence-peer"); + } + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = &state_change->connections[n_connection]; + enum drbd_conn_state *cstate = connection_state_change->cstate; + + if (cstate[NEW] == C_CONNECTED || cstate[NEW] == C_CONNECTING) + still_connected = true; + } + + if (susp_uuid[NEW]) { + unsigned long irq_flags; + + begin_state_change(resource, &irq_flags, CS_VERBOSE); + resource->susp_uuid[NEW] = false; + end_state_change(resource, &irq_flags, "susp-uuid"); + } + + if (try_become_up_to_date || healed_primary) + drbd_schedule_empty_twopc(resource); + + if (!still_connected) + mod_timer_pending(&resource->twopc_timer, jiffies); + + if (work->done) + complete(work->done); + forget_state_change(state_change); + kfree(work); + + return 0; +} + +static bool local_state_change(enum chg_state_flags flags) +{ + return flags & (CS_HARD | CS_LOCAL_ONLY); +} + +static enum drbd_state_rv +__peer_request(struct drbd_connection *connection, int vnr, + union drbd_state mask, union drbd_state val) +{ + enum drbd_state_rv rv = SS_SUCCESS; + + if (connection->cstate[NOW] == C_CONNECTED) { + enum drbd_packet cmd = (vnr == -1) ? 
P_CONN_ST_CHG_REQ : P_STATE_CHG_REQ; + if (!conn_send_state_req(connection, vnr, cmd, mask, val)) { + set_bit(TWOPC_PREPARED, &connection->flags); + rv = SS_CW_SUCCESS; + } + } + return rv; +} + +static enum drbd_state_rv __peer_reply(struct drbd_connection *connection) +{ + if (test_and_clear_bit(TWOPC_NO, &connection->flags)) + return SS_CW_FAILED_BY_PEER; + if (test_and_clear_bit(TWOPC_YES, &connection->flags) || + !test_bit(TWOPC_PREPARED, &connection->flags)) + return SS_CW_SUCCESS; + + /* This is DRBD 9.x <-> 8.4 compat code. + * Consistent with __peer_request() above: + * No more connection: fake success. */ + if (connection->cstate[NOW] != C_CONNECTED) + return SS_SUCCESS; + return SS_UNKNOWN_ERROR; +} + +static bool when_done_lock(struct drbd_resource *resource, + unsigned long *irq_flags) +{ + write_lock_irqsave(&resource->state_rwlock, *irq_flags); + if (!resource->remote_state_change && !test_bit(TWOPC_WORK_PENDING, &resource->flags)) + return true; + write_unlock_irqrestore(&resource->state_rwlock, *irq_flags); + return false; +} /** - * is_valid_transition() - Returns an SS_ error code if the state transition is not possible - * This limits hard state transitions. Hard state transitions are facts there are - * imposed on DRBD by the environment. E.g. disk broke or network broke down. - * But those hard state transitions are still not allowed to do everything. - * @ns: new state. - * @os: old state. + * complete_remote_state_change - Wait for other remote state changes to complete + * @resource: DRBD resource. + * @irq_flags: IRQ flags from begin_state_change. 
*/ +static void complete_remote_state_change(struct drbd_resource *resource, + unsigned long *irq_flags) +{ + if (resource->remote_state_change) { + enum chg_state_flags flags = resource->state_change_flags; + + begin_remote_state_change(resource, irq_flags); + for (;;) { + long t = twopc_timeout(resource); + + t = wait_event_timeout(resource->twopc_wait, + when_done_lock(resource, irq_flags), t); + if (t) + break; + if (when_done_lock(resource, irq_flags)) { + drbd_info(resource, "Two-phase commit: " + "not woken up in time\n"); + break; + } + } + __end_remote_state_change(resource, flags); + } +} + static enum drbd_state_rv -is_valid_transition(union drbd_state os, union drbd_state ns) +change_peer_state(struct drbd_connection *connection, int vnr, + union drbd_state mask, union drbd_state val, unsigned long *irq_flags) { + struct drbd_resource *resource = connection->resource; + enum chg_state_flags flags = resource->state_change_flags | CS_TWOPC; enum drbd_state_rv rv; - rv = is_valid_conn_transition(os.conn, ns.conn); + if (!expect(resource, flags & CS_SERIALIZE)) + return SS_CW_FAILED_BY_PEER; + + complete_remote_state_change(resource, irq_flags); + + resource->remote_state_change = true; + resource->twopc_reply.initiator_node_id = resource->res_opts.node_id; + resource->twopc_reply.tid = 0; + begin_remote_state_change(resource, irq_flags); + rv = __peer_request(connection, vnr, mask, val); + if (rv == SS_CW_SUCCESS) { + wait_event(resource->state_wait, + ((rv = __peer_reply(connection)) != SS_UNKNOWN_ERROR)); + clear_bit(TWOPC_PREPARED, &connection->flags); + } + end_remote_state_change(resource, irq_flags, flags); + return rv; +} + +static enum drbd_state_rv +__cluster_wide_request(struct drbd_resource *resource, struct twopc_request *request, + u64 reach_immediately) +{ + enum drbd_packet cmd = request->cmd; + struct drbd_connection *connection; + enum drbd_state_rv rv = SS_SUCCESS; + u64 im; + + for_each_connection_ref(connection, im, resource) { + u64 
mask; + int err; + + clear_bit(TWOPC_PREPARED, &connection->flags); + + if (connection->agreed_pro_version < 110) + continue; + mask = NODE_MASK(connection->peer_node_id); + if (reach_immediately & mask) + set_bit(TWOPC_PREPARED, &connection->flags); + else + continue; - /* we cannot fail (again) if we already detached */ - if (ns.disk == D_FAILED && os.disk == D_DISKLESS) - rv = SS_IS_DISKLESS; + clear_bit(TWOPC_YES, &connection->flags); + clear_bit(TWOPC_NO, &connection->flags); + clear_bit(TWOPC_RETRY, &connection->flags); + err = conn_send_twopc_request(connection, request); + if (err) { + clear_bit(TWOPC_PREPARED, &connection->flags); + wake_up(&resource->work.q_wait); + continue; + } + if (cmd == P_TWOPC_PREPARE || cmd == P_TWOPC_PREP_RSZ) + schedule_work(&connection->send_ping_work); + rv = SS_CW_SUCCESS; + } return rv; } -static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_state_warnings warn) +bool drbd_twopc_between_peer_and_me(struct drbd_connection *connection) { - static const char *msg_table[] = { - [NO_WARNING] = "", - [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", - [ABORTED_RESYNC] = "Resync aborted.", - [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", - [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", - [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", - }; + const int my_node_id = connection->resource->res_opts.node_id; + struct twopc_reply *o = &connection->resource->twopc_reply; + + return ((o->target_node_id == my_node_id || o->target_node_id == -1) && + o->initiator_node_id == connection->peer_node_id) || + ((o->target_node_id == connection->peer_node_id || o->target_node_id == -1) && + o->initiator_node_id == my_node_id); +} + +bool cluster_wide_reply_ready(struct drbd_resource *resource) +{ + struct drbd_connection *connection; + bool connect_ready = true; + bool have_no = resource->twopc_reply.state_change_failed; + bool have_retry = false; + bool all_yes = true; + 
+ if (test_bit(TWOPC_ABORT_LOCAL, &resource->flags)) + return true; + + rcu_read_lock(); + for_each_connection_rcu(connection, resource) { + if (connection->agreed_pro_version >= 118 && + !idr_is_empty(&resource->devices) && + resource->twopc_reply.is_connect && + drbd_twopc_between_peer_and_me(connection) && + !test_bit(CONN_HANDSHAKE_READY, &connection->flags)) + connect_ready = false; + + if (!test_bit(TWOPC_PREPARED, &connection->flags)) + continue; + if (test_bit(TWOPC_NO, &connection->flags)) + have_no = true; + if (test_bit(TWOPC_RETRY, &connection->flags)) + have_retry = true; + if (!test_bit(TWOPC_YES, &connection->flags)) + all_yes = false; + } + rcu_read_unlock(); + + return have_retry || (connect_ready && (have_no || all_yes)); +} + +static enum drbd_state_rv get_cluster_wide_reply(struct drbd_resource *resource, + struct change_context *context) +{ + struct drbd_connection *connection, *failed_by = NULL; + bool handshake_disconnect = false; + bool handshake_retry = false; + bool have_no = resource->twopc_reply.state_change_failed; + bool have_retry = false; + enum drbd_state_rv rv = SS_CW_SUCCESS; + + if (test_bit(TWOPC_ABORT_LOCAL, &resource->flags)) + return SS_CONCURRENT_ST_CHG; + + rcu_read_lock(); + for_each_connection_rcu(connection, resource) { + if (resource->twopc_reply.is_connect && + drbd_twopc_between_peer_and_me(connection)) { + if (test_bit(CONN_HANDSHAKE_DISCONNECT, &connection->flags)) + handshake_disconnect = true; + if (test_bit(CONN_HANDSHAKE_RETRY, &connection->flags)) + handshake_retry = true; + } + + if (!test_bit(TWOPC_PREPARED, &connection->flags)) + continue; + if (test_bit(TWOPC_NO, &connection->flags)) { + failed_by = connection; + have_no = true; + } + if (test_bit(TWOPC_RETRY, &connection->flags)) + have_retry = true; + } + + if (have_retry) + rv = SS_CONCURRENT_ST_CHG; + else if (handshake_retry) + rv = SS_HANDSHAKE_RETRY; + else if (handshake_disconnect) + rv = SS_HANDSHAKE_DISCONNECT; + else if (have_no) { + if (context 
&& failed_by) + _drbd_state_err(context, "Declined by peer %s (id: %d), see the kernel log there", + rcu_dereference(failed_by->transport.net_conf)->name, + failed_by->peer_node_id); + rv = SS_CW_FAILED_BY_PEER; + } + rcu_read_unlock(); + + if (rv == SS_CW_SUCCESS && test_bit(TWOPC_RECV_SIZES_ERR, &resource->flags)) + rv = SS_HANDSHAKE_DISCONNECT; + + return rv; +} + +static bool supports_two_phase_commit(struct drbd_resource *resource) +{ + struct drbd_connection *connection; + bool supported = true; + + rcu_read_lock(); + for_each_connection_rcu(connection, resource) { + if (connection->cstate[NOW] != C_CONNECTED) + continue; + if (connection->agreed_pro_version < 110) { + supported = false; + break; + } + } + rcu_read_unlock(); + + return supported; +} + +static struct drbd_connection *get_first_connection(struct drbd_resource *resource) +{ + struct drbd_connection *connection = NULL; + + rcu_read_lock(); + if (!list_empty(&resource->connections)) { + connection = first_connection(resource); + kref_get(&connection->kref); + } + rcu_read_unlock(); + return connection; +} + +/* That two_primaries is a connection option is one of those things of + the past, that should be cleaned up!! it should be a resource config! 
+ Here is an inaccurate heuristic */ +static bool multiple_primaries_allowed(struct drbd_resource *resource) +{ + struct drbd_connection *connection; + bool allowed = false; + struct net_conf *nc; + + rcu_read_lock(); + for_each_connection_rcu(connection, resource) { + nc = rcu_dereference(connection->transport.net_conf); + if (nc && nc->two_primaries) { + allowed = true; + break; + } + } + rcu_read_unlock(); + + return allowed; +} + +static enum drbd_state_rv +check_primaries_distances(struct drbd_resource *resource) +{ + struct twopc_reply *reply = &resource->twopc_reply; + int nr_primaries = hweight64(reply->primary_nodes); + u64 common_server; + + if (nr_primaries <= 1) + return SS_SUCCESS; + if (nr_primaries > 1 && !multiple_primaries_allowed(resource)) + return SS_TWO_PRIMARIES; + /* All primaries directly connected. Good */ + if (!(reply->primary_nodes & reply->weak_nodes)) + return SS_SUCCESS; + + /* For virtualization setups with diskless hypervisors (R_PRIMARY) and one + or multiple storage servers (R_SECONDARY) allow live-migration between the + hypervisors. */ + common_server = ~reply->weak_nodes; + if (common_server) { + int node_id; + /* Only allow if the new primary is diskless. See also far_away_change() + in drbd_receiver.c for the diskless check on the other primary */ + if ((reply->primary_nodes & NODE_MASK(resource->res_opts.node_id)) && + drbd_have_local_disk(resource)) + return SS_WEAKLY_CONNECTED; + + for (node_id = 0; node_id < DRBD_NODE_ID_MAX; node_id++) { + struct drbd_connection *connection; + struct net_conf *nc; + bool two_primaries; + + if (!(common_server & NODE_MASK(node_id))) + continue; + connection = drbd_connection_by_node_id(resource, node_id); + if (!connection) + continue; + + rcu_read_lock(); + nc = rcu_dereference(connection->transport.net_conf); + two_primaries = nc ?
nc->two_primaries : false; + rcu_read_unlock(); + + if (!two_primaries) + return SS_TWO_PRIMARIES; + } + + return SS_SUCCESS; + } + return SS_WEAKLY_CONNECTED; +} + +static enum drbd_state_rv +check_ro_cnt_and_primary(struct drbd_resource *resource) +{ + struct twopc_reply *reply = &resource->twopc_reply; + struct drbd_connection *connection; + enum drbd_state_rv rv = SS_SUCCESS; + struct net_conf *nc; + + if (drbd_open_ro_count(resource) == 0) + return rv; + + rcu_read_lock(); + for_each_connection_rcu(connection, resource) { + nc = rcu_dereference(connection->transport.net_conf); + if (!nc->two_primaries && + NODE_MASK(connection->peer_node_id) & reply->primary_nodes) { + rv = SS_PRIMARY_READER; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +long twopc_retry_timeout(struct drbd_resource *resource, int retries) +{ + struct drbd_connection *connection; + int connections = 0; + long timeout = 0; + + rcu_read_lock(); + for_each_connection_rcu(connection, resource) { + if (connection->cstate[NOW] < C_CONNECTING) + continue; + connections++; + } + rcu_read_unlock(); + + if (connections > 0) { + if (retries > 5) + retries = 5; + timeout = resource->res_opts.twopc_retry_timeout * + HZ / 10 * connections * (1 << retries); + timeout = get_random_u32_below(timeout); + } + return timeout; +} + +void abort_connect(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + if (test_and_clear_bit(HOLDING_UUID_READ_LOCK, &peer_device->flags)) + up_read_non_owner(&peer_device->device->uuid_sem); + clear_bit(INITIAL_STATE_SENT, &peer_device->flags); + clear_bit(INITIAL_STATE_RECEIVED, &peer_device->flags); + clear_bit(UUIDS_RECEIVED, &peer_device->flags); + clear_bit(CURRENT_UUID_RECEIVED, &peer_device->flags); + } + rcu_read_unlock(); +} + +static void twopc_phase2(struct drbd_resource *resource, + struct twopc_request *request, + u64 
reach_immediately) +{ + struct drbd_connection *connection; + u64 im; + + for_each_connection_ref(connection, im, resource) { + u64 mask = NODE_MASK(connection->peer_node_id); + if (!(reach_immediately & mask)) + continue; + + conn_send_twopc_request(connection, request); + } +} + +void drbd_print_cluster_wide_state_change(struct drbd_resource *resource, const char *message, + unsigned int tid, unsigned int initiator_node_id, int target_node_id, + union drbd_state mask, union drbd_state val) +{ + char buffer[150], *b, *end = buffer + sizeof(buffer); + + b = buffer; + b += scnprintf(b, end - b, "%u->", initiator_node_id); + if (target_node_id == -1) + b += scnprintf(b, end - b, "all"); + else + b += scnprintf(b, end - b, "%d", target_node_id); + + if (mask.role) + b += scnprintf(b, end - b, " role( %s )", drbd_role_str(val.role)); + + if (mask.peer) + b += scnprintf(b, end - b, " peer( %s )", drbd_role_str(val.peer)); + + if (mask.conn) { + if (val.conn > C_CONNECTED) + b += scnprintf(b, end - b, " repl( %s )", drbd_repl_str(val.conn)); + else + b += scnprintf(b, end - b, " conn( %s )", drbd_conn_str(val.conn)); + } + + if (mask.disk) + b += scnprintf(b, end - b, " disk( %s )", drbd_disk_str(val.disk)); + + if (mask.pdsk) + b += scnprintf(b, end - b, " pdsk( %s )", drbd_disk_str(val.pdsk)); + + // Any of "susp-io( user )", "susp-io( quorum )" or "susp-io( uuid )" + if (mask.susp) + b += scnprintf(b, end - b, " %ssusp-io", val.susp ? "+" : "-"); + + if (mask.susp_nod) + b += scnprintf(b, end - b, " susp-io( %sno-disk )", val.susp_nod ? "+" : "-"); + + if (mask.susp_fen) + b += scnprintf(b, end - b, " susp-io( %sfencing )", val.susp_fen ? "+" : "-"); + + if (mask.user_isp) + b += scnprintf(b, end - b, " resync-susp( %suser )", val.user_isp ? "+" : "-"); + + if (mask.peer_isp) + b += scnprintf(b, end - b, " resync-susp( %speer )", val.peer_isp ? "+" : "-"); + + if (mask.aftr_isp) + b += scnprintf(b, end - b, " resync-susp( %safter dependency )", + val.aftr_isp ? 
"+" : "-"); - if (warn != NO_WARNING) - drbd_warn(device, "%s\n", msg_table[warn]); + if (!mask.i) + b += scnprintf(b, end - b, " empty"); + + drbd_info(resource, "%s %u: %s\n", message, tid, buffer); } /** - * sanitize_state() - Resolves implicitly necessary additional changes to a state transition - * @device: DRBD device. - * @os: old state. - * @ns: new state. - * @warn: placeholder for returned state warning. + * change_cluster_wide_state - Cluster-wide two-phase commit + * @change: The callback function that does the actual state change. + * @context: State change context. + * @tag: State change tag to print in status messages. + * + * Perform a two-phase commit transaction among all (reachable) nodes in the + * cluster. In our transaction model, the initiator of a transaction is also + * the coordinator. + * + * In phase one of the transaction, the coordinator sends all nodes in the + * cluster a P_TWOPC_PREPARE packet. Each node replies with either P_TWOPC_YES + * if it consents or with P_TWOPC_NO if it denies the transaction. Once all + * replies have been received, the coordinator sends all nodes in the cluster a + * P_TWOPC_COMMIT or P_TWOPC_ABORT packet to finish the transaction. + * + * When a node in the cluster is busy with another transaction, it replies with + * P_TWOPC_NO. The coordinator is then responsible for retrying the + * transaction. * - * When we loose connection, we have to set the state of the peers disk (pdsk) - * to D_UNKNOWN. This rule and many more along those lines are in this function. + * Since a cluster is not guaranteed to always be fully connected, some nodes + * will not be directly reachable from other nodes. In order to still reach + * all nodes in the cluster, participants will forward requests to nodes which + * haven't received the request yet: + * + * The nodes_to_reach field in requests indicates which nodes have received the + * request already. 
Before forwarding a request to a peer, a node removes + * itself and its directly connected nodes from nodes_to_reach; it then sends + * the request to those directly connected nodes that were in nodes_to_reach. + * + * If there are redundant paths in the cluster, requests will reach some nodes + * more than once. Nodes remember when they are taking part in a transaction; + * they detect duplicate requests and reply to them with P_TWOPC_YES packets. + * (Transactions are identified by the node id of the initiator and a random, + * unique-enough transaction identifier.) + * + * A configurable timeout determines how long a coordinator or participant will + * wait for a transaction to finish. A transaction that times out is assumed + * to have aborted. */ -static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum sanitize_state_warnings *warn) +static enum drbd_state_rv +change_cluster_wide_state(bool (*change)(struct change_context *, enum change_phase), + struct change_context *context, const char *tag) { - enum drbd_fencing_p fp; - enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; + struct drbd_resource *resource = context->resource; + unsigned long irq_flags; + struct twopc_request request; + struct twopc_reply *reply = &resource->twopc_reply; + struct drbd_connection *connection, *target_connection = NULL; + enum drbd_state_rv rv; + u64 reach_immediately; + int retries = 1; + unsigned long start_time; + bool have_peers; + + begin_state_change(resource, &irq_flags, context->flags | CS_LOCAL_ONLY); + resource->state_change_err_str = context->err_str; + + if (local_state_change(context->flags)) { + /* Not a cluster-wide state change. */ + change(context, PH_LOCAL_COMMIT); + return end_state_change(resource, &irq_flags, tag); + } else { + if (!change(context, PH_PREPARE)) { + /* Not a cluster-wide state change.
*/ + return end_state_change(resource, &irq_flags, tag); + } + rv = try_state_change(resource); + if (rv != SS_SUCCESS) { + /* Failure or nothing to do. */ + /* abort_state_change(resource, &irq_flags); */ + if (rv == SS_NOTHING_TO_DO) + resource->state_change_flags &= ~CS_VERBOSE; + return __end_state_change(resource, &irq_flags, rv, tag); + } + /* Really a cluster-wide state change. */ + } + + if (!supports_two_phase_commit(resource)) { + connection = get_first_connection(resource); + rv = SS_SUCCESS; + if (connection) { + rv = change_peer_state(connection, context->vnr, context->mask, context->val, &irq_flags); + kref_put(&connection->kref, drbd_destroy_connection); + } + if (rv >= SS_SUCCESS) + change(context, PH_84_COMMIT); + return __end_state_change(resource, &irq_flags, rv, tag); + } + + if (!expect(resource, context->flags & CS_SERIALIZE || context->mask.i == 0)) { + rv = SS_CW_FAILED_BY_PEER; + return __end_state_change(resource, &irq_flags, rv, tag); + } + + rcu_read_lock(); + for_each_connection_rcu(connection, resource) { + if (!expect(connection, current != connection->receiver.task)) { + rcu_read_unlock(); + BUG(); + } + } + rcu_read_unlock(); + + retry: + if (current == resource->worker.task && resource->remote_state_change) + return __end_state_change(resource, &irq_flags, SS_CONCURRENT_ST_CHG, tag); + + complete_remote_state_change(resource, &irq_flags); + start_time = jiffies; + resource->state_change_err_str = context->err_str; + + *reply = (struct twopc_reply) { 0 }; + + reach_immediately = directly_connected_nodes(resource, NOW); + if (context->target_node_id != -1) { + struct drbd_connection *connection; + + /* Fail if the target node is no longer directly reachable. 
*/ + connection = drbd_get_connection_by_node_id(resource, context->target_node_id); + if (!connection) { + rv = SS_NEED_CONNECTION; + return __end_state_change(resource, &irq_flags, rv, tag); + } - if (warn) - *warn = NO_WARNING; + if (!(connection->cstate[NOW] == C_CONNECTED || + (connection->cstate[NOW] == C_CONNECTING && + context->mask.conn == conn_MASK && + context->val.conn == C_CONNECTED))) { + rv = SS_NEED_CONNECTION; - fp = FP_DONT_CARE; - if (get_ldev(device)) { - rcu_read_lock(); - fp = rcu_dereference(device->ldev->disk_conf)->fencing; - rcu_read_unlock(); - put_ldev(device); + kref_put(&connection->kref, drbd_destroy_connection); + return __end_state_change(resource, &irq_flags, rv, tag); + } + target_connection = connection; + + /* For connect transactions, add the target node id. */ + reach_immediately |= NODE_MASK(context->target_node_id); } - /* Implications from connection to peer and peer_isp */ - if (ns.conn < C_CONNECTED) { - ns.peer_isp = 0; - ns.peer = R_UNKNOWN; - if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) - ns.pdsk = D_UNKNOWN; + do + reply->tid = get_random_u32(); + while (!reply->tid); + + clear_bit(TWOPC_RECV_SIZES_ERR, &resource->flags); + request.tid = reply->tid; + request.initiator_node_id = resource->res_opts.node_id; + request.target_node_id = context->target_node_id; + request.nodes_to_reach = ~(reach_immediately | NODE_MASK(resource->res_opts.node_id)); + request.vnr = context->vnr; + request.cmd = P_TWOPC_PREPARE; + request.flags = TWOPC_HAS_REACHABLE; + + resource->twopc.type = TWOPC_STATE_CHANGE; + resource->twopc.state_change.mask = context->mask; + resource->twopc.state_change.val = context->val; + resource->twopc.state_change.primary_nodes = 0; + resource->twopc.state_change.reachable_nodes = 0; + resource->twopc_parent_nodes = 0; + resource->remote_state_change = true; + + drbd_print_cluster_wide_state_change(resource, "Preparing cluster-wide state change", + request.tid, resource->res_opts.node_id, 
context->target_node_id, + context->mask, context->val); + + reply->initiator_node_id = resource->res_opts.node_id; + reply->target_node_id = context->target_node_id; + + reply->reachable_nodes = directly_connected_nodes(resource, NOW) | + NODE_MASK(resource->res_opts.node_id); + if (context->mask.conn == conn_MASK && context->val.conn == C_CONNECTED) { + reply->reachable_nodes |= NODE_MASK(context->target_node_id); + reply->target_reachable_nodes = reply->reachable_nodes; + reply->is_connect = 1; + drbd_init_connect_state(target_connection); + } else if (context->mask.conn == conn_MASK && context->val.conn == C_DISCONNECTING) { + reply->target_reachable_nodes = NODE_MASK(context->target_node_id); + reply->reachable_nodes &= ~reply->target_reachable_nodes; + reply->is_disconnect = 1; + } else { + reply->target_reachable_nodes = reply->reachable_nodes; } - /* Clear the aftr_isp when becoming unconfigured */ - if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) - ns.aftr_isp = 0; + D_ASSERT(resource, !test_bit(TWOPC_WORK_PENDING, &resource->flags)); + begin_remote_state_change(resource, &irq_flags); + rv = __cluster_wide_request(resource, &request, reach_immediately); - /* An implication of the disk states onto the connection state */ - /* Abort resync if a disk fails/detaches */ - if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { - if (warn) - *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ? - ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; - ns.conn = C_CONNECTED; - } + /* If we are changing state attached to a particular connection then we + * expect that connection to remain connected. A failure to send + * P_TWOPC_PREPARE on that connection is a failure for the whole + * cluster-wide state change. 
*/ + if (target_connection && !test_bit(TWOPC_PREPARED, &target_connection->flags)) + rv = SS_NEED_CONNECTION; - /* Connection breaks down before we finished "Negotiating" */ - if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && - get_ldev_if_state(device, D_NEGOTIATING)) { - if (device->ed_uuid == device->ldev->md.uuid[UI_CURRENT]) { - ns.disk = device->new_state_tmp.disk; - ns.pdsk = device->new_state_tmp.pdsk; - } else { - if (warn) - *warn = CONNECTION_LOST_NEGOTIATING; - ns.disk = D_DISKLESS; - ns.pdsk = D_UNKNOWN; + have_peers = rv == SS_CW_SUCCESS; + if (have_peers) { + long t; + + if (context->mask.conn == conn_MASK && context->val.conn == C_CONNECTED && + target_connection->agreed_pro_version >= 118) + conn_connect2(target_connection); + + t = wait_event_interruptible_timeout(resource->state_wait, + cluster_wide_reply_ready(resource), + twopc_timeout(resource)); + if (t > 0) + rv = get_cluster_wide_reply(resource, context); + else + rv = t == 0 ? SS_TIMEOUT : SS_INTERRUPTED; + + /* while waiting for the replies, reach_immediately might have changed. 
*/ + reach_immediately = directly_connected_nodes(resource, NOW); + if (target_connection && target_connection->cstate[NOW] == C_CONNECTING) + reach_immediately |= NODE_MASK(context->target_node_id); + + request.nodes_to_reach = + ~(reach_immediately | NODE_MASK(resource->res_opts.node_id)); + + if (rv == SS_CW_SUCCESS) { + u64 directly_reachable = reach_immediately | + NODE_MASK(resource->res_opts.node_id); + + if (context->mask.conn == conn_MASK && context->val.conn == C_DISCONNECTING) + directly_reachable &= ~NODE_MASK(context->target_node_id); + + if ((context->mask.role == role_MASK && context->val.role == R_PRIMARY) || + (context->mask.role != role_MASK && resource->role[NOW] == R_PRIMARY)) { + reply->primary_nodes |= NODE_MASK(resource->res_opts.node_id); + if (drbd_res_data_accessible(resource)) + reply->weak_nodes |= ~directly_reachable; + } + + /* + * When a node is Primary and has access to UpToDate data, it sets + * weak_nodes to the mask of those it is not connected to. This includes the + * bits for nodes which are not configured, so will always have some set + * bits. Thus if there is a Primary node and no bits are set in weak_nodes, + * the Primary cannot have access to UpToDate data. 
+ */ + if (reply->primary_nodes && !reply->weak_nodes) + request.flags |= TWOPC_PRI_INCAPABLE; + + drbd_info(resource, "State change %u: primary_nodes=%lX, weak_nodes=%lX\n", + reply->tid, (unsigned long)reply->primary_nodes, + (unsigned long)reply->weak_nodes); + + if ((context->mask.role == role_MASK && context->val.role == R_PRIMARY) || + (context->mask.conn == conn_MASK && context->val.conn == C_CONNECTED)) + rv = check_primaries_distances(resource); + + if (rv >= SS_SUCCESS && + context->mask.conn == conn_MASK && context->val.conn == C_CONNECTED) + rv = check_ro_cnt_and_primary(resource); + + if (!(context->mask.conn == conn_MASK && context->val.conn == C_DISCONNECTING) || + (reply->reachable_nodes & reply->target_reachable_nodes)) { + /* The cluster is still connected after this + * transaction: either this transaction does + * not disconnect a connection, or there are + * redundant connections. */ + + u64 m; + + m = reply->reachable_nodes | reply->target_reachable_nodes; + reply->reachable_nodes = m; + reply->target_reachable_nodes = m; + } else { + rcu_read_lock(); + for_each_connection_rcu(connection, resource) { + int node_id = connection->peer_node_id; + + if (node_id == context->target_node_id) { + drbd_info(connection, "Cluster is now split\n"); + break; + } + } + rcu_read_unlock(); + } + + resource->twopc.state_change.primary_nodes = reply->primary_nodes; + resource->twopc.state_change.reachable_nodes = + reply->target_reachable_nodes; } - put_ldev(device); - } - /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ - if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { - if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) - ns.disk = D_UP_TO_DATE; - if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) - ns.pdsk = D_UP_TO_DATE; - } - - /* Implications of the connection state on the disk states */ - disk_min = D_DISKLESS; - disk_max = D_UP_TO_DATE; - pdsk_min = D_INCONSISTENT; - pdsk_max = D_UNKNOWN; - switch ((enum drbd_conns)ns.conn) { - 
case C_WF_BITMAP_T: - case C_PAUSED_SYNC_T: - case C_STARTING_SYNC_T: - case C_WF_SYNC_UUID: - case C_BEHIND: - disk_min = D_INCONSISTENT; - disk_max = D_OUTDATED; - pdsk_min = D_UP_TO_DATE; - pdsk_max = D_UP_TO_DATE; - break; - case C_VERIFY_S: - case C_VERIFY_T: - disk_min = D_UP_TO_DATE; - disk_max = D_UP_TO_DATE; - pdsk_min = D_UP_TO_DATE; - pdsk_max = D_UP_TO_DATE; - break; - case C_CONNECTED: - disk_min = D_DISKLESS; - disk_max = D_UP_TO_DATE; - pdsk_min = D_DISKLESS; - pdsk_max = D_UP_TO_DATE; - break; - case C_WF_BITMAP_S: - case C_PAUSED_SYNC_S: - case C_STARTING_SYNC_S: - case C_AHEAD: - disk_min = D_UP_TO_DATE; - disk_max = D_UP_TO_DATE; - pdsk_min = D_INCONSISTENT; - pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/ - break; - case C_SYNC_TARGET: - disk_min = D_INCONSISTENT; - disk_max = D_INCONSISTENT; - pdsk_min = D_UP_TO_DATE; - pdsk_max = D_UP_TO_DATE; - break; - case C_SYNC_SOURCE: - disk_min = D_UP_TO_DATE; - disk_max = D_UP_TO_DATE; - pdsk_min = D_INCONSISTENT; - pdsk_max = D_INCONSISTENT; - break; - case C_STANDALONE: - case C_DISCONNECTING: - case C_UNCONNECTED: - case C_TIMEOUT: - case C_BROKEN_PIPE: - case C_NETWORK_FAILURE: - case C_PROTOCOL_ERROR: - case C_TEAR_DOWN: - case C_WF_CONNECTION: - case C_WF_REPORT_PARAMS: - case C_MASK: - break; - } - if (ns.disk > disk_max) - ns.disk = disk_max; + if (context->mask.conn == conn_MASK && context->val.conn == C_CONNECTED && + target_connection->agreed_pro_version >= 118) { + wait_initial_states_received(target_connection); - if (ns.disk < disk_min) { - if (warn) - *warn = IMPLICITLY_UPGRADED_DISK; - ns.disk = disk_min; + if (rv >= SS_SUCCESS && test_bit(TWOPC_RECV_SIZES_ERR, &resource->flags)) + rv = SS_HANDSHAKE_DISCONNECT; + } } - if (ns.pdsk > pdsk_max) - ns.pdsk = pdsk_max; - if (ns.pdsk < pdsk_min) { - if (warn) - *warn = IMPLICITLY_UPGRADED_PDSK; - ns.pdsk = pdsk_min; + request.cmd = rv >= SS_SUCCESS ? 
P_TWOPC_COMMIT : P_TWOPC_ABORT; + if (rv < SS_SUCCESS && target_connection) + abort_connect(target_connection); + + if ((rv == SS_TIMEOUT || rv == SS_CONCURRENT_ST_CHG) && + !(context->flags & CS_DONT_RETRY)) { + long timeout = twopc_retry_timeout(resource, retries++); + drbd_info(resource, "Retrying cluster-wide state change after %ums\n", + jiffies_to_msecs(timeout)); + if (have_peers) + twopc_phase2(resource, &request, reach_immediately); + if (target_connection) { + kref_put(&target_connection->kref, drbd_destroy_connection); + target_connection = NULL; + } + clear_remote_state_change(resource); + schedule_timeout_interruptible(timeout); + end_remote_state_change(resource, &irq_flags, context->flags | CS_TWOPC); + goto retry; } - if (fp == FP_STONITH && - (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && - !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) - ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ - - if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO && - (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && - !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) - ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ + if (rv >= SS_SUCCESS) + drbd_info(resource, "Committing cluster-wide state change %u (%ums)\n", + request.tid, + jiffies_to_msecs(jiffies - start_time)); + else + drbd_info(resource, "Aborting cluster-wide state change %u (%ums) rv = %d\n", + request.tid, + jiffies_to_msecs(jiffies - start_time), + rv); + + if (have_peers && context->change_local_state_last) { + set_bit(TWOPC_STATE_CHANGE_PENDING, &resource->flags); + twopc_phase2(resource, &request, reach_immediately); + } - if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { - if (ns.conn == C_SYNC_SOURCE) - ns.conn = C_PAUSED_SYNC_S; - if (ns.conn == C_SYNC_TARGET) - ns.conn = C_PAUSED_SYNC_T; + 
end_remote_state_change(resource, &irq_flags, context->flags | CS_TWOPC); + clear_bit(TWOPC_STATE_CHANGE_PENDING, &resource->flags); + if (rv >= SS_SUCCESS) { + change(context, PH_COMMIT); + rv = end_state_change(resource, &irq_flags, tag); + if (rv < SS_SUCCESS) + drbd_err(resource, "FATAL: Local commit of already committed %u failed! \n", + request.tid); } else { - if (ns.conn == C_PAUSED_SYNC_S) - ns.conn = C_SYNC_SOURCE; - if (ns.conn == C_PAUSED_SYNC_T) - ns.conn = C_SYNC_TARGET; + abort_state_change(resource, &irq_flags); } - return ns; -} + if (have_peers && !context->change_local_state_last) + twopc_phase2(resource, &request, reach_immediately); -void drbd_resume_al(struct drbd_device *device) -{ - if (test_and_clear_bit(AL_SUSPENDED, &device->flags)) - drbd_info(device, "Resumed AL updates\n"); + if (target_connection) { + kref_put(&target_connection->kref, drbd_destroy_connection); + } + return rv; } -/* helper for _drbd_set_state */ -static void set_ov_position(struct drbd_peer_device *peer_device, enum drbd_conns cs) +enum determine_dev_size +change_cluster_wide_device_size(struct drbd_device *device, + sector_t local_max_size, + uint64_t new_user_size, + enum dds_flags dds_flags, + struct resize_parms *rs) { - struct drbd_device *device = peer_device->device; + struct drbd_resource *resource = device->resource; + struct twopc_reply *reply = &resource->twopc_reply; + struct twopc_request request; + unsigned long start_time; + unsigned long irq_flags; + enum drbd_state_rv rv; + enum determine_dev_size dd; + u64 reach_immediately; + bool have_peers, commit_it; + sector_t new_size = 0; + int retries = 1; + +retry: + rv = drbd_support_2pc_resize(resource); + if (rv < SS_SUCCESS) + return DS_2PC_NOT_SUPPORTED; - if (peer_device->connection->agreed_pro_version < 90) - device->ov_start_sector = 0; - device->rs_total = drbd_bm_bits(device); - device->ov_position = 0; - if (cs == C_VERIFY_T) { - /* starting online verify from an arbitrary position - * does not 
fit well into the existing protocol. - * on C_VERIFY_T, we initialize ov_left and friends - * implicitly in receive_DataRequest once the - * first P_OV_REQUEST is received */ - device->ov_start_sector = ~(sector_t)0; - } else { - unsigned long bit = BM_SECT_TO_BIT(device->ov_start_sector); - if (bit >= device->rs_total) { - device->ov_start_sector = - BM_BIT_TO_SECT(device->rs_total - 1); - device->rs_total = 1; - } else - device->rs_total -= bit; - device->ov_position = device->ov_start_sector; - } - device->ov_left = device->rs_total; -} + state_change_lock(resource, &irq_flags, CS_VERBOSE | CS_LOCAL_ONLY); + rcu_read_lock(); + complete_remote_state_change(resource, &irq_flags); + start_time = jiffies; + reach_immediately = directly_connected_nodes(resource, NOW); + + *reply = (struct twopc_reply) { 0 }; + + do + reply->tid = get_random_u32(); + while (!reply->tid); + + request.tid = reply->tid; + request.initiator_node_id = resource->res_opts.node_id; + request.target_node_id = -1; + request.nodes_to_reach = ~(reach_immediately | NODE_MASK(resource->res_opts.node_id)); + request.vnr = device->vnr; + request.cmd = P_TWOPC_PREP_RSZ; + request.flags = 0; + resource->twopc.type = TWOPC_RESIZE; + resource->twopc.resize.dds_flags = dds_flags; + resource->twopc.resize.user_size = new_user_size; + resource->twopc.resize.diskful_primary_nodes = 0; + resource->twopc.resize.new_size = 0; + resource->twopc_parent_nodes = 0; + resource->remote_state_change = true; + + reply->initiator_node_id = resource->res_opts.node_id; + reply->target_node_id = -1; + reply->max_possible_size = local_max_size; + reply->reachable_nodes = reach_immediately | NODE_MASK(resource->res_opts.node_id); + reply->target_reachable_nodes = reply->reachable_nodes; + if (resource->role[NOW] == R_PRIMARY) + reply->diskful_primary_nodes = NODE_MASK(resource->res_opts.node_id); + rcu_read_unlock(); + state_change_unlock(resource, &irq_flags); -/** - * _drbd_set_state() - Set a new DRBD state - * @device: 
DRBD device. - * @ns: new state. - * @flags: Flags - * @done: Optional completion, that will get completed after the after_state_ch() finished - * - * Caller needs to hold req_lock. Do not call directly. - */ -enum drbd_state_rv -_drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) -{ - struct drbd_peer_device *peer_device = first_peer_device(device); - struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; - union drbd_state os; - enum drbd_state_rv rv = SS_SUCCESS; - enum sanitize_state_warnings ssw; - struct after_state_chg_work *ascw; - struct drbd_state_change *state_change; + drbd_info(device, "Preparing cluster-wide size change %u " + "(local_max_size = %llu KB, user_cap = %llu KB)\n", + request.tid, + (unsigned long long)local_max_size >> 1, + (unsigned long long)new_user_size >> 1); - os = drbd_read_state(device); + rv = __cluster_wide_request(resource, &request, reach_immediately); - ns = sanitize_state(device, os, ns, &ssw); - if (ns.i == os.i) - return SS_NOTHING_TO_DO; + have_peers = rv == SS_CW_SUCCESS; + if (have_peers) { + if (wait_event_timeout(resource->state_wait, + cluster_wide_reply_ready(resource), + twopc_timeout(resource))) + rv = get_cluster_wide_reply(resource, NULL); + else + rv = SS_TIMEOUT; - rv = is_valid_transition(os, ns); - if (rv < SS_SUCCESS) - return rv; + if (rv == SS_TIMEOUT || rv == SS_CONCURRENT_ST_CHG) { + long timeout = twopc_retry_timeout(resource, retries++); - if (!(flags & CS_HARD)) { - /* pre-state-change checks ; only look at ns */ - /* See drbd_state_sw_errors in drbd_strings.c */ + drbd_info(device, "Retrying cluster-wide size change after %ums\n", + jiffies_to_msecs(timeout)); - rv = is_valid_state(device, ns); - if (rv < SS_SUCCESS) { - /* If the old state was illegal as well, then let - this happen...*/ + request.cmd = P_TWOPC_ABORT; + twopc_phase2(resource, &request, reach_immediately); - if (is_valid_state(device, os) 
== rv) - rv = is_valid_soft_transition(os, ns, connection); - } else - rv = is_valid_soft_transition(os, ns, connection); + clear_remote_state_change(resource); + schedule_timeout_interruptible(timeout); + goto retry; + } } - if (rv < SS_SUCCESS) { - if (flags & CS_VERBOSE) - print_st_err(device, os, ns, rv); - return rv; + if (rv >= SS_SUCCESS) { + new_size = drbd_new_dev_size(device, reply->max_possible_size, + new_user_size, dds_flags | DDSF_2PC); + commit_it = new_size != get_capacity(device->vdisk); + + if (commit_it) { + resource->twopc.resize.new_size = new_size; + resource->twopc.resize.diskful_primary_nodes = reply->diskful_primary_nodes; + drbd_info(device, "Committing cluster-wide size change %u (%ums)\n", + request.tid, + jiffies_to_msecs(jiffies - start_time)); + } else { + drbd_info(device, "Aborting cluster-wide size change %u (%ums) size unchanged\n", + request.tid, + jiffies_to_msecs(jiffies - start_time)); + } + } else { + commit_it = false; + drbd_info(device, "Aborting cluster-wide size change %u (%ums) rv = %d\n", + request.tid, + jiffies_to_msecs(jiffies - start_time), + rv); } - print_sanitize_warnings(device, ssw); + request.cmd = commit_it ? P_TWOPC_COMMIT : P_TWOPC_ABORT; + if (have_peers) + twopc_phase2(resource, &request, reach_immediately); - drbd_pr_state_change(device, os, ns, flags); + if (commit_it) { + struct twopc_resize *tr = &resource->twopc.resize; - /* Display changes to the susp* flags that where caused by the call to - sanitize_state(). Only display it here if we where not called from - _conn_request_state() */ - if (!(flags & CS_DC_SUSP)) - conn_pr_state_change(connection, os, ns, - (flags & ~CS_DC_MASK) | CS_DC_SUSP); + tr->diskful_primary_nodes = reply->diskful_primary_nodes; + tr->new_size = new_size; + tr->dds_flags = dds_flags; + tr->user_size = new_user_size; - /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference - * on the ldev here, to be sure the transition -> D_DISKLESS resp. 
- * drbd_ldev_destroy() won't happen before our corresponding - * after_state_ch works run, where we put_ldev again. */ - if ((os.disk != D_FAILED && ns.disk == D_FAILED) || - (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) - atomic_inc(&device->local_cnt); + dd = drbd_commit_size_change(device, rs, reach_immediately); + } else { + if (rv == SS_CW_FAILED_BY_PEER) + dd = DS_2PC_NOT_SUPPORTED; + else if (rv >= SS_SUCCESS) + dd = DS_UNCHANGED; + else + dd = DS_2PC_ERR; + } - if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) - clear_bit(RS_DONE, &device->flags); + clear_remote_state_change(resource); + return dd; +} - /* FIXME: Have any flags been set earlier in this function already? */ - state_change = remember_old_state(device->resource, GFP_ATOMIC); +static void twopc_end_nested(struct drbd_resource *resource, enum drbd_packet cmd) +{ + struct drbd_connection *twopc_parent; + u64 im; + struct twopc_reply twopc_reply; + u64 twopc_parent_nodes = 0; + + write_lock_irq(&resource->state_rwlock); + twopc_reply = resource->twopc_reply; + /* Only send replies if we are in a twopc and have not yet sent replies. */ + if (twopc_reply.tid && resource->twopc_prepare_reply_cmd == 0) { + resource->twopc_prepare_reply_cmd = cmd; + twopc_parent_nodes = resource->twopc_parent_nodes; + } + clear_bit(TWOPC_WORK_PENDING, &resource->flags); + write_unlock_irq(&resource->state_rwlock); - /* changes to local_cnt and device flags should be visible before - * changes to state, which again should be visible before anything else - * depending on that change happens. 
*/ - smp_wmb(); - device->state.i = ns.i; - device->resource->susp = ns.susp; - device->resource->susp_nod = ns.susp_nod; - device->resource->susp_fen = ns.susp_fen; - smp_wmb(); + if (!twopc_reply.tid) + return; - remember_new_state(state_change); + for_each_connection_ref(twopc_parent, im, resource) { + if (!(twopc_parent_nodes & NODE_MASK(twopc_parent->peer_node_id))) + continue; - /* put replicated vs not-replicated requests in seperate epochs */ - if (drbd_should_do_remote((union drbd_dev_state)os.i) != - drbd_should_do_remote((union drbd_dev_state)ns.i)) - start_new_tl_epoch(connection); + if (twopc_reply.is_disconnect) + set_bit(DISCONNECT_EXPECTED, &twopc_parent->flags); - if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) - drbd_print_uuids(device, "attached to UUIDs"); + dynamic_drbd_dbg(twopc_parent, "Nested state change %u result: %s\n", + twopc_reply.tid, drbd_packet_name(cmd)); - /* Wake up role changes, that were delayed because of connection establishing */ - if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && - no_peer_wf_report_params(connection)) { - clear_bit(STATE_SENT, &connection->flags); - wake_up_all_devices(connection); + drbd_send_twopc_reply(twopc_parent, cmd, &twopc_reply); } + wake_up_all(&resource->twopc_wait); +} - wake_up(&device->misc_wait); - wake_up(&device->state_wait); - wake_up(&connection->ping_wait); - - /* Aborted verify run, or we reached the stop sector. - * Log the last position, unless end-of-device. 
*/ - if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && - ns.conn <= C_CONNECTED) { - device->ov_start_sector = - BM_BIT_TO_SECT(drbd_bm_bits(device) - device->ov_left); - if (device->ov_left) - drbd_info(device, "Online Verify reached sector %llu\n", - (unsigned long long)device->ov_start_sector); - } +static void __nested_twopc_work(struct drbd_resource *resource) +{ + enum drbd_state_rv rv; + enum drbd_packet cmd; - if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && - (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { - drbd_info(device, "Syncer continues.\n"); - device->rs_paused += (long)jiffies - -(long)device->rs_mark_time[device->rs_last_mark]; - if (ns.conn == C_SYNC_TARGET) - mod_timer(&device->resync_timer, jiffies); - } + rv = get_cluster_wide_reply(resource, NULL); + if (rv >= SS_SUCCESS) + cmd = P_TWOPC_YES; + else if (rv == SS_CONCURRENT_ST_CHG || rv == SS_HANDSHAKE_RETRY) + cmd = P_TWOPC_RETRY; + else + cmd = P_TWOPC_NO; + twopc_end_nested(resource, cmd); +} - if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && - (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { - drbd_info(device, "Resync suspended\n"); - device->rs_mark_time[device->rs_last_mark] = jiffies; - } +void nested_twopc_work(struct work_struct *work) +{ + struct drbd_resource *resource = + container_of(work, struct drbd_resource, twopc_work); - if (os.conn == C_CONNECTED && - (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { - unsigned long now = jiffies; - int i; + __nested_twopc_work(resource); - set_ov_position(peer_device, ns.conn); - device->rs_start = now; - device->rs_last_sect_ev = 0; - device->ov_last_oos_size = 0; - device->ov_last_oos_start = 0; + kref_put(&resource->kref, drbd_destroy_resource); +} - for (i = 0; i < DRBD_SYNC_MARKS; i++) { - device->rs_mark_left[i] = device->ov_left; - device->rs_mark_time[i] = now; - } +void drbd_maybe_cluster_wide_reply(struct drbd_resource *resource) +{ + 
lockdep_assert_held(&resource->state_rwlock); - drbd_rs_controller_reset(peer_device); + if (!resource->remote_state_change || !cluster_wide_reply_ready(resource)) + return; - if (ns.conn == C_VERIFY_S) { - drbd_info(device, "Starting Online Verify from sector %llu\n", - (unsigned long long)device->ov_position); - mod_timer(&device->resync_timer, jiffies); - } + if (resource->twopc_reply.initiator_node_id == resource->res_opts.node_id) { + wake_up_all(&resource->state_wait); + return; } - if (get_ldev(device)) { - u32 mdf = device->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| - MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| - MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); - - mdf &= ~MDF_AL_CLEAN; - if (test_bit(CRASHED_PRIMARY, &device->flags)) - mdf |= MDF_CRASHED_PRIMARY; - if (device->state.role == R_PRIMARY || - (device->state.pdsk < D_INCONSISTENT && device->state.peer == R_PRIMARY)) - mdf |= MDF_PRIMARY_IND; - if (device->state.conn > C_WF_REPORT_PARAMS) - mdf |= MDF_CONNECTED_IND; - if (device->state.disk > D_INCONSISTENT) - mdf |= MDF_CONSISTENT; - if (device->state.disk > D_OUTDATED) - mdf |= MDF_WAS_UP_TO_DATE; - if (device->state.pdsk <= D_OUTDATED && device->state.pdsk >= D_INCONSISTENT) - mdf |= MDF_PEER_OUT_DATED; - if (mdf != device->ldev->md.flags) { - device->ldev->md.flags = mdf; - drbd_md_mark_dirty(device); - } - if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) - drbd_set_ed_uuid(device, device->ldev->md.uuid[UI_CURRENT]); - put_ldev(device); - } + if (test_and_set_bit(TWOPC_WORK_PENDING, &resource->flags)) + return; - /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ - if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && - os.peer == R_SECONDARY && ns.peer == R_PRIMARY) - set_bit(CONSIDER_RESYNC, &device->flags); - - /* Receiver should clean up itself */ - if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) - drbd_thread_stop_nowait(&connection->receiver); - - /* Now the receiver finished cleaning up itself, it 
should die */ - if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) - drbd_thread_stop_nowait(&connection->receiver); - - /* Upon network failure, we need to restart the receiver. */ - if (os.conn > C_WF_CONNECTION && - ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) - drbd_thread_restart_nowait(&connection->receiver); - - /* Resume AL writing if we get a connection */ - if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { - drbd_resume_al(device); - connection->connect_cnt++; - } - - /* remember last attach time so request_timer_fn() won't - * kill newly established sessions while we are still trying to thaw - * previously frozen IO */ - if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && - ns.disk > D_NEGOTIATING) - device->last_reattach_jif = jiffies; - - ascw = kmalloc_obj(*ascw, GFP_ATOMIC); - if (ascw) { - ascw->os = os; - ascw->ns = ns; - ascw->flags = flags; - ascw->w.cb = w_after_state_ch; - ascw->device = device; - ascw->done = done; - ascw->state_change = state_change; - drbd_queue_work(&connection->sender_work, - &ascw->w); - } else { - drbd_err(device, "Could not kmalloc an ascw\n"); - } + kref_get(&resource->kref); + schedule_work(&resource->twopc_work); +} +enum drbd_state_rv +nested_twopc_request(struct drbd_resource *resource, struct twopc_request *request) +{ + u64 nodes_to_reach, reach_immediately; + enum drbd_packet cmd = request->cmd; + enum drbd_state_rv rv; + bool have_peers; + + write_lock_irq(&resource->state_rwlock); + nodes_to_reach = request->nodes_to_reach; + reach_immediately = directly_connected_nodes(resource, NOW) & nodes_to_reach; + nodes_to_reach &= ~(reach_immediately | NODE_MASK(resource->res_opts.node_id)); + request->nodes_to_reach = nodes_to_reach; + write_unlock_irq(&resource->state_rwlock); + + rv = __cluster_wide_request(resource, request, reach_immediately); + have_peers = rv == SS_CW_SUCCESS; + if (cmd == P_TWOPC_PREPARE || cmd == P_TWOPC_PREP_RSZ) { + if (rv < SS_SUCCESS) + twopc_end_nested(resource, 
P_TWOPC_NO); + else if (!have_peers && cluster_wide_reply_ready(resource)) /* no nested nodes */ + __nested_twopc_work(resource); + } return rv; } -static int w_after_state_ch(struct drbd_work *w, int unused) +static bool has_up_to_date_peer_disks(struct drbd_device *device) { - struct after_state_chg_work *ascw = - container_of(w, struct after_state_chg_work, w); - struct drbd_device *device = ascw->device; + struct drbd_peer_device *peer_device; - after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change); - forget_state_change(ascw->state_change); - if (ascw->flags & CS_WAIT_COMPLETE) - complete(ascw->done); - kfree(ascw); + for_each_peer_device(peer_device, device) + if (peer_device->disk_state[NEW] == D_UP_TO_DATE) + return true; + return false; +} - return 0; +static void disconnect_where_resync_target(struct drbd_device *device) +{ + struct drbd_peer_device *peer_device; + + for_each_peer_device(peer_device, device) + if (is_sync_target_state(peer_device, NEW)) + __change_cstate(peer_device->connection, C_TEAR_DOWN); } -static void abw_start_sync(struct drbd_device *device, int rv) +static bool do_change_role(struct change_context *context, enum change_phase phase) { - if (rv) { - drbd_err(device, "Writing the bitmap failed not starting resync.\n"); - _drbd_request_state(device, NS(conn, C_CONNECTED), CS_VERBOSE); - return; + struct drbd_resource *resource = context->resource; + enum drbd_role role = context->val.role; + int flags = context->flags; + struct drbd_device *device; + int vnr; + + resource->role[NEW] = role; + + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, vnr) { + if (role == R_PRIMARY && (flags & CS_FP_LOCAL_UP_TO_DATE)) { + if (device->disk_state[NEW] < D_UP_TO_DATE && + device->disk_state[NEW] >= D_INCONSISTENT && + !has_up_to_date_peer_disks(device)) { + device->disk_state[NEW] = D_UP_TO_DATE; + /* adding it to the context so that it gets sent to the peers */ + context->mask.disk |= disk_MASK; + 
context->val.disk |= D_UP_TO_DATE; + disconnect_where_resync_target(device); + } + } + + if (role == R_PRIMARY && (flags & CS_FP_OUTDATE_PEERS)) { + struct drbd_peer_device *peer_device; + for_each_peer_device_rcu(peer_device, device) { + if (peer_device->disk_state[NEW] == D_UNKNOWN) + __change_peer_disk_state(peer_device, D_OUTDATED); + } + } + + if (role == R_PRIMARY && phase == PH_COMMIT) { + u64 reachable_nodes = resource->twopc_reply.reachable_nodes; + struct drbd_peer_device *peer_device; + + for_each_peer_device_rcu(peer_device, device) { + if (NODE_MASK(peer_device->node_id) & reachable_nodes && + peer_device->disk_state[NEW] == D_UNKNOWN && + want_bitmap(peer_device)) + __change_peer_disk_state(peer_device, D_OUTDATED); + } + } } + rcu_read_unlock(); - switch (device->state.conn) { - case C_STARTING_SYNC_T: - _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); - break; - case C_STARTING_SYNC_S: - drbd_start_resync(device, C_SYNC_SOURCE); - break; + return phase != PH_PREPARE || + context->resource->role[NOW] != context->val.role; +} + +enum drbd_state_rv change_role(struct drbd_resource *resource, + enum drbd_role role, + enum chg_state_flags flags, + const char *tag, + const char **err_str) +{ + struct change_context role_context = { + .resource = resource, + .vnr = -1, + .mask = { { .role = role_MASK } }, + .val = { { .role = role } }, + .target_node_id = -1, + .flags = flags | CS_SERIALIZE, + .err_str = err_str, + }; + enum drbd_state_rv rv; + bool got_state_sem = false; + + if (role == R_SECONDARY) { + if (!(flags & CS_ALREADY_SERIALIZED)) { + down(&resource->state_sem); + got_state_sem = true; + role_context.flags |= CS_ALREADY_SERIALIZED; + } + role_context.change_local_state_last = true; } + rv = change_cluster_wide_state(do_change_role, &role_context, tag); + if (got_state_sem) + up(&resource->state_sem); + return rv; } -int drbd_bitmap_io_from_worker(struct drbd_device *device, - int (*io_fn)(struct drbd_device *, struct 
drbd_peer_device *), - char *why, enum bm_flag flags, - struct drbd_peer_device *peer_device) +void __change_io_susp_user(struct drbd_resource *resource, bool value) { - int rv; + resource->susp_user[NEW] = value; +} - D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); +enum drbd_state_rv change_io_susp_user(struct drbd_resource *resource, + bool value, + enum chg_state_flags flags) +{ + unsigned long irq_flags; - /* open coded non-blocking drbd_suspend_io(device); */ - atomic_inc(&device->suspend_cnt); + begin_state_change(resource, &irq_flags, flags); + __change_io_susp_user(resource, value); + return end_state_change(resource, &irq_flags, value ? "suspend-io" : "resume-io"); +} - drbd_bm_lock(device, why, flags); - rv = io_fn(device, peer_device); - drbd_bm_unlock(device); +void __change_io_susp_no_data(struct drbd_resource *resource, bool value) +{ + resource->susp_nod[NEW] = value; +} - drbd_resume_io(device); +void __change_io_susp_fencing(struct drbd_connection *connection, bool value) +{ + connection->susp_fen[NEW] = value; +} - return rv; +void __change_io_susp_quorum(struct drbd_resource *resource, bool value) +{ + resource->susp_quorum[NEW] = value; } -int notify_resource_state_change(struct sk_buff *skb, - unsigned int seq, - void *state_change, - enum drbd_notification_type type) +void __change_disk_state(struct drbd_device *device, enum drbd_disk_state disk_state) { - struct drbd_resource_state_change *resource_state_change = state_change; - struct drbd_resource *resource = resource_state_change->resource; - struct resource_info resource_info = { - .res_role = resource_state_change->role[NEW], - .res_susp = resource_state_change->susp[NEW], - .res_susp_nod = resource_state_change->susp_nod[NEW], - .res_susp_fen = resource_state_change->susp_fen[NEW], - }; + device->disk_state[NEW] = disk_state; +} + +void __downgrade_disk_states(struct drbd_resource *resource, enum drbd_disk_state disk_state) +{ + struct drbd_device 
*device; + int vnr; - return notify_resource_state(skb, seq, resource, &resource_info, type); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, vnr) { + if (device->disk_state[NEW] > disk_state) + __change_disk_state(device, disk_state); + } + rcu_read_unlock(); } -int notify_connection_state_change(struct sk_buff *skb, - unsigned int seq, - void *state_change, - enum drbd_notification_type type) +void __outdate_myself(struct drbd_resource *resource) { - struct drbd_connection_state_change *p = state_change; - struct drbd_connection *connection = p->connection; - struct connection_info connection_info = { - .conn_connection_state = p->cstate[NEW], - .conn_role = p->peer_role[NEW], - }; + struct drbd_device *device; + int vnr; - return notify_connection_state(skb, seq, connection, &connection_info, type); + idr_for_each_entry(&resource->devices, device, vnr) { + if (device->disk_state[NOW] > D_OUTDATED) + __change_disk_state(device, D_OUTDATED); + } } -int notify_device_state_change(struct sk_buff *skb, - unsigned int seq, - void *state_change, - enum drbd_notification_type type) +static bool device_has_connected_peer_devices(struct drbd_device *device) { - struct drbd_device_state_change *device_state_change = state_change; - struct drbd_device *device = device_state_change->device; - struct device_info device_info = { - .dev_disk_state = device_state_change->disk_state[NEW], - }; + struct drbd_peer_device *peer_device; - return notify_device_state(skb, seq, device, &device_info, type); + for_each_peer_device(peer_device, device) + if (peer_device->repl_state[NOW] >= L_ESTABLISHED) + return true; + return false; } -int notify_peer_device_state_change(struct sk_buff *skb, - unsigned int seq, - void *state_change, - enum drbd_notification_type type) +static bool device_has_peer_devices_with_disk(struct drbd_device *device) { - struct drbd_peer_device_state_change *p = state_change; - struct drbd_peer_device *peer_device = p->peer_device; - struct 
peer_device_info peer_device_info = { - .peer_repl_state = p->repl_state[NEW], - .peer_disk_state = p->disk_state[NEW], - .peer_resync_susp_user = p->resync_susp_user[NEW], - .peer_resync_susp_peer = p->resync_susp_peer[NEW], - .peer_resync_susp_dependency = p->resync_susp_dependency[NEW], - }; + struct drbd_peer_device *peer_device; + bool rv = false; + + for_each_peer_device(peer_device, device) { + if (peer_device->connection->cstate[NOW] == C_CONNECTED) { + /* We expect to receive up-to-date UUIDs soon. + To avoid a race in receive_state, "clear" uuids while + holding state_rwlock. I.e. atomic with the state change */ + clear_bit(UUIDS_RECEIVED, &peer_device->flags); + if (peer_device->disk_state[NOW] > D_DISKLESS) + rv = true; + } + } - return notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type); + return rv; } -static void broadcast_state_change(struct drbd_state_change *state_change) +static void restore_outdated_in_pdsk(struct drbd_device *device) { - struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; - bool resource_state_has_changed; - unsigned int n_device, n_connection, n_peer_device, n_peer_devices; - int (*last_func)(struct sk_buff *, unsigned int, - void *, enum drbd_notification_type) = NULL; - void *last_arg = NULL; + struct drbd_peer_device *peer_device; -#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW]) -#define FINAL_STATE_CHANGE(type) \ - ({ if (last_func) \ - last_func(NULL, 0, last_arg, type); \ - }) -#define REMEMBER_STATE_CHANGE(func, arg, type) \ - ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \ - last_func = func; \ - last_arg = arg; \ - }) + if (!get_ldev_if_state(device, D_ATTACHING)) + return; - mutex_lock(&notification_mutex); + for_each_peer_device(peer_device, device) { + int node_id = peer_device->connection->peer_node_id; + struct drbd_peer_md *peer_md = &device->ldev->md.peers[node_id]; - resource_state_has_changed = - HAS_CHANGED(resource_state_change->role) || - 
HAS_CHANGED(resource_state_change->susp) || - HAS_CHANGED(resource_state_change->susp_nod) || - HAS_CHANGED(resource_state_change->susp_fen); + if ((peer_md->flags & MDF_PEER_OUTDATED) && + peer_device->disk_state[NEW] == D_UNKNOWN) + __change_peer_disk_state(peer_device, D_OUTDATED); + } - if (resource_state_has_changed) - REMEMBER_STATE_CHANGE(notify_resource_state_change, - resource_state_change, NOTIFY_CHANGE); + put_ldev(device); +} - for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { - struct drbd_connection_state_change *connection_state_change = - &state_change->connections[n_connection]; +static bool do_twopc_after_lost_peer(struct change_context *context, enum change_phase phase) +{ + struct drbd_resource *resource = context->resource; + struct twopc_reply *reply = &resource->twopc_reply; + u64 directly_reachable = directly_connected_nodes(resource, NEW) | + NODE_MASK(resource->res_opts.node_id); + bool pri_incapable = reply->primary_nodes && !reply->weak_nodes; /* TWOPC_PRI_INCAPABLE */ + + if (phase == PH_COMMIT && (reply->primary_nodes & ~directly_reachable && !pri_incapable)) { + __outdate_myself(resource); + } else { + struct drbd_device *device; + int vnr; - if (HAS_CHANGED(connection_state_change->peer_role) || - HAS_CHANGED(connection_state_change->cstate)) - REMEMBER_STATE_CHANGE(notify_connection_state_change, - connection_state_change, NOTIFY_CHANGE); + idr_for_each_entry(&resource->devices, device, vnr) { + if (device->disk_state[NOW] == D_CONSISTENT && + may_return_to_up_to_date(device, NOW)) + __change_disk_state(device, D_UP_TO_DATE); + } } - for (n_device = 0; n_device < state_change->n_devices; n_device++) { - struct drbd_device_state_change *device_state_change = - &state_change->devices[n_device]; + return phase != PH_PREPARE || reply->reachable_nodes != NODE_MASK(resource->res_opts.node_id); +} + +static enum drbd_state_rv twopc_after_lost_peer(struct drbd_resource *resource, + enum chg_state_flags 
flags) +{ + struct change_context context = { + .resource = resource, + .vnr = -1, + .mask = { }, + .val = { }, + .target_node_id = -1, + .flags = flags | (resource->res_opts.quorum != QOU_OFF ? CS_FORCE_RECALC : 0), + .change_local_state_last = false, + }; + + /* The other nodes get the request for an empty state change. I.e. they + will agree to this change request. At commit time we know where to + go from the D_CONSISTENT, since we got the primary mask. */ + return change_cluster_wide_state(do_twopc_after_lost_peer, &context, "lost-peer"); +} + +void drbd_empty_twopc_work_fn(struct work_struct *work) +{ + struct drbd_resource *resource = container_of(work, struct drbd_resource, empty_twopc); - if (HAS_CHANGED(device_state_change->disk_state)) - REMEMBER_STATE_CHANGE(notify_device_state_change, - device_state_change, NOTIFY_CHANGE); - } + twopc_after_lost_peer(resource, CS_VERBOSE); - n_peer_devices = state_change->n_devices * state_change->n_connections; - for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) { - struct drbd_peer_device_state_change *p = - &state_change->peer_devices[n_peer_device]; + clear_bit(TRY_BECOME_UP_TO_DATE_PENDING, &resource->flags); + wake_up_all(&resource->state_wait); - if (HAS_CHANGED(p->disk_state) || - HAS_CHANGED(p->repl_state) || - HAS_CHANGED(p->resync_susp_user) || - HAS_CHANGED(p->resync_susp_peer) || - HAS_CHANGED(p->resync_susp_dependency)) - REMEMBER_STATE_CHANGE(notify_peer_device_state_change, - p, NOTIFY_CHANGE); + kref_put(&resource->kref, drbd_destroy_resource); +} + +static bool do_change_disk_state(struct change_context *context, enum change_phase phase) +{ + struct drbd_device *device = + container_of(context, struct change_disk_state_context, context)->device; + bool cluster_wide_state_change = false; + + if (device->disk_state[NOW] == D_ATTACHING && + context->val.disk == D_NEGOTIATING) { + if (device_has_peer_devices_with_disk(device)) { + cluster_wide_state_change = + 
supports_two_phase_commit(device->resource); + } else { + /* very last part of attach */ + /* ldev_safe: D_ATTACHING->D_NEGOTIATING, state_rwlock held, ldev exists */ + context->val.disk = disk_state_from_md(device); + restore_outdated_in_pdsk(device); + } + } else if (device->disk_state[NOW] != D_DETACHING && + context->val.disk == D_DETACHING && + device_has_connected_peer_devices(device)) { + cluster_wide_state_change = true; } + __change_disk_state(device, context->val.disk); + return phase != PH_PREPARE || cluster_wide_state_change; +} - FINAL_STATE_CHANGE(NOTIFY_CHANGE); - mutex_unlock(&notification_mutex); +enum drbd_state_rv change_disk_state(struct drbd_device *device, + enum drbd_disk_state disk_state, + enum chg_state_flags flags, + const char *tag, + const char **err_str) +{ + struct change_disk_state_context disk_state_context = { + .context = { + .resource = device->resource, + .vnr = device->vnr, + .mask = { { .disk = disk_MASK } }, + .val = { { .disk = disk_state } }, + .target_node_id = -1, + .flags = flags, + .change_local_state_last = true, + .err_str = err_str, + }, + .device = device, + }; -#undef HAS_CHANGED -#undef FINAL_STATE_CHANGE -#undef REMEMBER_STATE_CHANGE + return change_cluster_wide_state(do_change_disk_state, + &disk_state_context.context, tag); } -/* takes old and new peer disk state */ -static bool lost_contact_to_peer_data(enum drbd_disk_state os, enum drbd_disk_state ns) +void __change_cstate(struct drbd_connection *connection, enum drbd_conn_state cstate) { - if ((os >= D_INCONSISTENT && os != D_UNKNOWN && os != D_OUTDATED) - && (ns < D_INCONSISTENT || ns == D_UNKNOWN || ns == D_OUTDATED)) - return true; + if (cstate == C_DISCONNECTING) + set_bit(DISCONNECT_EXPECTED, &connection->flags); - /* Scenario, starting with normal operation - * Connected Primary/Secondary UpToDate/UpToDate - * NetworkFailure Primary/Unknown UpToDate/DUnknown (frozen) - * ... - * Connected Primary/Secondary UpToDate/Diskless (resumed; needs to bump uuid!) 
- */ - if (os == D_UNKNOWN - && (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED)) - return true; + connection->cstate[NEW] = cstate; + if (cstate < C_CONNECTED) { + struct drbd_peer_device *peer_device; + int vnr; - return false; + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + __change_repl_state(peer_device, L_OFF); + rcu_read_unlock(); + } } -/** - * after_state_ch() - Perform after state change actions that may sleep - * @device: DRBD device. - * @os: old state. - * @ns: new state. - * @flags: Flags - * @state_change: state change to broadcast - */ -static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags, - struct drbd_state_change *state_change) +static bool connection_has_connected_peer_devices(struct drbd_connection *connection) { - struct drbd_resource *resource = device->resource; - struct drbd_peer_device *peer_device = first_peer_device(device); - struct drbd_connection *connection = peer_device ? 
peer_device->connection : NULL; - struct sib_info sib; + struct drbd_peer_device *peer_device; + int vnr; - broadcast_state_change(state_change); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + if (peer_device->repl_state[NOW] >= L_ESTABLISHED) + return true; + } + return false; +} - sib.sib_reason = SIB_STATE_CHANGE; - sib.os = os; - sib.ns = ns; +enum outdate_what { OUTDATE_NOTHING, OUTDATE_DISKS, OUTDATE_PEER_DISKS }; - if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE) - && (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) { - clear_bit(CRASHED_PRIMARY, &device->flags); - if (device->p_uuid) - device->p_uuid[UI_FLAGS] &= ~((u64)2); +static enum outdate_what outdate_on_disconnect(struct drbd_connection *connection) +{ + struct drbd_resource *resource = connection->resource; + + if (connection->cstate[NOW] == C_CONNECTED && + (connection->fencing_policy >= FP_RESOURCE || + connection->resource->res_opts.quorum != QOU_OFF) && + resource->role[NOW] != connection->peer_role[NOW]) { + /* primary politely disconnects from secondary, + * tells peer to please outdate itself */ + if (resource->role[NOW] == R_PRIMARY) + return OUTDATE_PEER_DISKS; + + /* secondary politely disconnect from primary, + * proposes to outdate itself. */ + if (connection->peer_role[NOW] == R_PRIMARY) + return OUTDATE_DISKS; } + return OUTDATE_NOTHING; +} - /* Inform userspace about the change... 
*/ - drbd_bcast_event(device, &sib); +static void __change_cstate_and_outdate(struct drbd_connection *connection, + enum drbd_conn_state cstate, + enum outdate_what outdate_what) +{ + __change_cstate(connection, cstate); + switch (outdate_what) { + case OUTDATE_DISKS: + __downgrade_disk_states(connection->resource, D_OUTDATED); + break; + case OUTDATE_PEER_DISKS: + __downgrade_peer_disk_states(connection, D_OUTDATED); + break; + case OUTDATE_NOTHING: + break; + } +} - if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && - (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) - drbd_khelper(device, "pri-on-incon-degr"); +void apply_connect(struct drbd_connection *connection, bool commit) +{ + struct drbd_peer_device *peer_device; + int vnr; - /* Here we have the actions that are performed after a - state change. This function might sleep */ + if (!commit || connection->cstate[NEW] != C_CONNECTED) + return; - if (ns.susp_nod) { - enum drbd_req_event what = NOTHING; + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + union drbd_state s = peer_device->connect_state; - spin_lock_irq(&device->resource->req_lock); - if (os.conn < C_CONNECTED && conn_lowest_conn(connection) >= C_CONNECTED) - what = RESEND; + if (s.disk != D_MASK) + __change_disk_state(device, s.disk); + if (device->disk_state[NOW] != D_NEGOTIATING) + __change_repl_state(peer_device, s.conn); + __change_peer_disk_state(peer_device, s.pdsk); + __change_resync_susp_peer(peer_device, s.peer_isp); - if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && - conn_lowest_disk(connection) == D_UP_TO_DATE) - what = RESTART_FROZEN_DISK_IO; + if (s.conn == L_OFF) + __change_cstate(connection, C_DISCONNECTING); - if (resource->susp_nod && what != NOTHING) { - _tl_restart(connection, what); - _conn_request_state(connection, - (union drbd_state) { { .susp_nod = 1 } }, - (union drbd_state) { { 
.susp_nod = 0 } }, - CS_VERBOSE); - } - spin_unlock_irq(&device->resource->req_lock); + if (commit) + clear_bit(DISCARD_MY_DATA, &peer_device->flags); } +} - if (ns.susp_fen) { - spin_lock_irq(&device->resource->req_lock); - if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) { - /* case2: The connection was established again: */ - struct drbd_peer_device *peer_device; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) - clear_bit(NEW_CUR_UUID, &peer_device->device->flags); - rcu_read_unlock(); - - /* We should actively create a new uuid, _before_ - * we resume/resent, if the peer is diskless - * (recovery from a multiple error scenario). - * Currently, this happens with a slight delay - * below when checking lost_contact_to_peer_data() ... - */ - _tl_restart(connection, RESEND); - _conn_request_state(connection, - (union drbd_state) { { .susp_fen = 1 } }, - (union drbd_state) { { .susp_fen = 0 } }, - CS_VERBOSE); - } - spin_unlock_irq(&device->resource->req_lock); - } - - /* Became sync source. With protocol >= 96, we still need to send out - * the sync uuid now. Need to do that before any drbd_send_state, or - * the other side may go "paused sync" before receiving the sync uuids, - * which is unexpected. */ - if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && - (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && - connection->agreed_pro_version >= 96 && get_ldev(device)) { - drbd_gen_and_send_sync_uuid(peer_device); - put_ldev(device); - } +struct change_cstate_context { + struct change_context context; + struct drbd_connection *connection; + enum outdate_what outdate_what; +}; - /* Do not change the order of the if above and the two below... */ - if (os.pdsk == D_DISKLESS && - ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */ - /* we probably will start a resync soon. - * make sure those things are properly reset. 
*/ - device->rs_total = 0; - device->rs_failed = 0; - atomic_set(&device->rs_pending_cnt, 0); - drbd_rs_cancel_all(device); - - drbd_send_uuids(peer_device); - drbd_send_state(peer_device, ns); - } - /* No point in queuing send_bitmap if we don't have a connection - * anymore, so check also the _current_ state, not only the new state - * at the time this work was queued. */ - if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && - device->state.conn == C_WF_BITMAP_S) - drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL, - "send_bitmap (WFBitMapS)", - BM_LOCKED_TEST_ALLOWED, peer_device); - - /* Lost contact to peer's copy of the data */ - if (lost_contact_to_peer_data(os.pdsk, ns.pdsk)) { - if (get_ldev(device)) { - if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && - device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - if (drbd_suspended(device)) { - set_bit(NEW_CUR_UUID, &device->flags); - } else { - drbd_uuid_new_current(device); - drbd_send_uuids(peer_device); - } +static bool do_change_cstate(struct change_context *context, enum change_phase phase) +{ + struct change_cstate_context *cstate_context = + container_of(context, struct change_cstate_context, context); + struct drbd_connection *connection = cstate_context->connection; + struct drbd_resource *resource = context->resource; + struct twopc_reply *reply = &resource->twopc_reply; + + if (phase == PH_PREPARE) { + cstate_context->outdate_what = OUTDATE_NOTHING; + if (context->val.conn == C_DISCONNECTING && !(context->flags & CS_HARD)) { + cstate_context->outdate_what = + outdate_on_disconnect(connection); + switch (cstate_context->outdate_what) { + case OUTDATE_DISKS: + context->mask.disk = disk_MASK; + context->val.disk = D_OUTDATED; + break; + case OUTDATE_PEER_DISKS: + context->mask.pdsk = pdsk_MASK; + context->val.pdsk = D_OUTDATED; + break; + case OUTDATE_NOTHING: + break; } - put_ldev(device); } } + if ((context->val.conn == C_CONNECTED && connection->cstate[NEW] == 
C_CONNECTING) || + context->val.conn != C_CONNECTED) + __change_cstate_and_outdate(connection, + context->val.conn, + cstate_context->outdate_what); + + if (context->val.conn == C_CONNECTED && + connection->agreed_pro_version >= 117) + apply_connect(connection, phase == PH_COMMIT); + + if (phase == PH_COMMIT) { + u64 directly_reachable = directly_connected_nodes(resource, NEW) | + NODE_MASK(resource->res_opts.node_id); + + if (reply->primary_nodes & ~directly_reachable) + __outdate_myself(resource); + } - if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { - if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY && - device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - drbd_uuid_new_current(device); - drbd_send_uuids(peer_device); - } - /* D_DISKLESS Peer becomes secondary */ - if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) - /* We may still be Primary ourselves. - * No harm done if the bitmap still changes, - * redirtied pages will follow later. */ - drbd_bitmap_io_from_worker(device, &drbd_bm_write, - "demote diskless peer", BM_LOCKED_SET_ALLOWED, peer_device); - put_ldev(device); + if (context->val.conn == C_CONNECTED && connection->peer_role[NOW] == R_UNKNOWN) { + enum drbd_role target_role = + (reply->primary_nodes & NODE_MASK(context->target_node_id)) ? + R_PRIMARY : R_SECONDARY; + + __change_peer_role(connection, target_role); } - /* Write out all changed bits on demote. - * Though, no need to da that just yet - * if there is a resync going on still */ - if (os.role == R_PRIMARY && ns.role == R_SECONDARY && - device->state.conn <= C_CONNECTED && get_ldev(device)) { - /* No changes to the bitmap expected this time, so assert that, - * even though no harm was done if it did change. 
*/ - drbd_bitmap_io_from_worker(device, &drbd_bm_write, - "demote", BM_LOCKED_TEST_ALLOWED, peer_device); - put_ldev(device); + return phase != PH_PREPARE || + context->val.conn == C_CONNECTED || + (context->val.conn == C_DISCONNECTING && + connection_has_connected_peer_devices(connection)); +} + +/** + * change_cstate_tag() - change the connection state of a connection + * @connection: DRBD connection. + * @cstate: The connection state to change to. + * @flags: State change flags. + * @tag: State change tag to print in status messages. + * @err_str: Pointer to save the error string to. + * + * When disconnecting from a peer, we may also need to outdate the local or + * peer disks depending on the fencing policy. This cannot easily be split + * into two state changes. + */ +enum drbd_state_rv change_cstate_tag(struct drbd_connection *connection, + enum drbd_conn_state cstate, + enum chg_state_flags flags, + const char *tag, + const char **err_str) +{ + struct change_cstate_context cstate_context = { + .context = { + .resource = connection->resource, + .vnr = -1, + .mask = { { .conn = conn_MASK } }, + .val = { { .conn = cstate } }, + .target_node_id = connection->peer_node_id, + .flags = flags, + .change_local_state_last = true, + .err_str = err_str, + }, + .connection = connection, + }; + + if (cstate == C_CONNECTED) { + cstate_context.context.mask.role = role_MASK; + cstate_context.context.val.role = connection->resource->role[NOW]; } - /* Last part of the attaching process ... */ - if (ns.conn >= C_CONNECTED && - os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { - drbd_send_sizes(peer_device, 0, 0); /* to start sync... */ - drbd_send_uuids(peer_device); - drbd_send_state(peer_device, ns); - } - - /* We want to pause/continue resync, tell peer. */ - if (ns.conn >= C_CONNECTED && - ((os.aftr_isp != ns.aftr_isp) || - (os.user_isp != ns.user_isp))) - drbd_send_state(peer_device, ns); - - /* In case one of the isp bits got set, suspend other devices. 
*/ - if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && - (ns.aftr_isp || ns.peer_isp || ns.user_isp)) - suspend_other_sg(device); - - /* Make sure the peer gets informed about eventual state - changes (ISP bits) while we were in WFReportParams. */ - if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) - drbd_send_state(peer_device, ns); - - if (os.conn != C_AHEAD && ns.conn == C_AHEAD) - drbd_send_state(peer_device, ns); - - /* We are in the progress to start a full sync... */ - if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || - (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) - /* no other bitmap changes expected during this phase */ - drbd_queue_bitmap_io(device, - &drbd_bmio_set_n_write, &abw_start_sync, - "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED, - peer_device); - - /* first half of local IO error, failure to attach, - * or administrative detach */ - if (os.disk != D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh = EP_PASS_ON; - int was_io_error = 0; - /* corresponding get_ldev was in _drbd_set_state, to serialize - * our cleanup here with the transition to D_DISKLESS. - * But is is still not save to dreference ldev here, since - * we might come from an failed Attach before ldev was set. */ - if (device->ldev) { - rcu_read_lock(); - eh = rcu_dereference(device->ldev->disk_conf)->on_io_error; - rcu_read_unlock(); + /* + * Hard connection state changes like a protocol error or forced + * disconnect may occur while we are holding resource->state_sem. In + * that case, omit CS_SERIALIZE so that we don't deadlock trying to + * grab that mutex again. + */ + if (!(flags & CS_HARD)) + cstate_context.context.flags |= CS_SERIALIZE; - was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); - - /* Intentionally call this handler first, before drbd_send_state(). - * See: 2932204 drbd: call local-io-error handler early - * People may chose to hard-reset the box from this handler. 
- * It is useful if this looks like a "regular node crash". */ - if (was_io_error && eh == EP_CALL_HELPER) - drbd_khelper(device, "local-io-error"); - - /* Immediately allow completion of all application IO, - * that waits for completion from the local disk, - * if this was a force-detach due to disk_timeout - * or administrator request (drbdsetup detach --force). - * Do NOT abort otherwise. - * Aborting local requests may cause serious problems, - * if requests are completed to upper layers already, - * and then later the already submitted local bio completes. - * This can cause DMA into former bio pages that meanwhile - * have been re-used for other things. - * So aborting local requests may cause crashes, - * or even worse, silent data corruption. - */ - if (test_and_clear_bit(FORCE_DETACH, &device->flags)) - tl_abort_disk_io(device); + return change_cluster_wide_state(do_change_cstate, &cstate_context.context, tag); +} - /* current state still has to be D_FAILED, - * there is only one way out: to D_DISKLESS, - * and that may only happen after our put_ldev below. */ - if (device->state.disk != D_FAILED) - drbd_err(device, - "ASSERT FAILED: disk is %s during detach\n", - drbd_disk_str(device->state.disk)); +void __change_peer_role(struct drbd_connection *connection, enum drbd_role peer_role) +{ + connection->peer_role[NEW] = peer_role; +} - if (ns.conn >= C_CONNECTED) - drbd_send_state(peer_device, ns); +void __change_repl_state(struct drbd_peer_device *peer_device, enum drbd_repl_state repl_state) +{ + peer_device->repl_state[NEW] = repl_state; + if (repl_state > L_OFF) + peer_device->connection->cstate[NEW] = C_CONNECTED; +} - drbd_rs_cancel_all(device); +struct change_repl_context { + struct change_context context; + struct drbd_peer_device *peer_device; +}; - /* In case we want to get something to stable storage still, - * this may be the last chance. - * Following put_ldev may transition to D_DISKLESS. 
*/ - drbd_md_sync(device); - } - put_ldev(device); - } +static bool do_change_repl_state(struct change_context *context, enum change_phase phase) +{ + struct change_repl_context *repl_context = + container_of(context, struct change_repl_context, context); + struct drbd_peer_device *peer_device = repl_context->peer_device; + enum drbd_repl_state *repl_state = peer_device->repl_state; + enum drbd_repl_state new_repl_state = context->val.conn; + bool cluster_wide = context->flags & CS_CLUSTER_WIDE; + + __change_repl_state(peer_device, new_repl_state); + + return phase != PH_PREPARE || + ((repl_state[NOW] >= L_ESTABLISHED && + (new_repl_state == L_STARTING_SYNC_S || new_repl_state == L_STARTING_SYNC_T)) || + (repl_state[NOW] == L_ESTABLISHED && + (new_repl_state == L_VERIFY_S || new_repl_state == L_OFF)) || + (repl_state[NOW] == L_ESTABLISHED && cluster_wide && + (new_repl_state == L_WF_BITMAP_S || new_repl_state == L_WF_BITMAP_T))); +} - /* second half of local IO error, failure to attach, - * or administrative detach, - * after local_cnt references have reached zero again */ - if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { - /* We must still be diskless, - * re-attach has to be serialized with this! */ - if (device->state.disk != D_DISKLESS) - drbd_err(device, - "ASSERT FAILED: disk is %s while going diskless\n", - drbd_disk_str(device->state.disk)); - - if (ns.conn >= C_CONNECTED) - drbd_send_state(peer_device, ns); - /* corresponding get_ldev in __drbd_set_state - * this may finally trigger drbd_ldev_destroy. 
*/ - put_ldev(device); - } +enum drbd_state_rv change_repl_state(struct drbd_peer_device *peer_device, + enum drbd_repl_state new_repl_state, + enum chg_state_flags flags, + const char *tag) +{ + struct change_repl_context repl_context = { + .context = { + .resource = peer_device->device->resource, + .vnr = peer_device->device->vnr, + .mask = { { .conn = conn_MASK } }, + .val = { { .conn = new_repl_state } }, + .target_node_id = peer_device->node_id, + .flags = flags + }, + .peer_device = peer_device + }; - /* Notify peer that I had a local IO error, and did not detached.. */ - if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) - drbd_send_state(peer_device, ns); - - /* Disks got bigger while they were detached */ - if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && - test_and_clear_bit(RESYNC_AFTER_NEG, &device->flags)) { - if (ns.conn == C_CONNECTED) - resync_after_online_grow(device); - } - - /* A resync finished or aborted, wake paused devices... */ - if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || - (os.peer_isp && !ns.peer_isp) || - (os.user_isp && !ns.user_isp)) - resume_next_sg(device); - - /* sync target done with resync. Explicitly notify peer, even though - * it should (at least for non-empty resyncs) already know itself. */ - if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) - drbd_send_state(peer_device, ns); - - /* Verify finished, or reached stop sector. Peer did not know about - * the stop sector, and we may even have changed the stop sector during - * verify to interrupt/stop early. Send the new state. */ - if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED - && verify_can_do_stop_sector(device)) - drbd_send_state(peer_device, ns); - - /* This triggers bitmap writeout of potentially still unwritten pages - * if the resync finished cleanly, or aborted because of peer disk - * failure, or on transition from resync back to AHEAD/BEHIND. 
- * - * Connection loss is handled in drbd_disconnected() by the receiver. - * - * For resync aborted because of local disk failure, we cannot do - * any bitmap writeout anymore. - * - * No harm done if some bits change during this phase. - */ - if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) && - (ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) { - drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL, - "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED, - peer_device); - put_ldev(device); - } + if (new_repl_state == L_WF_BITMAP_S || new_repl_state == L_VERIFY_S) + repl_context.context.change_local_state_last = true; - if (ns.disk == D_DISKLESS && - ns.conn == C_STANDALONE && - ns.role == R_SECONDARY) { - if (os.aftr_isp != ns.aftr_isp) - resume_next_sg(device); - } + return change_cluster_wide_state(do_change_repl_state, &repl_context.context, tag); +} - drbd_md_sync(device); +enum drbd_state_rv stable_change_repl_state(struct drbd_peer_device *peer_device, + enum drbd_repl_state repl_state, + enum chg_state_flags flags, + const char *tag) +{ + return stable_state_change(peer_device->device->resource, + change_repl_state(peer_device, repl_state, flags, tag)); } -struct after_conn_state_chg_work { - struct drbd_work w; - enum drbd_conns oc; - union drbd_state ns_min; - union drbd_state ns_max; /* new, max state, over all devices */ - enum chg_state_flags flags; - struct drbd_connection *connection; - struct drbd_state_change *state_change; -}; +void __change_peer_disk_state(struct drbd_peer_device *peer_device, enum drbd_disk_state disk_state) +{ + peer_device->disk_state[NEW] = disk_state; +} -static int w_after_conn_state_ch(struct drbd_work *w, int unused) +void __downgrade_peer_disk_states(struct drbd_connection *connection, enum drbd_disk_state disk_state) { - struct after_conn_state_chg_work *acscw = - container_of(w, struct after_conn_state_chg_work, w); - struct drbd_connection *connection = acscw->connection; - enum 
drbd_conns oc = acscw->oc; - union drbd_state ns_max = acscw->ns_max; struct drbd_peer_device *peer_device; int vnr; - broadcast_state_change(acscw->state_change); - forget_state_change(acscw->state_change); - kfree(acscw); + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + if (peer_device->disk_state[NEW] > disk_state) + __change_peer_disk_state(peer_device, disk_state); + } + rcu_read_unlock(); +} - /* Upon network configuration, we need to start the receiver */ - if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED) - drbd_thread_start(&connection->receiver); +enum drbd_state_rv change_peer_disk_state(struct drbd_peer_device *peer_device, + enum drbd_disk_state disk_state, + enum chg_state_flags flags, + const char *tag) +{ + struct drbd_resource *resource = peer_device->device->resource; + unsigned long irq_flags; - if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { - struct net_conf *old_conf; + begin_state_change(resource, &irq_flags, flags); + __change_peer_disk_state(peer_device, disk_state); + return end_state_change(resource, &irq_flags, tag); +} - mutex_lock(&notification_mutex); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) - notify_peer_device_state(NULL, 0, peer_device, NULL, - NOTIFY_DESTROY | NOTIFY_CONTINUES); - notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY); - mutex_unlock(&notification_mutex); +void __change_resync_susp_user(struct drbd_peer_device *peer_device, + bool value) +{ + peer_device->resync_susp_user[NEW] = value; +} - mutex_lock(&connection->resource->conf_update); - old_conf = connection->net_conf; - connection->my_addr_len = 0; - connection->peer_addr_len = 0; - RCU_INIT_POINTER(connection->net_conf, NULL); - conn_free_crypto(connection); - mutex_unlock(&connection->resource->conf_update); +enum drbd_state_rv change_resync_susp_user(struct drbd_peer_device *peer_device, + bool value, + enum chg_state_flags flags) +{ + struct drbd_resource *resource = 
peer_device->device->resource; + unsigned long irq_flags; - kvfree_rcu_mightsleep(old_conf); - } + begin_state_change(resource, &irq_flags, flags); + __change_resync_susp_user(peer_device, value); + return end_state_change(resource, &irq_flags, value ? "pause-sync" : "resume-sync"); +} - if (ns_max.susp_fen) { - /* case1: The outdate peer handler is successful: */ - if (ns_max.pdsk <= D_OUTDATED) { - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - if (test_bit(NEW_CUR_UUID, &device->flags)) { - drbd_uuid_new_current(device); - clear_bit(NEW_CUR_UUID, &device->flags); - } - } - rcu_read_unlock(); - spin_lock_irq(&connection->resource->req_lock); - _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING); - _conn_request_state(connection, - (union drbd_state) { { .susp_fen = 1 } }, - (union drbd_state) { { .susp_fen = 0 } }, - CS_VERBOSE); - spin_unlock_irq(&connection->resource->req_lock); - } - } - conn_md_sync(connection); - kref_put(&connection->kref, drbd_destroy_connection); +void __change_resync_susp_peer(struct drbd_peer_device *peer_device, + bool value) +{ + peer_device->resync_susp_peer[NEW] = value; +} - return 0; +void __change_resync_susp_dependency(struct drbd_peer_device *peer_device, + bool value) +{ + peer_device->resync_susp_dependency[NEW] = value; } -static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf) +static void log_current_uuids(struct drbd_device *device) { - enum chg_state_flags flags = ~0; struct drbd_peer_device *peer_device; - int vnr, first_vol = 1; - union drbd_dev_state os, cs = { - { .role = R_SECONDARY, - .peer = R_UNKNOWN, - .conn = connection->cstate, - .disk = D_DISKLESS, - .pdsk = D_UNKNOWN, - } }; + struct drbd_connection *connection; + char msg[120]; + int ret, pos = 0; rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device 
*device = peer_device->device; - os = device->state; - - if (first_vol) { - cs = os; - first_vol = 0; + for_each_peer_device_rcu(peer_device, device) { + if (peer_device->disk_state[NOW] != D_UP_TO_DATE) continue; + connection = peer_device->connection; + ret = snprintf(msg + pos, 120 - pos, "%s: %016llX ", + rcu_dereference(connection->transport.net_conf)->name, + peer_device->current_uuid); + if (ret > 0) + pos += ret; + if (pos >= 120) + break; + } + rcu_read_unlock(); + drbd_warn(device, "%s", msg); +} + +/* Returns true if drbd_data_accessible(device, NOW) holds for at least one device of @resource. */ +bool drbd_res_data_accessible(struct drbd_resource *resource) +{ + bool data_accessible = false; + struct drbd_device *device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, vnr) { + if (drbd_data_accessible(device, NOW)) { + data_accessible = true; + break; } + } + rcu_read_unlock(); - if (cs.role != os.role) - flags &= ~CS_DC_ROLE; + return data_accessible; +} - if (cs.peer != os.peer) - flags &= ~CS_DC_PEER; +/** + * calc_data_accessible() - returns whether up-to-date data is reachable + * + * @state_change: where to get the state information from + * @n_device: index into the devices array + * @which: OLD or NEW + * + * calc_data_accessible() returns true if either the local disk is up-to-date + * or one of the peers is. The related drbd_data_accessible() computes the same + * result from different inputs. 
+ */ +static bool calc_data_accessible(struct drbd_state_change *state_change, int n_device, + enum which_state which) +{ + struct drbd_device_state_change *device_state_change = &state_change->devices[n_device]; + enum drbd_disk_state *disk_state = device_state_change->disk_state; + int n_connection; - if (cs.conn != os.conn) - flags &= ~CS_DC_CONN; + if (disk_state[which] == D_UP_TO_DATE) + return true; - if (cs.disk != os.disk) - flags &= ~CS_DC_DISK; + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[ + n_device * state_change->n_connections + n_connection]; + struct drbd_peer_device *peer_device = peer_device_state_change->peer_device; + enum drbd_disk_state *peer_disk_state = peer_device_state_change->disk_state; + struct net_conf *nc; + bool allow_remote_read; - if (cs.pdsk != os.pdsk) - flags &= ~CS_DC_PDSK; + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->transport.net_conf); + allow_remote_read = !nc || nc->allow_remote_read; /* no net_conf: do not filter this peer */ + rcu_read_unlock(); + if (!allow_remote_read) + continue; + if (peer_disk_state[which] == D_UP_TO_DATE) + return true; } - rcu_read_unlock(); - *pf |= CS_DC_MASK; - *pf &= flags; - (*pcs).i = cs.i; + return false; } -static enum drbd_state_rv -conn_is_valid_transition(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, - enum chg_state_flags flags) +/** + * drbd_data_accessible() - returns whether up-to-date data is reachable + * + * @device: the device, the question is about + * @which: OLD, NEW, or NOW (Only use OLD within a state change!) + * + * drbd_data_accessible() returns true if either the local disk is up-to-date + * or one of the peers is. The related calc_data_accessible() computes the same + * result from different inputs. 
+ */ +bool drbd_data_accessible(struct drbd_device *device, enum which_state which) { - enum drbd_state_rv rv = SS_SUCCESS; - union drbd_state ns, os; struct drbd_peer_device *peer_device; - int vnr; + bool data_accessible = false; - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - os = drbd_read_state(device); - ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); - - if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) - ns.disk = os.disk; + if (device->disk_state[which] == D_UP_TO_DATE) + return true; - if (ns.i == os.i) + rcu_read_lock(); + for_each_peer_device_rcu(peer_device, device) { + struct net_conf *nc; + nc = rcu_dereference(peer_device->connection->transport.net_conf); + if (nc && !nc->allow_remote_read) continue; - - rv = is_valid_transition(os, ns); - - if (rv >= SS_SUCCESS && !(flags & CS_HARD)) { - rv = is_valid_state(device, ns); - if (rv < SS_SUCCESS) { - if (is_valid_state(device, os) == rv) - rv = is_valid_soft_transition(os, ns, connection); - } else - rv = is_valid_soft_transition(os, ns, connection); - } - - if (rv < SS_SUCCESS) { - if (flags & CS_VERBOSE) - print_st_err(device, os, ns, rv); + if (peer_device->disk_state[which] == D_UP_TO_DATE) { + data_accessible = true; break; } } rcu_read_unlock(); - return rv; + return data_accessible; } - -static void -conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, - union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) +/* drbd_data_accessible() and exposable_data_uuid() have the same structure. By intention. 
*/ +static u64 exposable_data_uuid(struct drbd_device *device) { - union drbd_state ns, os, ns_max = { }; - union drbd_state ns_min = { - { .role = R_MASK, - .peer = R_MASK, - .conn = val.conn, - .disk = D_MASK, - .pdsk = D_MASK - } }; struct drbd_peer_device *peer_device; - enum drbd_state_rv rv; - int vnr, number_of_volumes = 0; - - if (mask.conn == C_MASK) { - /* remember last connect time so request_timer_fn() won't - * kill newly established sessions while we are still trying to thaw - * previously frozen IO */ - if (connection->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS) - connection->last_reconnect_jif = jiffies; + u64 uuid = 0; - connection->cstate = val.conn; + if (get_ldev_if_state(device, D_UP_TO_DATE)) { + uuid = device->ldev->md.current_uuid; + put_ldev(device); + return uuid; } rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - number_of_volumes++; - os = drbd_read_state(device); - ns = apply_mask_val(os, mask, val); - ns = sanitize_state(device, os, ns, NULL); - - if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) - ns.disk = os.disk; - - rv = _drbd_set_state(device, ns, flags, NULL); - BUG_ON(rv < SS_SUCCESS); - ns.i = device->state.i; - ns_max.role = max_role(ns.role, ns_max.role); - ns_max.peer = max_role(ns.peer, ns_max.peer); - ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn); - ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk); - ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk); - - ns_min.role = min_role(ns.role, ns_min.role); - ns_min.peer = min_role(ns.peer, ns_min.peer); - ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn); - ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk); - ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk); + for_each_peer_device_rcu(peer_device, device) { + struct net_conf *nc; + nc = 
rcu_dereference(peer_device->connection->transport.net_conf); + if (nc && !nc->allow_remote_read) + continue; + if (peer_device->disk_state[NOW] == D_UP_TO_DATE && + (uuid & ~UUID_PRIMARY) != (peer_device->current_uuid & ~UUID_PRIMARY)) { + if (!uuid) { + uuid = peer_device->current_uuid; + continue; + } + drbd_err(device, "Multiple UpToDate peers have different current UUIDs\n"); + log_current_uuids(device); + } } rcu_read_unlock(); - if (number_of_volumes == 0) { - ns_min = ns_max = (union drbd_state) { { - .role = R_SECONDARY, - .peer = R_UNKNOWN, - .conn = val.conn, - .disk = D_DISKLESS, - .pdsk = D_UNKNOWN - } }; - } - - ns_min.susp = ns_max.susp = connection->resource->susp; - ns_min.susp_nod = ns_max.susp_nod = connection->resource->susp_nod; - ns_min.susp_fen = ns_max.susp_fen = connection->resource->susp_fen; - - *pns_min = ns_min; - *pns_max = ns_max; + return uuid; } -static enum drbd_state_rv -_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) +static void ensure_exposed_data_uuid(struct drbd_device *device) { - enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */; - - if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags)) - rv = SS_CW_SUCCESS; - - if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags)) - rv = SS_CW_FAILED_BY_PEER; + u64 uuid = exposable_data_uuid(device); - err = conn_is_valid_transition(connection, mask, val, 0); - if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) - return rv; + if (uuid) + drbd_uuid_set_exposed(device, uuid, true); - return err; } -enum drbd_state_rv -_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, - enum chg_state_flags flags) +/* Between 9.1.7 and 9.1.12 drbd was setting MDF_NODE_EXISTS for all peers. + * With that the flag got useless. It is a meta-data flag that persists. + * Clear it for all not configured nodes if we find it in every peer slot. 
+ */ +static void check_wrongly_set_mdf_exists(struct drbd_device *device) { - enum drbd_state_rv rv = SS_SUCCESS; - struct after_conn_state_chg_work *acscw; - enum drbd_conns oc = connection->cstate; - union drbd_state ns_max, ns_min, os; - bool have_mutex = false; - struct drbd_state_change *state_change; + struct drbd_resource *resource = device->resource; + const int my_node_id = resource->res_opts.node_id; + bool wrong = true; + int node_id; - if (mask.conn) { - rv = is_valid_conn_transition(oc, val.conn); - if (rv < SS_SUCCESS) - goto abort; - } + if (!get_ldev(device)) + return; - rv = conn_is_valid_transition(connection, mask, val, flags); - if (rv < SS_SUCCESS) - goto abort; - - if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING && - !(flags & (CS_LOCAL_ONLY | CS_HARD))) { - - /* This will be a cluster-wide state change. - * Need to give up the spinlock, grab the mutex, - * then send the state change request, ... */ - spin_unlock_irq(&connection->resource->req_lock); - mutex_lock(&connection->cstate_mutex); - have_mutex = true; - - set_bit(CONN_WD_ST_CHG_REQ, &connection->flags); - if (conn_send_state_req(connection, mask, val)) { - /* sending failed. */ - clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags); - rv = SS_CW_FAILED_BY_PEER; - /* need to re-aquire the spin lock, though */ - goto abort_unlocked; - } - - if (val.conn == C_DISCONNECTING) - set_bit(DISCONNECT_SENT, &connection->flags); - - /* ... and re-aquire the spinlock. - * If _conn_rq_cond() returned >= SS_SUCCESS, we must call - * conn_set_state() within the same spinlock. 
*/ - spin_lock_irq(&connection->resource->req_lock); - wait_event_lock_irq(connection->ping_wait, - (rv = _conn_rq_cond(connection, mask, val)), - connection->resource->req_lock); - clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags); - if (rv < SS_SUCCESS) - goto abort; - } - - state_change = remember_old_state(connection->resource, GFP_ATOMIC); - conn_old_common_state(connection, &os, &flags); - flags |= CS_DC_SUSP; - conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); - conn_pr_state_change(connection, os, ns_max, flags); - remember_new_state(state_change); - - acscw = kmalloc_obj(*acscw, GFP_ATOMIC); - if (acscw) { - acscw->oc = os.conn; - acscw->ns_min = ns_min; - acscw->ns_max = ns_max; - acscw->flags = flags; - acscw->w.cb = w_after_conn_state_ch; - kref_get(&connection->kref); - acscw->connection = connection; - acscw->state_change = state_change; - drbd_queue_work(&connection->sender_work, &acscw->w); - } else { - drbd_err(connection, "Could not kmalloc an acscw\n"); - } + rcu_read_lock(); - abort: - if (have_mutex) { - /* mutex_unlock() "... 
must not be used in interrupt context.", - * so give up the spinlock, then re-aquire it */ - spin_unlock_irq(&connection->resource->req_lock); - abort_unlocked: - mutex_unlock(&connection->cstate_mutex); - spin_lock_irq(&connection->resource->req_lock); - } - if (rv < SS_SUCCESS && flags & CS_VERBOSE) { - drbd_err(connection, "State change failed: %s\n", drbd_set_st_err_str(rv)); - drbd_err(connection, " mask = 0x%x val = 0x%x\n", mask.i, val.i); - drbd_err(connection, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn)); - } - return rv; -} + for (node_id = 0; node_id < DRBD_NODE_ID_MAX; node_id++) { + struct drbd_peer_device *peer_device = peer_device_by_node_id(device, node_id); + struct drbd_peer_md *peer_md = &device->ldev->md.peers[node_id]; -enum drbd_state_rv -conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, - enum chg_state_flags flags) -{ - enum drbd_state_rv rv; + if (!(peer_md->flags & MDF_NODE_EXISTS || peer_device || node_id == my_node_id)) { + wrong = false; + break; + } + } - spin_lock_irq(&connection->resource->req_lock); - rv = _conn_request_state(connection, mask, val, flags); - spin_unlock_irq(&connection->resource->req_lock); + if (wrong) { + for (node_id = 0; node_id < DRBD_NODE_ID_MAX; node_id++) { + struct drbd_peer_device *peer_device = peer_device_by_node_id(device, node_id); + struct drbd_peer_md *peer_md = &device->ldev->md.peers[node_id]; - return rv; + if (!peer_device) + peer_md->flags &= ~MDF_NODE_EXISTS; + } + if (!test_bit(WRONG_MDF_EXISTS, &resource->flags)) { + set_bit(WRONG_MDF_EXISTS, &resource->flags); + drbd_warn(resource, "Clearing excess MDF_NODE_EXISTS flags\n"); + } + } + rcu_read_unlock(); + put_ldev(device); } diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 75e671a3c5d1..eaaf1a9c641f 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -236,6 +236,7 @@ GENL_struct(DRBD_NLA_DEVICE_CONF, 14, 
device_conf, __u32_field_def(1, DRBD_F_INVARIANT, max_bio_size, DRBD_MAX_BIO_SIZE_DEF) __flg_field_def(2, 0 /* OPTIONAL */, intentional_diskless, DRBD_DISK_DISKLESS_DEF) __u32_field_def(3, 0 /* OPTIONAL */, block_size, DRBD_BLOCK_SIZE_DEF) + __u32_field_def(4, 0 /* OPTIONAL */, discard_granularity, DRBD_DISCARD_GRANULARITY_DEF) ) GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info, @@ -357,6 +358,7 @@ GENL_struct(DRBD_NLA_PEER_DEVICE_OPTS, 27, peer_device_conf, #if (PRO_FEATURES & DRBD_FF_RESYNC_WITHOUT_REPLICATION) || !defined(__KERNEL__) __flg_field_def(8, 0 /* OPTIONAL */, resync_without_replication, DRBD_RESYNC_WITHOUT_REPLICATION_DEF) #endif + __flg_field_def(9, 0 /* OPTIONAL */, peer_tiebreaker, DRBD_PEER_TIEBREAKER_DEF) ) GENL_struct(DRBD_NLA_PATH_PARMS, 28, path_parms, diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index ed38f94d43c6..bbcb5b0dc3be 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -313,6 +313,11 @@ #define DRBD_BLOCK_SIZE_DEF 512 #define DRBD_BLOCK_SIZE_SCALE '1' /* Bytes */ +#define DRBD_DISCARD_GRANULARITY_SCALE '1' /* Bytes */ +#define DRBD_DISCARD_GRANULARITY_MIN 0U /* 0 = disable discards */ +#define DRBD_DISCARD_GRANULARITY_MAX (128U<<20) /* 128 MiB, current DRBD_MAX_BATCH_BIO_SIZE */ +#define DRBD_DISCARD_GRANULARITY_DEF 0xFFFFFFFFU /* sentinel: not configured; use legacy behavior */ + /* By default freeze IO, if set error all IOs as quick as possible */ #define DRBD_ON_NO_QUORUM_DEF ONQ_SUSPEND_IO @@ -326,6 +331,8 @@ #define DRBD_LOAD_BALANCE_PATHS_DEF 0U +#define DRBD_PEER_TIEBREAKER_DEF 1U + #define DRBD_RDMA_CTRL_RCVBUF_SIZE_MIN 0U #define DRBD_RDMA_CTRL_RCVBUF_SIZE_MAX (10U<<20) #define DRBD_RDMA_CTRL_RCVBUF_SIZE_DEF 0 -- 2.53.0