Merge tag 'ceph-for-4.13-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "The main item here is support for v12.y.z ("Luminous") clusters:
  RESEND_ON_SPLIT, RADOS_BACKOFF, OSDMAP_PG_UPMAP and CRUSH_CHOOSE_ARGS
  feature bits, and various other changes in the RADOS client protocol.

  On top of that we have a new fsc mount option to allow supplying
  fscache uniquifier (similar to NFS) and the usual pile of filesystem
  fixes from Zheng"

* tag 'ceph-for-4.13-rc1' of git://github.com/ceph/ceph-client: (44 commits)
  libceph: advertise support for NEW_OSDOP_ENCODING and SERVER_LUMINOUS
  libceph: osd_state is 32 bits wide in luminous
  crush: remove an obsolete comment
  crush: crush_init_workspace starts with struct crush_work
  libceph, crush: per-pool crush_choose_arg_map for crush_do_rule()
  crush: implement weight and id overrides for straw2
  libceph: apply_upmap()
  libceph: compute actual pgid in ceph_pg_to_up_acting_osds()
  libceph: pg_upmap[_items] infrastructure
  libceph: ceph_decode_skip_* helpers
  libceph: kill __{insert,lookup,remove}_pg_mapping()
  libceph: introduce and switch to decode_pg_mapping()
  libceph: don't pass pgid by value
  libceph: respect RADOS_BACKOFF backoffs
  libceph: make DEFINE_RB_* helpers more general
  libceph: avoid unnecessary pi lookups in calc_target()
  libceph: use target pi for calc_target() calculations
  libceph: always populate t->target_{oid,oloc} in calc_target()
  libceph: make sure need_resend targets reflect latest map
  libceph: delete from need_resend_linger before check_linger_pool_dne()
  ...
This commit is contained in:
Linus Torvalds
2017-07-11 12:12:28 -07:00
28 changed files with 2319 additions and 487 deletions

View File

@@ -2,103 +2,174 @@
#define __CEPH_FEATURES
/*
* feature bits
* Each time we reclaim bits for reuse we need to specify another bit
* that, if present, indicates we have the new incarnation of that
* feature. Base case is 1 (first use).
*/
#define CEPH_FEATURE_UID (1ULL<<0)
#define CEPH_FEATURE_NOSRCADDR (1ULL<<1)
#define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2)
#define CEPH_FEATURE_FLOCK (1ULL<<3)
#define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4)
#define CEPH_FEATURE_MONNAMES (1ULL<<5)
#define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6)
#define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7)
#define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8)
#define CEPH_FEATURE_PGID64 (1ULL<<9)
#define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10)
#define CEPH_FEATURE_PGPOOL3 (1ULL<<11)
#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12)
#define CEPH_FEATURE_OSDENC (1ULL<<13)
#define CEPH_FEATURE_OMAP (1ULL<<14)
#define CEPH_FEATURE_MONENC (1ULL<<15)
#define CEPH_FEATURE_QUERY_T (1ULL<<16)
#define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17)
#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18)
#define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19)
#define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20)
#define CEPH_FEATURE_MON_GV (1ULL<<21)
#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22)
#define CEPH_FEATURE_MSG_AUTH (1ULL<<23)
#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24)
#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25)
#define CEPH_FEATURE_CREATEPOOLID (1ULL<<26)
#define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27)
#define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28)
#define CEPH_FEATURE_MDSENC (1ULL<<29)
#define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30)
#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31)
#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32)
#define CEPH_FEATURE_MON_SCRUB (1ULL<<33)
#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34)
#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35)
#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */
#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37)
#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38)
#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */
/* The process supports new-style OSDMap encoding. Monitors also use
this bit to determine if peers support NAK messages. */
#define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39)
#define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40)
#define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41)
#define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */
#define CEPH_FEATURE_MSGR_KEEPALIVE2 (1ULL<<42)
#define CEPH_FEATURE_OSD_POOLRESEND (1ULL<<43)
#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 (1ULL<<44)
#define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45)
#define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46)
#define CEPH_FEATURE_OSD_REPOP (1ULL<<46) /* overlap with fadvise */
#define CEPH_FEATURE_OSD_OBJECT_DIGEST (1ULL<<46) /* overlap with fadvise */
#define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */
#define CEPH_FEATURE_MDS_QUOTA (1ULL<<47)
#define CEPH_FEATURE_CRUSH_V4 (1ULL<<48) /* straw2 buckets */
#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
#define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */
#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */
#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */
#define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */
// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5
#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */
#define CEPH_FEATURE_FS_FILE_LAYOUT_V2 (1ULL<<58) /* file_layout_t */
#define CEPH_FEATURE_INCARNATION_1 (0ull)
#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL
#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \
const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \
const static uint64_t CEPH_FEATUREMASK_##name = \
(1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
/* this bit is ignored but still advertised by release *when* */
#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \
const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \
(1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
/*
* The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
* vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63
* to mean 33 bit ~0, and introduce a helper below to do the
* translation.
*
* This was introduced by ceph.git commit
* 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8
* and fixed by ceph.git commit
* 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c
* this bit is ignored by release *unused* and not advertised by
* release *unadvertised*
*/
#define CEPH_FEATURE_RESERVED (1ULL<<63)
#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised)
/*
* test for a feature. this test is safer than a typical mask against
* the bit because it ensures that we have the bit AND the marker for the
* bit's incarnation. this must be used in any case where the features
* bits may include an old meaning of the bit.
*/
#define CEPH_HAVE_FEATURE(x, name) \
(((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name))
/*
* Notes on deprecation:
*
* A *major* release is a release through which all upgrades must pass
* (e.g., jewel). For example, no pre-jewel server will ever talk to
* a post-jewel server (mon, osd, etc).
*
* For feature bits used *only* on the server-side:
*
* - In the first phase we indicate that a feature is DEPRECATED as of
* a particular release. This is the first major release X (say,
* jewel) that does not depend on its peers advertising the feature.
* That is, it safely assumes its peers all have the feature. We
* indicate this with the DEPRECATED macro. For example,
*
* DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL)
*
* because 10.2.z (jewel) did not care if its peers advertised this
* feature bit.
*
* - In the second phase we stop advertising the the bit and call it
* RETIRED. This can normally be done in the *next* major release
* following the one in which we marked the feature DEPRECATED. In
* the above example, for 12.0.z (luminous) we can say:
*
* DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
*
* - The bit can be reused in the first post-luminous release, 13.0.z
* (m).
*
* This ensures that no two versions who have different meanings for
* the bit ever speak to each other.
*/
DEFINE_CEPH_FEATURE( 0, 1, UID)
DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR)
DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE( 3, 1, FLOCK)
DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2)
DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ)
DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH)
DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR)
DEFINE_CEPH_FEATURE( 9, 1, PGID64)
DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP)
DEFINE_CEPH_FEATURE(11, 1, PGPOOL3)
DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX)
DEFINE_CEPH_FEATURE(13, 1, OSDENC)
DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL)
DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN)
DEFINE_CEPH_FEATURE(15, 1, MONENC)
DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES)
DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS)
DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap
DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap
DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap
DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap
DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH)
DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUNINOUS)
DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2)
DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE)
DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
DEFINE_CEPH_FEATURE(28, 2, SERVER_M)
DEFINE_CEPH_FEATURE(29, 1, MDSENC)
DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL)
DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS) // deprecate me
DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL)
DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2)
DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER)
DEFINE_CEPH_FEATURE(38, 1, OSD_ERASURE_CODES)
DEFINE_CEPH_FEATURE(38, 1, OSD_OSD_TMAP2OMAP) // overlap
DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC)
DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA)
DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3)
DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap
DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2)
DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND)
DEFINE_CEPH_FEATURE(44, 1, ERASURE_CODE_PLUGINS_V2)
DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS)
DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap
DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap
DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap
DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA)
DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4)
DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap
DEFINE_CEPH_FEATURE(50, 1, MON_METADATA)
DEFINE_CEPH_FEATURE(51, 1, OSD_BITWISE_HOBJ_SORT)
DEFINE_CEPH_FEATURE(52, 1, OSD_PROXY_WRITE_FEATURES)
DEFINE_CEPH_FEATURE(53, 1, ERASURE_CODE_PLUGINS_V3)
DEFINE_CEPH_FEATURE(54, 1, OSD_HITSET_GMT)
DEFINE_CEPH_FEATURE(55, 1, HAMMER_0_94_4)
DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING)
DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB)
DEFINE_CEPH_FEATURE(57, 1, MON_ROUTE_OSDMAP) // overlap
DEFINE_CEPH_FEATURE(57, 1, OSDSUBOP_NO_SNAPCONTEXT) // overlap
DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap
DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5)
DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap
DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap
DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING) // *do not share this bit*
DEFINE_CEPH_FEATURE(61, 1, RESERVED2) // unused, but slow down!
DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinal
DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
static inline u64 ceph_sanitize_features(u64 features)
{
if (features & CEPH_FEATURE_RESERVED) {
/* everything through OSD_SNAPMAPPER */
return 0x1ffffffffull;
} else {
return features;
}
}
/*
* Features supported.
@@ -113,6 +184,11 @@ static inline u64 ceph_sanitize_features(u64 features)
CEPH_FEATURE_PGPOOL3 | \
CEPH_FEATURE_OSDENC | \
CEPH_FEATURE_CRUSH_TUNABLES | \
CEPH_FEATURE_SERVER_LUMINOUS | \
CEPH_FEATURE_RESEND_ON_SPLIT | \
CEPH_FEATURE_RADOS_BACKOFF | \
CEPH_FEATURE_OSDMAP_PG_UPMAP | \
CEPH_FEATURE_CRUSH_CHOOSE_ARGS | \
CEPH_FEATURE_MSG_AUTH | \
CEPH_FEATURE_CRUSH_TUNABLES2 | \
CEPH_FEATURE_REPLY_CREATE_INODE | \
@@ -126,7 +202,11 @@ static inline u64 ceph_sanitize_features(u64 features)
CEPH_FEATURE_CRUSH_TUNABLES3 | \
CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
CEPH_FEATURE_MSGR_KEEPALIVE2 | \
CEPH_FEATURE_OSD_POOLRESEND | \
CEPH_FEATURE_CRUSH_V4 | \
CEPH_FEATURE_NEW_OSDOP_ENCODING | \
CEPH_FEATURE_SERVER_JEWEL | \
CEPH_FEATURE_MON_STATEFUL_SUB | \
CEPH_FEATURE_CRUSH_TUNABLES5 | \
CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING)

View File

@@ -147,6 +147,7 @@ struct ceph_dir_layout {
#define CEPH_MSG_OSD_OP 42
#define CEPH_MSG_OSD_OPREPLY 43
#define CEPH_MSG_WATCH_NOTIFY 44
#define CEPH_MSG_OSD_BACKOFF 61
/* watch-notify operations */

View File

@@ -132,6 +132,66 @@ bad:
return ERR_PTR(-ERANGE);
}
/*
* skip helpers
*/
#define ceph_decode_skip_n(p, end, n, bad) \
do { \
ceph_decode_need(p, end, n, bad); \
*p += n; \
} while (0)
#define ceph_decode_skip_64(p, end, bad) \
ceph_decode_skip_n(p, end, sizeof(u64), bad)
#define ceph_decode_skip_32(p, end, bad) \
ceph_decode_skip_n(p, end, sizeof(u32), bad)
#define ceph_decode_skip_16(p, end, bad) \
ceph_decode_skip_n(p, end, sizeof(u16), bad)
#define ceph_decode_skip_8(p, end, bad) \
ceph_decode_skip_n(p, end, sizeof(u8), bad)
#define ceph_decode_skip_string(p, end, bad) \
do { \
u32 len; \
\
ceph_decode_32_safe(p, end, len, bad); \
ceph_decode_skip_n(p, end, len, bad); \
} while (0)
#define ceph_decode_skip_set(p, end, type, bad) \
do { \
u32 len; \
\
ceph_decode_32_safe(p, end, len, bad); \
while (len--) \
ceph_decode_skip_##type(p, end, bad); \
} while (0)
#define ceph_decode_skip_map(p, end, ktype, vtype, bad) \
do { \
u32 len; \
\
ceph_decode_32_safe(p, end, len, bad); \
while (len--) { \
ceph_decode_skip_##ktype(p, end, bad); \
ceph_decode_skip_##vtype(p, end, bad); \
} \
} while (0)
#define ceph_decode_skip_map_of_map(p, end, ktype1, ktype2, vtype2, bad) \
do { \
u32 len; \
\
ceph_decode_32_safe(p, end, len, bad); \
while (len--) { \
ceph_decode_skip_##ktype1(p, end, bad); \
ceph_decode_skip_map(p, end, ktype2, vtype2, bad); \
} \
} while (0)
/*
* struct ceph_timespec <-> struct timespec
*/

View File

@@ -184,10 +184,11 @@ static inline int calc_pages_for(u64 off, u64 len)
(off >> PAGE_SHIFT);
}
/*
* These are not meant to be generic - an integer key is assumed.
*/
#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
#define RB_BYVAL(a) (a)
#define RB_BYPTR(a) (&(a))
#define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b))
#define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
static void insert_##name(struct rb_root *root, type *t) \
{ \
struct rb_node **n = &root->rb_node; \
@@ -197,11 +198,13 @@ static void insert_##name(struct rb_root *root, type *t) \
\
while (*n) { \
type *cur = rb_entry(*n, type, nodefld); \
int cmp; \
\
parent = *n; \
if (t->keyfld < cur->keyfld) \
cmp = cmpexp(keyexp(t->keyfld), keyexp(cur->keyfld)); \
if (cmp < 0) \
n = &(*n)->rb_left; \
else if (t->keyfld > cur->keyfld) \
else if (cmp > 0) \
n = &(*n)->rb_right; \
else \
BUG(); \
@@ -217,19 +220,24 @@ static void erase_##name(struct rb_root *root, type *t) \
RB_CLEAR_NODE(&t->nodefld); \
}
#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
extern type __lookup_##name##_key; \
static type *lookup_##name(struct rb_root *root, \
typeof(__lookup_##name##_key.keyfld) key) \
/*
* @lookup_param_type is a parameter and not constructed from (@type,
* @keyfld) with typeof() because adding const is too unwieldy.
*/
#define DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \
lookup_param_type, nodefld) \
static type *lookup_##name(struct rb_root *root, lookup_param_type key) \
{ \
struct rb_node *n = root->rb_node; \
\
while (n) { \
type *cur = rb_entry(n, type, nodefld); \
int cmp; \
\
if (key < cur->keyfld) \
cmp = cmpexp(key, keyexp(cur->keyfld)); \
if (cmp < 0) \
n = n->rb_left; \
else if (key > cur->keyfld) \
else if (cmp > 0) \
n = n->rb_right; \
else \
return cur; \
@@ -238,6 +246,23 @@ static type *lookup_##name(struct rb_root *root, \
return NULL; \
}
#define DEFINE_RB_FUNCS2(name, type, keyfld, cmpexp, keyexp, \
lookup_param_type, nodefld) \
DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \
lookup_param_type, nodefld)
/*
* Shorthands for integer keys.
*/
#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, nodefld)
#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
extern type __lookup_##name##_key; \
DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, \
typeof(__lookup_##name##_key.keyfld), nodefld)
#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \
DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)

View File

@@ -44,6 +44,8 @@ struct ceph_connection_operations {
struct ceph_msg_header *hdr,
int *skip);
void (*reencode_message) (struct ceph_msg *msg);
int (*sign_message) (struct ceph_msg *msg);
int (*check_message_signature) (struct ceph_msg *msg);
};

View File

@@ -1,6 +1,7 @@
#ifndef _FS_CEPH_OSD_CLIENT_H
#define _FS_CEPH_OSD_CLIENT_H
#include <linux/bitrev.h>
#include <linux/completion.h>
#include <linux/kref.h>
#include <linux/mempool.h>
@@ -36,6 +37,8 @@ struct ceph_osd {
struct ceph_connection o_con;
struct rb_root o_requests;
struct rb_root o_linger_requests;
struct rb_root o_backoff_mappings;
struct rb_root o_backoffs_by_id;
struct list_head o_osd_lru;
struct ceph_auth_handshake o_auth;
unsigned long lru_ttl;
@@ -136,7 +139,8 @@ struct ceph_osd_request_target {
struct ceph_object_id target_oid;
struct ceph_object_locator target_oloc;
struct ceph_pg pgid;
struct ceph_pg pgid; /* last raw pg we mapped to */
struct ceph_spg spgid; /* last actual spg we mapped to */
u32 pg_num;
u32 pg_num_mask;
struct ceph_osds acting;
@@ -148,6 +152,9 @@ struct ceph_osd_request_target {
unsigned int flags; /* CEPH_OSD_FLAG_* */
bool paused;
u32 epoch;
u32 last_force_resend;
int osd;
};
@@ -193,7 +200,6 @@ struct ceph_osd_request {
unsigned long r_stamp; /* jiffies, send or check time */
unsigned long r_start_stamp; /* jiffies */
int r_attempts;
u32 r_last_force_resend;
u32 r_map_dne_bound;
struct ceph_osd_req_op r_ops[];
@@ -203,6 +209,23 @@ struct ceph_request_redirect {
struct ceph_object_locator oloc;
};
/*
* osd request identifier
*
* caller name + incarnation# + tid to unique identify this request
*/
struct ceph_osd_reqid {
struct ceph_entity_name name;
__le64 tid;
__le32 inc;
} __packed;
struct ceph_blkin_trace_info {
__le64 trace_id;
__le64 span_id;
__le64 parent_span_id;
} __packed;
typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
u64 notifier_id, void *data, size_t data_len);
typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
@@ -221,7 +244,6 @@ struct ceph_osd_linger_request {
struct list_head pending_lworks;
struct ceph_osd_request_target t;
u32 last_force_resend;
u32 map_dne_bound;
struct timespec mtime;
@@ -256,6 +278,48 @@ struct ceph_watch_item {
struct ceph_entity_addr addr;
};
struct ceph_spg_mapping {
struct rb_node node;
struct ceph_spg spgid;
struct rb_root backoffs;
};
struct ceph_hobject_id {
void *key;
size_t key_len;
void *oid;
size_t oid_len;
u64 snapid;
u32 hash;
u8 is_max;
void *nspace;
size_t nspace_len;
s64 pool;
/* cache */
u32 hash_reverse_bits;
};
static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid)
{
hoid->hash_reverse_bits = bitrev32(hoid->hash);
}
/*
* PG-wide backoff: [begin, end)
* per-object backoff: begin == end
*/
struct ceph_osd_backoff {
struct rb_node spg_node;
struct rb_node id_node;
struct ceph_spg spgid;
u64 id;
struct ceph_hobject_id *begin;
struct ceph_hobject_id *end;
};
#define CEPH_LINGER_ID_START 0xffff000000000000ULL
struct ceph_osd_client {

View File

@@ -24,7 +24,15 @@ struct ceph_pg {
uint32_t seed;
};
#define CEPH_SPG_NOSHARD -1
struct ceph_spg {
struct ceph_pg pgid;
s8 shard;
};
int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
together */
@@ -135,10 +143,14 @@ struct ceph_pg_mapping {
struct {
int len;
int osds[];
} pg_temp;
} pg_temp, pg_upmap;
struct {
int osd;
} primary_temp;
struct {
int len;
int from_to[][2];
} pg_upmap_items;
};
};
@@ -150,13 +162,17 @@ struct ceph_osdmap {
u32 flags; /* CEPH_OSDMAP_* */
u32 max_osd; /* size of osd_state, _offload, _addr arrays */
u8 *osd_state; /* CEPH_OSD_* */
u32 *osd_state; /* CEPH_OSD_* */
u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
struct ceph_entity_addr *osd_addr;
struct rb_root pg_temp;
struct rb_root primary_temp;
/* remap (post-CRUSH, pre-up) */
struct rb_root pg_upmap; /* PG := raw set */
struct rb_root pg_upmap_items; /* from -> to within raw set */
u32 *osd_primary_affinity;
struct rb_root pg_pools;
@@ -187,7 +203,7 @@ static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
return !ceph_osd_is_up(map, osd);
}
extern char *ceph_osdmap_state_str(char *str, int len, int state);
char *ceph_osdmap_state_str(char *str, int len, u32 state);
extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
@@ -198,11 +214,13 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
return &map->osd_addr[osd];
}
#define CEPH_PGID_ENCODING_LEN (1 + 8 + 4 + 4)
static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
{
__u8 version;
if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) {
if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) {
pr_warn("incomplete pg encoding\n");
return -EINVAL;
}
@@ -240,6 +258,8 @@ static inline void ceph_osds_init(struct ceph_osds *set)
void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
u32 new_pg_num);
bool ceph_is_new_interval(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting,
const struct ceph_osds *old_up,
@@ -262,15 +282,24 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 off, u64 len,
u64 *bno, u64 *oxoff, u64 *oxlen);
int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
const struct ceph_object_id *oid,
const struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid);
int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
struct ceph_object_id *oid,
struct ceph_object_locator *oloc,
const struct ceph_object_id *oid,
const struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid);
void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid,
struct ceph_osds *up,
struct ceph_osds *acting);
bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid,
struct ceph_spg *spgid);
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
const struct ceph_pg *raw_pgid);

View File

@@ -439,6 +439,12 @@ enum {
const char *ceph_osd_watch_op_name(int o);
enum {
CEPH_OSD_BACKOFF_OP_BLOCK = 1,
CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
};
/*
* an individual object operation. each may be accompanied by some data
* payload

View File

@@ -2,6 +2,7 @@
#define CEPH_CRUSH_CRUSH_H
#ifdef __KERNEL__
# include <linux/rbtree.h>
# include <linux/types.h>
#else
# include "crush_compat.h"
@@ -137,6 +138,68 @@ struct crush_bucket {
};
/** @ingroup API
*
* Replacement weights for each item in a bucket. The size of the
* array must be exactly the size of the straw2 bucket, just as the
* item_weights array.
*
*/
struct crush_weight_set {
__u32 *weights; /*!< 16.16 fixed point weights
in the same order as items */
__u32 size; /*!< size of the __weights__ array */
};
/** @ingroup API
*
* Replacement weights and ids for a given straw2 bucket, for
* placement purposes.
*
* When crush_do_rule() chooses the Nth item from a straw2 bucket, the
* replacement weights found at __weight_set[N]__ are used instead of
* the weights from __item_weights__. If __N__ is greater than
* __weight_set_size__, the weights found at __weight_set_size-1__ are
* used instead. For instance if __weight_set__ is:
*
* [ [ 0x10000, 0x20000 ], // position 0
* [ 0x20000, 0x40000 ] ] // position 1
*
* choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
* choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
* choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
* etc.
*
*/
struct crush_choose_arg {
__s32 *ids; /*!< values to use instead of items */
__u32 ids_size; /*!< size of the __ids__ array */
struct crush_weight_set *weight_set; /*!< weight replacements for
a given position */
__u32 weight_set_size; /*!< size of the __weight_set__ array */
};
/** @ingroup API
*
* Replacement weights and ids for each bucket in the crushmap. The
* __size__ of the __args__ array must be exactly the same as the
* __map->max_buckets__.
*
* The __crush_choose_arg__ at index N will be used when choosing
* an item from the bucket __map->buckets[N]__ bucket, provided it
* is a straw2 bucket.
*
*/
struct crush_choose_arg_map {
#ifdef __KERNEL__
struct rb_node node;
u64 choose_args_index;
#endif
struct crush_choose_arg *args; /*!< replacement for each bucket
in the crushmap */
__u32 size; /*!< size of the __args__ array */
};
struct crush_bucket_uniform {
struct crush_bucket h;
__u32 item_weight; /* 16-bit fixed point; all items equally weighted */
@@ -236,6 +299,9 @@ struct crush_map {
__u32 allowed_bucket_algs;
__u32 *choose_tries;
#else
/* CrushWrapper::choose_args */
struct rb_root choose_args;
#endif
};

View File

@@ -11,11 +11,10 @@
#include "crush.h"
extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
extern int crush_do_rule(const struct crush_map *map,
int ruleno,
int x, int *result, int result_max,
const __u32 *weights, int weight_max,
void *cwin);
int crush_do_rule(const struct crush_map *map,
int ruleno, int x, int *result, int result_max,
const __u32 *weight, int weight_max,
void *cwin, const struct crush_choose_arg *choose_args);
/*
* Returns the exact amount of workspace that will need to be used