--- zzzz-none-000/linux-3.10.107/net/ceph/osd_client.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/net/ceph/osd_client.c 2021-02-04 17:41:59.000000000 +0000 @@ -30,8 +30,11 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); static void __register_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); +static void __unregister_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); static void __unregister_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); +static void __enqueue_request(struct ceph_osd_request *req); static void __send_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); @@ -117,11 +120,13 @@ } #endif /* CONFIG_BLOCK */ -#define osd_req_op_data(oreq, whch, typ, fld) \ - ({ \ - BUG_ON(whch >= (oreq)->r_num_ops); \ - &(oreq)->r_ops[whch].typ.fld; \ - }) +#define osd_req_op_data(oreq, whch, typ, fld) \ +({ \ + struct ceph_osd_request *__oreq = (oreq); \ + unsigned int __whch = (whch); \ + BUG_ON(__whch >= __oreq->r_num_ops); \ + &__oreq->r_ops[__whch].typ.fld; \ +}) static struct ceph_osd_data * osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) @@ -282,6 +287,7 @@ switch (op->op) { case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: ceph_osd_data_release(&op->extent.osd_data); break; case CEPH_OSD_OP_CALL: @@ -289,6 +295,13 @@ ceph_osd_data_release(&op->cls.request_data); ceph_osd_data_release(&op->cls.response_data); break; + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + ceph_osd_data_release(&op->xattr.osd_data); + break; + case CEPH_OSD_OP_STAT: + ceph_osd_data_release(&op->raw_data_in); + break; default: break; } @@ -297,12 +310,21 @@ /* * requests */ -void ceph_osdc_release_request(struct kref *kref) +static void ceph_osdc_release_request(struct kref *kref) { - struct ceph_osd_request *req; + struct ceph_osd_request *req = container_of(kref, + struct ceph_osd_request, r_kref); unsigned int which; - req = container_of(kref, struct ceph_osd_request, r_kref); + dout("%s %p (r_request %p r_reply %p)\n", __func__, req, + req->r_request, req->r_reply); + WARN_ON(!RB_EMPTY_NODE(&req->r_node)); + WARN_ON(!list_empty(&req->r_req_lru_item)); + WARN_ON(!list_empty(&req->r_osd_item)); + WARN_ON(!list_empty(&req->r_linger_item)); + WARN_ON(!list_empty(&req->r_linger_osd_item)); + WARN_ON(req->r_osd); + if (req->r_request) ceph_msg_put(req->r_request); if (req->r_reply) { @@ -320,7 +342,22 @@ kmem_cache_free(ceph_osd_request_cache, req); } -EXPORT_SYMBOL(ceph_osdc_release_request); + +void ceph_osdc_get_request(struct ceph_osd_request *req) +{ + dout("%s %p (was %d)\n", __func__, req, + atomic_read(&req->r_kref.refcount)); + kref_get(&req->r_kref); +} +EXPORT_SYMBOL(ceph_osdc_get_request); + +void ceph_osdc_put_request(struct ceph_osd_request *req) +{ + dout("%s %p (was %d)\n", __func__, req, + atomic_read(&req->r_kref.refcount)); + kref_put(&req->r_kref, ceph_osdc_release_request); +} +EXPORT_SYMBOL(ceph_osdc_put_request); struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, @@ -338,7 +375,7 @@ msg_size = 4 + 4 + 8 + 8 + 4+8; msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pg_t */ - msg_size += 4 + MAX_OBJ_NAME_SIZE; + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); msg_size += 8; /* snapid */ msg_size += 8; /* snap_seq */ @@ -364,10 +401,13 @@ RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); - INIT_LIST_HEAD(&req->r_linger_osd); + INIT_LIST_HEAD(&req->r_linger_osd_item); INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); + req->r_base_oloc.pool = -1; + req->r_target_oloc.pool = -1; + /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); @@ -401,67 +441,9 @@ static bool osd_req_opcode_valid(u16 opcode) { switch (opcode) { - case CEPH_OSD_OP_READ: - case CEPH_OSD_OP_STAT: - case CEPH_OSD_OP_MAPEXT: - case CEPH_OSD_OP_MASKTRUNC: - case CEPH_OSD_OP_SPARSE_READ: - case CEPH_OSD_OP_NOTIFY: - case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_ASSERT_VER: - case CEPH_OSD_OP_WRITE: - case CEPH_OSD_OP_WRITEFULL: - case CEPH_OSD_OP_TRUNCATE: - case CEPH_OSD_OP_ZERO: - case CEPH_OSD_OP_DELETE: - case CEPH_OSD_OP_APPEND: - case CEPH_OSD_OP_STARTSYNC: - case CEPH_OSD_OP_SETTRUNC: - case CEPH_OSD_OP_TRIMTRUNC: - case CEPH_OSD_OP_TMAPUP: - case CEPH_OSD_OP_TMAPPUT: - case CEPH_OSD_OP_TMAPGET: - case CEPH_OSD_OP_CREATE: - case CEPH_OSD_OP_ROLLBACK: - case CEPH_OSD_OP_WATCH: - case CEPH_OSD_OP_OMAPGETKEYS: - case CEPH_OSD_OP_OMAPGETVALS: - case CEPH_OSD_OP_OMAPGETHEADER: - case CEPH_OSD_OP_OMAPGETVALSBYKEYS: - case CEPH_OSD_OP_OMAPSETVALS: - case CEPH_OSD_OP_OMAPSETHEADER: - case CEPH_OSD_OP_OMAPCLEAR: - case CEPH_OSD_OP_OMAPRMKEYS: - case CEPH_OSD_OP_OMAP_CMP: - case CEPH_OSD_OP_CLONERANGE: - case CEPH_OSD_OP_ASSERT_SRC_VERSION: - case CEPH_OSD_OP_SRC_CMPXATTR: - case CEPH_OSD_OP_GETXATTR: - case CEPH_OSD_OP_GETXATTRS: - case CEPH_OSD_OP_CMPXATTR: - case CEPH_OSD_OP_SETXATTR: - case CEPH_OSD_OP_SETXATTRS: - case CEPH_OSD_OP_RESETXATTRS: - case CEPH_OSD_OP_RMXATTR: - case CEPH_OSD_OP_PULL: - case CEPH_OSD_OP_PUSH: - case CEPH_OSD_OP_BALANCEREADS: - case CEPH_OSD_OP_UNBALANCEREADS: - case CEPH_OSD_OP_SCRUB: - case CEPH_OSD_OP_SCRUB_RESERVE: - case CEPH_OSD_OP_SCRUB_UNRESERVE: - case CEPH_OSD_OP_SCRUB_STOP: - case CEPH_OSD_OP_SCRUB_MAP: - case CEPH_OSD_OP_WRLOCK: - case CEPH_OSD_OP_WRUNLOCK: - case CEPH_OSD_OP_RDLOCK: - case CEPH_OSD_OP_RDUNLOCK: - case CEPH_OSD_OP_UPLOCK: - case CEPH_OSD_OP_DNLOCK: - case CEPH_OSD_OP_CALL: - case CEPH_OSD_OP_PGLS: - case CEPH_OSD_OP_PGLS_FILTER: - return true; +#define GENERATE_CASE(op, opcode, str) case CEPH_OSD_OP_##op: return true; +__CEPH_FORALL_OSD_OPS(GENERATE_CASE) +#undef GENERATE_CASE default: return false; } @@ -474,7 +456,7 @@ */ static struct ceph_osd_req_op * _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, - u16 opcode) + u16 opcode, u32 flags) { struct ceph_osd_req_op *op; @@ -484,14 +466,15 @@ op = &osd_req->r_ops[which]; memset(op, 0, sizeof (*op)); op->op = opcode; + op->flags = flags; return op; } void osd_req_op_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode) + unsigned int which, u16 opcode, u32 flags) { - (void)_osd_req_op_init(osd_req, which, opcode); + (void)_osd_req_op_init(osd_req, which, opcode, flags); } EXPORT_SYMBOL(osd_req_op_init); @@ -500,16 +483,19 @@ u64 offset, u64 length, u64 truncate_size, u32 truncate_seq) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); size_t payload_len = 0; - BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && + opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO && + opcode != CEPH_OSD_OP_TRUNCATE); op->extent.offset = offset; op->extent.length = length; op->extent.truncate_size = truncate_size; op->extent.truncate_seq = truncate_seq; - if (opcode == CEPH_OSD_OP_WRITE) + if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL) payload_len += length; op->payload_len = payload_len; @@ -538,7 +524,8 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; @@ -571,11 +558,46 @@ } EXPORT_SYMBOL(osd_req_op_cls_init); +int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode, const char *name, const void *value, + size_t size, u8 cmp_op, u8 cmp_mode) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); + struct ceph_pagelist *pagelist; + size_t payload_len; + + BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); + + pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + if (!pagelist) + return -ENOMEM; + + ceph_pagelist_init(pagelist); + + payload_len = strlen(name); + op->xattr.name_len = payload_len; + ceph_pagelist_append(pagelist, name, payload_len); + + op->xattr.value_len = size; + ceph_pagelist_append(pagelist, value, size); + payload_len += size; + + op->xattr.cmp_op = cmp_op; + op->xattr.cmp_mode = cmp_mode; + + ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); + op->payload_len = payload_len; + return 0; +} +EXPORT_SYMBOL(osd_req_op_xattr_init); + void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 cookie, u64 version, int flag) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); @@ -586,6 +608,27 @@ } EXPORT_SYMBOL(osd_req_op_watch_init); +void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, + unsigned int which, + u64 expected_object_size, + u64 expected_write_size) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + CEPH_OSD_OP_SETALLOCHINT, + 0); + + op->alloc_hint.expected_object_size = expected_object_size; + op->alloc_hint.expected_write_size = expected_write_size; + + /* + * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed + * not worth a feature bit. Set FAILOK per-op flag to make + * sure older osds don't trip over an unsupported opcode. + */ + op->flags |= CEPH_OSD_OP_FLAG_FAILOK; +} +EXPORT_SYMBOL(osd_req_op_alloc_hint_init); + static void ceph_osdc_msg_data_add(struct ceph_msg *msg, struct ceph_osd_data *osd_data) { @@ -631,7 +674,11 @@ break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: - if (src->op == CEPH_OSD_OP_WRITE) + case CEPH_OSD_OP_WRITEFULL: + case CEPH_OSD_OP_ZERO: + case CEPH_OSD_OP_TRUNCATE: + if (src->op == CEPH_OSD_OP_WRITE || + src->op == CEPH_OSD_OP_WRITEFULL) request_data_len = src->extent.length; dst->extent.offset = cpu_to_le64(src->extent.offset); dst->extent.length = cpu_to_le64(src->extent.length); @@ -640,7 +687,8 @@ dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); osd_data = &src->extent.osd_data; - if (src->op == CEPH_OSD_OP_WRITE) + if (src->op == CEPH_OSD_OP_WRITE || + src->op == CEPH_OSD_OP_WRITEFULL) ceph_osdc_msg_data_add(req->r_request, osd_data); else ceph_osdc_msg_data_add(req->r_reply, osd_data); @@ -673,6 +721,25 @@ dst->watch.ver = cpu_to_le64(src->watch.ver); dst->watch.flag = src->watch.flag; break; + case CEPH_OSD_OP_SETALLOCHINT: + dst->alloc_hint.expected_object_size = + cpu_to_le64(src->alloc_hint.expected_object_size); + dst->alloc_hint.expected_write_size = + cpu_to_le64(src->alloc_hint.expected_write_size); + break; + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); + dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); + dst->xattr.cmp_op = src->xattr.cmp_op; + dst->xattr.cmp_mode = src->xattr.cmp_mode; + osd_data = &src->xattr.osd_data; + ceph_osdc_msg_data_add(req->r_request, osd_data); + request_data_len = osd_data->pagelist->length; + break; + case CEPH_OSD_OP_CREATE: + case CEPH_OSD_OP_DELETE: + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); @@ -680,7 +747,9 @@ return 0; } + dst->op = cpu_to_le16(src->op); + dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->payload_len); return request_data_len; @@ -700,7 +769,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, struct ceph_vino vino, - u64 off, u64 *plen, int num_ops, + u64 off, u64 *plen, + unsigned int which, int num_ops, int opcode, int flags, struct ceph_snap_context *snapc, u32 truncate_seq, @@ -711,11 +781,11 @@ u64 objnum = 0; u64 objoff = 0; u64 objlen = 0; - u32 object_size; - u64 object_base; int r; - BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && + opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE && + opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE); req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); @@ -731,34 +801,29 @@ return ERR_PTR(r); } - object_size = le32_to_cpu(layout->fl_object_size); - object_base = off - objoff; - if (!(truncate_seq == 1 && truncate_size == -1ULL)) { - if (truncate_size <= object_base) { - truncate_size = 0; - } else { - truncate_size -= object_base; - if (truncate_size > object_size) - truncate_size = object_size; + if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { + osd_req_op_init(req, which, opcode, 0); + } else { + u32 object_size = le32_to_cpu(layout->fl_object_size); + u32 object_base = off - objoff; + if (!(truncate_seq == 1 && truncate_size == -1ULL)) { + if (truncate_size <= object_base) { + truncate_size = 0; + } else { + truncate_size -= object_base; + if (truncate_size > object_size) + truncate_size = object_size; + } } + osd_req_op_extent_init(req, which, opcode, objoff, objlen, + truncate_size, truncate_seq); } - osd_req_op_extent_init(req, 0, opcode, objoff, objlen, - truncate_size, truncate_seq); - - /* - * A second op in the ops array means the caller wants to - * also issue a include a 'startsync' command so that the - * osd will flush data quickly. - */ - if (num_ops > 1) - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); - - req->r_file_layout = *layout; /* keep a copy */ + req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", - vino.ino, objnum); - req->r_oid_len = strlen(req->r_oid); + snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), + "%llx.%08llx", vino.ino, objnum); + req->r_base_oid.name_len = strlen(req->r_base_oid.name); return req; } @@ -829,6 +894,37 @@ return NULL; } +static void __kick_linger_request(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd *osd = req->r_osd; + + /* + * Linger requests need to be resent with a new tid to avoid + * the dup op detection logic on the OSDs. Achieve this with + * a re-register dance instead of open-coding. + */ + ceph_osdc_get_request(req); + if (!list_empty(&req->r_linger_item)) + __unregister_linger_request(osdc, req); + else + __unregister_request(osdc, req); + __register_request(osdc, req); + ceph_osdc_put_request(req); + + /* + * Unless request has been registered as both normal and + * lingering, __unregister{,_linger}_request clears r_osd. + * However, here we need to preserve r_osd to make sure we + * requeue on the same OSD. + */ + WARN_ON(req->r_osd || !osd); + req->r_osd = osd; + + dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid); + __enqueue_request(req); +} + /* * Resubmit requests pending on the given osd. */ @@ -837,12 +933,14 @@ { struct ceph_osd_request *req, *nreq; LIST_HEAD(resend); + LIST_HEAD(resend_linger); int err; - dout("__kick_osd_requests osd%d\n", osd->o_osd); + dout("%s osd%d\n", __func__, osd->o_osd); err = __reset_osd(osdc, osd); if (err) return; + /* * Build up a list of requests to resend by traversing the * osd's list of requests. Requests for a given object are @@ -863,33 +961,32 @@ list_for_each_entry(req, &osd->o_requests, r_osd_item) { if (!req->r_sent) break; - list_move_tail(&req->r_req_lru_item, &resend); - dout("requeueing %p tid %llu osd%d\n", req, req->r_tid, - osd->o_osd); - if (!req->r_linger) + + if (!req->r_linger) { + dout("%s requeueing %p tid %llu\n", __func__, req, + req->r_tid); + list_move_tail(&req->r_req_lru_item, &resend); req->r_flags |= CEPH_OSD_FLAG_RETRY; + } else { + list_move_tail(&req->r_req_lru_item, &resend_linger); + } } list_splice(&resend, &osdc->req_unsent); /* - * Linger requests are re-registered before sending, which - * sets up a new tid for each. We add them to the unsent - * list at the end to keep things in tid order. + * Both registered and not yet registered linger requests are + * enqueued with a new tid on the same OSD. We add/move them + * to req_unsent/o_requests at the end to keep things in tid + * order. */ list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, - r_linger_osd) { - /* - * reregister request prior to unregistering linger so - * that r_osd is preserved. - */ - BUG_ON(!list_empty(&req->r_req_lru_item)); - __register_request(osdc, req); - list_add_tail(&req->r_req_lru_item, &osdc->req_unsent); - list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); - __unregister_linger_request(osdc, req); - dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, - osd->o_osd); + r_linger_osd_item) { + WARN_ON(!list_empty(&req->r_req_lru_item)); + __kick_linger_request(req); } + + list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item) + __kick_linger_request(req); } /* @@ -954,10 +1051,11 @@ { dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { + if (atomic_dec_and_test(&osd->o_ref)) { struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); + if (osd->o_auth.authorizer) + ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); kfree(osd); } } @@ -1002,10 +1100,21 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc, struct ceph_osd *osd) { - dout("__move_osd_to_lru %p\n", osd); + dout("%s %p\n", __func__, osd); BUG_ON(!list_empty(&osd->o_osd_lru)); + list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); - osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; + osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; +} + +static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + dout("%s %p\n", __func__, osd); + + if (list_empty(&osd->o_requests) && + list_empty(&osd->o_linger_requests)) + __move_osd_to_lru(osdc, osd); } static void __remove_osd_from_lru(struct ceph_osd *osd) @@ -1048,8 +1157,8 @@ !ceph_con_opened(&osd->o_con)) { struct ceph_osd_request *req; - dout(" osd addr hasn't changed and connection never opened," - " letting msgr retry"); + dout("osd addr hasn't changed and connection never opened, " + "letting msgr retry\n"); /* touch each r_stamp for handle_timeout()'s benfit */ list_for_each_entry(req, &osd->o_requests, r_osd_item) req->r_stamp = jiffies; @@ -1106,7 +1215,7 @@ static void __schedule_osd_timeout(struct ceph_osd_client *osdc) { schedule_delayed_work(&osdc->timeout_work, - osdc->client->options->osd_keepalive_timeout * HZ); + osdc->client->options->osd_keepalive_timeout); } static void __cancel_osd_timeout(struct ceph_osd_client *osdc) @@ -1147,6 +1256,7 @@ dout("__unregister_request %p tid %lld\n", req, req->r_tid); rb_erase(&req->r_node, &osdc->requests); + RB_CLEAR_NODE(&req->r_node); osdc->num_requests--; if (req->r_osd) { @@ -1154,12 +1264,8 @@ ceph_msg_revoke(req->r_request); list_del_init(&req->r_osd_item); - if (list_empty(&req->r_osd->o_requests) && - list_empty(&req->r_osd->o_linger_requests)) { - dout("moving osd to %p lru\n", req->r_osd); - __move_osd_to_lru(osdc, req->r_osd); - } - if (list_empty(&req->r_linger_item)) + maybe_move_osd_to_lru(osdc, req->r_osd); + if (list_empty(&req->r_linger_osd_item)) req->r_osd = NULL; } @@ -1186,45 +1292,39 @@ static void __register_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { - dout("__register_linger_request %p\n", req); + dout("%s %p tid %llu\n", __func__, req, req->r_tid); + WARN_ON(!req->r_linger); + ceph_osdc_get_request(req); list_add_tail(&req->r_linger_item, &osdc->req_linger); if (req->r_osd) - list_add_tail(&req->r_linger_osd, + list_add_tail(&req->r_linger_osd_item, &req->r_osd->o_linger_requests); } static void __unregister_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { - dout("__unregister_linger_request %p\n", req); + WARN_ON(!req->r_linger); + + if (list_empty(&req->r_linger_item)) { + dout("%s %p tid %llu not registered\n", __func__, req, + req->r_tid); + return; + } + + dout("%s %p tid %llu\n", __func__, req, req->r_tid); list_del_init(&req->r_linger_item); - if (req->r_osd) { - list_del_init(&req->r_linger_osd); - if (list_empty(&req->r_osd->o_requests) && - list_empty(&req->r_osd->o_linger_requests)) { - dout("moving osd to %p lru\n", req->r_osd); - __move_osd_to_lru(osdc, req->r_osd); - } + if (req->r_osd) { + list_del_init(&req->r_linger_osd_item); + maybe_move_osd_to_lru(osdc, req->r_osd); if (list_empty(&req->r_osd_item)) req->r_osd = NULL; } ceph_osdc_put_request(req); } -void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - mutex_lock(&osdc->request_mutex); - if (req->r_linger) { - req->r_linger = 0; - __unregister_linger_request(osdc, req); - } - mutex_unlock(&osdc->request_mutex); -} -EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); - void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { @@ -1252,6 +1352,61 @@ } /* + * Calculate mapping of a request to a PG. Takes tiering into account. + */ +static int __calc_request_pg(struct ceph_osdmap *osdmap, + struct ceph_osd_request *req, + struct ceph_pg *pg_out) +{ + bool need_check_tiering; + + need_check_tiering = false; + if (req->r_target_oloc.pool == -1) { + req->r_target_oloc = req->r_base_oloc; /* struct */ + need_check_tiering = true; + } + if (req->r_target_oid.name_len == 0) { + ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); + need_check_tiering = true; + } + + if (need_check_tiering && + (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + struct ceph_pg_pool_info *pi; + + pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); + if (pi) { + if ((req->r_flags & CEPH_OSD_FLAG_READ) && + pi->read_tier >= 0) + req->r_target_oloc.pool = pi->read_tier; + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + pi->write_tier >= 0) + req->r_target_oloc.pool = pi->write_tier; + } + /* !pi is caught in ceph_oloc_oid_to_pg() */ + } + + return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, + &req->r_target_oid, pg_out); +} + +static void __enqueue_request(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + + dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + + if (req->r_osd) { + __remove_osd_from_lru(req->r_osd); + list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); + list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); + } else { + list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); + } +} + +/* * Pick an osd (the first 'up' osd in the pg), allocate the osd struct * (as needed), and set the request r_osd appropriately. If there is * no up osd, set r_osd to NULL. Move the request to the appropriate list @@ -1266,24 +1421,22 @@ { struct ceph_pg pgid; int acting[CEPH_PG_MAX_SIZE]; - int o = -1, num = 0; + int num, o; int err; bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, - ceph_file_layout_pg_pool(req->r_file_layout)); + + err = __calc_request_pg(osdc->osdmap, req, &pgid); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; } req->r_pgid = pgid; - err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); - if (err > 0) { - o = acting[0]; - num = err; - } + num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); + if (num < 0) + num = 0; was_paused = req->r_paused; req->r_paused = __req_should_be_paused(osdc, req); @@ -1310,6 +1463,7 @@ if (req->r_osd) { __cancel_request(req); list_del_init(&req->r_osd_item); + list_del_init(&req->r_linger_osd_item); req->r_osd = NULL; } @@ -1330,13 +1484,7 @@ &osdc->osdmap->osd_addr[o]); } - if (req->r_osd) { - __remove_osd_from_lru(req->r_osd); - list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); - list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); - } else { - list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); - } + __enqueue_request(req); err = 1; /* osd or pg changed */ out: @@ -1358,7 +1506,7 @@ /* fill in message content that changes each time we send it */ put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_pgid.pool, req->r_request_pool); + put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); p = req->r_request_pgid; ceph_encode_64(&p, req->r_pgid.pool); ceph_encode_32(&p, req->r_pgid.seed); @@ -1389,6 +1537,40 @@ } /* + * Caller should hold map_sem for read and request_mutex. + */ +static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail) +{ + int rc; + + __register_request(osdc, req); + req->r_sent = 0; + req->r_got_reply = 0; + rc = __map_request(osdc, req, 0); + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; + } else { + __unregister_request(osdc, req); + } + return rc; + } + + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + __send_queued(osdc); + } + + return 0; +} + +/* * Timeout callback, called every N seconds when 1 or more osd * requests has been active for more than N seconds. When this * happens, we ping all OSDs with requests who have timed out to @@ -1401,10 +1583,9 @@ { struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, timeout_work.work); + struct ceph_options *opts = osdc->client->options; struct ceph_osd_request *req; struct ceph_osd *osd; - unsigned long keepalive = - osdc->client->options->osd_keepalive_timeout * HZ; struct list_head slow_osds; dout("timeout\n"); down_read(&osdc->map_sem); @@ -1420,7 +1601,8 @@ */ INIT_LIST_HEAD(&slow_osds); list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { - if (time_before(jiffies, req->r_stamp + keepalive)) + if (time_before(jiffies, + req->r_stamp + opts->osd_keepalive_timeout)) break; osd = req->r_osd; @@ -1447,8 +1629,7 @@ struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, osds_timeout_work.work); - unsigned long delay = - osdc->client->options->osd_idle_ttl * HZ >> 2; + unsigned long delay = osdc->client->options->osd_idle_ttl / 4; dout("osds timeout\n"); down_read(&osdc->map_sem); @@ -1459,6 +1640,109 @@ round_jiffies_relative(delay)); } +static int ceph_oloc_decode(void **p, void *end, + struct ceph_object_locator *oloc) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret = 0; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_v < 3) { + pr_warn("got v %d < 3 cv %d of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + if (struct_cv > 6) { + pr_warn("got v %d cv %d > 6 of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + oloc->pool = ceph_decode_64(p); + *p += 4; /* skip preferred */ + + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_object_locator::key is set\n"); + goto e_inval; + } + + if (struct_v >= 5) { + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_object_locator::nspace is set\n"); + goto e_inval; + } + } + + if (struct_v >= 6) { + s64 hash = ceph_decode_64(p); + if (hash != -1) { + pr_warn("ceph_object_locator::hash is set\n"); + goto e_inval; + } + } + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} + +static int ceph_redirect_decode(void **p, void *end, + struct ceph_request_redirect *redir) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_cv > 1) { + pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + ret = ceph_oloc_decode(p, end, &redir->oloc); + if (ret) + goto out; + + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_request_redirect::object_name is set\n"); + goto e_inval; + } + + len = ceph_decode_32(p); + *p += len; /* skip osd_instructions */ + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} + static void complete_request(struct ceph_osd_request *req) { complete_all(&req->r_safe_completion); /* fsync waiter */ @@ -1468,11 +1752,11 @@ * handle osd op reply. either call the callback if it is specified, * or do the completion to wake up the waiting thread. */ -static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, - struct ceph_connection *con) +static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) { void *p, *end; struct ceph_osd_request *req; + struct ceph_request_redirect redir; u64 tid; int object_len; unsigned int numops; @@ -1511,6 +1795,7 @@ osdmap_epoch = ceph_decode_32(&p); /* lookup */ + down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (req == NULL) { @@ -1542,8 +1827,8 @@ } bytes = le32_to_cpu(msg->hdr.data_len); if (payload_len != bytes) { - pr_warning("sum of op payload lens %d != data_len %d", - payload_len, bytes); + pr_warn("sum of op payload lens %d != data_len %d\n", + payload_len, bytes); goto bad_put; } @@ -1552,10 +1837,40 @@ for (i = 0; i < numops; i++) req->r_reply_op_result[i] = ceph_decode_32(&p); - already_completed = req->r_got_reply; + if (le16_to_cpu(msg->hdr.version) >= 6) { + p += 8 + 4; /* skip replay_version */ + p += 8; /* skip user_version */ + + err = ceph_redirect_decode(&p, end, &redir); + if (err) + goto bad_put; + } else { + redir.oloc.pool = -1; + } - if (!req->r_got_reply) { + if (redir.oloc.pool != -1) { + dout("redirect pool %lld\n", redir.oloc.pool); + + __unregister_request(osdc, req); + + req->r_target_oloc = redir.oloc; /* struct */ + + /* + * Start redirect requests with nofail=true. If + * mapping fails, request will end up on the notarget + * list, waiting for the new osdmap (which can take + * a while), even though the original request mapped + * successfully. In the future we might want to follow + * original request's nofail setting here. + */ + err = __ceph_osdc_start_request(osdc, req, true); + BUG_ON(err); + goto out_unlock; + } + + already_completed = req->r_got_reply; + if (!req->r_got_reply) { req->r_result = result; dout("handle_reply result %d bytes %d\n", req->r_result, bytes); @@ -1569,8 +1884,7 @@ req->r_got_reply = 1; } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { dout("handle_reply tid %llu dup ack\n", tid); - mutex_unlock(&osdc->request_mutex); - goto done; + goto out_unlock; } dout("handle_reply tid %llu flags %d\n", tid, flags); @@ -1585,6 +1899,7 @@ __unregister_request(osdc, req); mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); if (!already_completed) { if (req->r_unsafe_callback && @@ -1602,15 +1917,27 @@ complete_request(req); } -done: +out: dout("req=%p req->r_linger=%d\n", req, req->r_linger); ceph_osdc_put_request(req); return; +out_unlock: + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); + goto out; bad_put: + req->r_result = -EIO; + __unregister_request(osdc, req); + if (req->r_callback) + req->r_callback(req, msg); + else + complete_all(&req->r_completion); + complete_request(req); ceph_osdc_put_request(req); bad_mutex: mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); bad: pr_err("corrupt osd_op_reply got %d %d\n", (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); @@ -1824,7 +2151,7 @@ int skipped_map = 0; dout("taking full map %u len %d\n", epoch, maplen); - newmap = osdmap_decode(&p, p+maplen); + newmap = ceph_osdmap_decode(&p, p+maplen); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; @@ -1873,7 +2200,6 @@ pr_err("osdc handle_map corrupt msg\n"); ceph_msg_dump(msg); up_write(&osdc->map_sem); - return; } /* @@ -2050,29 +2376,23 @@ if (event) { event_work = kmalloc(sizeof(*event_work), GFP_NOIO); if (!event_work) { - dout("ERROR: could not allocate event_work\n"); - goto done_err; + pr_err("couldn't allocate event_work\n"); + ceph_osdc_put_event(event); + return; } INIT_WORK(&event_work->work, do_event_work); event_work->event = event; event_work->ver = ver; event_work->notify_id = notify_id; event_work->opcode = opcode; - if (!queue_work(osdc->notify_wq, &event_work->work)) { - dout("WARNING: failed to queue notify event work\n"); - goto done_err; - } + + queue_work(osdc->notify_wq, &event_work->work); } return; -done_err: - ceph_osdc_put_event(event); - return; - bad: pr_err("osdc handle_watch_notify corrupt msg\n"); - return; } /* @@ -2123,10 +2443,11 @@ ceph_encode_32(&p, -1); /* preferred */ /* oid */ - ceph_encode_32(&p, req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); - p += req->r_oid_len; + ceph_encode_32(&p, req->r_base_oid.name_len); + memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); + dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, + req->r_base_oid.name, req->r_base_oid.name_len); + p += req->r_base_oid.name_len; /* ops--can imply data */ ceph_encode_16(&p, (u16)req->r_num_ops); @@ -2180,39 +2501,40 @@ struct ceph_osd_request *req, bool nofail) { - int rc = 0; + int rc; down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); - __register_request(osdc, req); - req->r_sent = 0; - req->r_got_reply = 0; - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } else { - __unregister_request(osdc, req); - } - goto out_unlock; - } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } else { - __send_queued(osdc); - } - rc = 0; -out_unlock: + + rc = __ceph_osdc_start_request(osdc, req, nofail); + mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); + return rc; } EXPORT_SYMBOL(ceph_osdc_start_request); /* + * Unregister a registered request. The request is not completed (i.e. + * no callbacks or wakeups) - higher layers are supposed to know what + * they are canceling. + */ +void ceph_osdc_cancel_request(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + + mutex_lock(&osdc->request_mutex); + if (req->r_linger) + __unregister_linger_request(osdc, req); + __unregister_request(osdc, req); + mutex_unlock(&osdc->request_mutex); + + dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); +} +EXPORT_SYMBOL(ceph_osdc_cancel_request); + +/* * wait for a request to complete */ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, @@ -2220,18 +2542,18 @@ { int rc; + dout("%s %p tid %llu\n", __func__, req, req->r_tid); + rc = wait_for_completion_interruptible(&req->r_completion); if (rc < 0) { - mutex_lock(&osdc->request_mutex); - __cancel_request(req); - __unregister_request(osdc, req); - mutex_unlock(&osdc->request_mutex); + dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); + ceph_osdc_cancel_request(req); complete_request(req); - dout("wait_request tid %llu canceled/timed out\n", req->r_tid); return rc; } - dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); + dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, + req->r_result); return req->r_result; } EXPORT_SYMBOL(ceph_osdc_wait_request); @@ -2274,7 +2596,7 @@ * Call all pending notify callbacks - for use after a watch is * unregistered, to make sure no more callbacks for it will be invoked */ -extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) +void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) { flush_workqueue(osdc->notify_wq); } @@ -2311,7 +2633,7 @@ osdc->event_count = 0; schedule_delayed_work(&osdc->osds_timeout_work, - round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); + round_jiffies_relative(osdc->client->options->osd_idle_ttl)); err = -ENOMEM; osdc->req_mempool = mempool_create_kmalloc_pool(10, @@ -2333,9 +2655,12 @@ err = -ENOMEM; osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); if (!osdc->notify_wq) - goto out_msgpool; + goto out_msgpool_reply; + return 0; +out_msgpool_reply: + ceph_msgpool_destroy(&osdc->msgpool_op_reply); out_msgpool: ceph_msgpool_destroy(&osdc->msgpool_op); out_mempool: @@ -2375,7 +2700,7 @@ dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, vino.snap, off, *plen); - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1, + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, truncate_seq, truncate_size, false); @@ -2418,7 +2743,7 @@ int page_align = off & ~PAGE_MASK; BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1, + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, @@ -2483,7 +2808,7 @@ ceph_osdc_handle_map(osdc, msg); break; case CEPH_MSG_OSD_OPREPLY: - handle_reply(osdc, msg, con); + handle_reply(osdc, msg); break; case CEPH_MSG_WATCH_NOTIFY: handle_watch_notify(osdc, msg); @@ -2498,8 +2823,9 @@ } /* - * lookup and return message for incoming reply. set up reply message - * pages. + * Lookup and return message for incoming reply. Don't try to do + * anything about a larger than preallocated data portion of the + * message at the moment - for now, just skip the message. */ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, @@ -2509,7 +2835,7 @@ struct ceph_osd_client *osdc = osd->o_osdc; struct ceph_msg *m; struct ceph_osd_request *req; - int front = le32_to_cpu(hdr->front_len); + int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid; @@ -2517,59 +2843,42 @@ mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (!req) { - *skip = 1; + dout("%s osd%d tid %llu unknown, skipping\n", __func__, + osd->o_osd, tid); m = NULL; - dout("get_reply unknown tid %llu from osd%d\n", tid, - osd->o_osd); + *skip = 1; goto out; } - if (req->r_reply->con) - dout("%s revoking msg %p from old con %p\n", __func__, - req->r_reply, req->r_reply->con); ceph_msg_revoke_incoming(req->r_reply); - if (front > req->r_reply->front.iov_len) { - pr_warning("get_reply front %d > preallocated %d\n", - front, (int)req->r_reply->front.iov_len); - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); + if (front_len > req->r_reply->front_alloc_len) { + pr_warn("%s osd%d tid %llu front %d > preallocated %d\n", + __func__, osd->o_osd, req->r_tid, front_len, + req->r_reply->front_alloc_len); + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, + false); if (!m) goto out; ceph_msg_put(req->r_reply); req->r_reply = m; } - m = ceph_msg_get(req->r_reply); - if (data_len > 0) { - struct ceph_osd_data *osd_data; - - /* - * XXX This is assuming there is only one op containing - * XXX page data. Probably OK for reads, but this - * XXX ought to be done more generally. - */ - osd_data = osd_req_op_extent_osd_data(req, 0); - if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { - if (osd_data->pages && - unlikely(osd_data->length < data_len)) { - - pr_warning("tid %lld reply has %d bytes " - "we had only %llu bytes ready\n", - tid, data_len, osd_data->length); - *skip = 1; - ceph_msg_put(m); - m = NULL; - goto out; - } - } + if (data_len > req->r_reply->data_length) { + pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n", + __func__, osd->o_osd, req->r_tid, data_len, + req->r_reply->data_length); + m = NULL; + *skip = 1; + goto out; } - *skip = 0; + + m = ceph_msg_get(req->r_reply); dout("get_reply tid %lld %p\n", tid, m); out: mutex_unlock(&osdc->request_mutex); return m; - } static struct ceph_msg *alloc_msg(struct ceph_connection *con, @@ -2667,6 +2976,22 @@ return ceph_monc_validate_auth(&osdc->client->monc); } +static int osd_sign_message(struct ceph_msg *msg) +{ + struct ceph_osd *o = msg->con->private; + struct ceph_auth_handshake *auth = &o->o_auth; + + return ceph_auth_sign_message(auth, msg); +} + +static int osd_check_message_signature(struct ceph_msg *msg) +{ + struct ceph_osd *o = msg->con->private; + struct ceph_auth_handshake *auth = &o->o_auth; + + return ceph_auth_check_message_signature(auth, msg); +} + static const struct ceph_connection_operations osd_con_ops = { .get = get_osd_con, .put = put_osd_con, @@ -2675,5 +3000,7 @@ .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .alloc_msg = alloc_msg, + .sign_message = osd_sign_message, + .check_message_signature = osd_check_message_signature, .fault = osd_reset, };