From 6f1a1d103b48b1533a9c804e7a069e2c8e937ce7 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 25 Mar 2020 11:47:06 +0100 Subject: [PATCH 001/427] ima: Switch to ima_hash_algo for boot aggregate boot_aggregate is the first entry of IMA measurement list. Its purpose is to link pre-boot measurements to IMA measurements. As IMA was designed to work with a TPM 1.2, the SHA1 PCR bank was always selected even if a TPM 2.0 with support for stronger hash algorithms is available. This patch first tries to find a PCR bank with the IMA default hash algorithm. If it does not find it, it selects the SHA256 PCR bank for TPM 2.0 and SHA1 for TPM 1.2. Ultimately, it selects SHA1 also for TPM 2.0 if the SHA256 PCR bank is not found. If none of the PCR banks above can be found, boot_aggregate file digest is filled with zeros, as for TPM bypass, making it impossible to perform a remote attestation of the system. Cc: stable@vger.kernel.org # 5.1.x Fixes: 879b589210a9 ("tpm: retrieve digest size of unknown algorithms with PCR read") Reported-by: Jerry Snitselaar Suggested-by: James Bottomley Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_crypto.c | 47 +++++++++++++++++++++++++---- security/integrity/ima/ima_init.c | 20 +++++++++--- 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 423c84f95a14..8e445a671225 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -655,18 +655,29 @@ static void __init ima_pcrread(u32 idx, struct tpm_digest *d) } /* - * Calculate the boot aggregate hash + * The boot_aggregate is a cumulative hash over TPM registers 0 - 7. With + * TPM 1.2 the boot_aggregate was based on reading the SHA1 PCRs, but with + * TPM 2.0 hash agility, TPM chips could support multiple TPM PCR banks, + * allowing firmware to configure and enable different banks. 
+ * + * Knowing which TPM bank is read to calculate the boot_aggregate digest + * needs to be conveyed to a verifier. For this reason, use the same + * hash algorithm for reading the TPM PCRs as for calculating the boot + * aggregate digest as stored in the measurement list. */ -static int __init ima_calc_boot_aggregate_tfm(char *digest, +static int __init ima_calc_boot_aggregate_tfm(char *digest, u16 alg_id, struct crypto_shash *tfm) { - struct tpm_digest d = { .alg_id = TPM_ALG_SHA1, .digest = {0} }; + struct tpm_digest d = { .alg_id = alg_id, .digest = {0} }; int rc; u32 i; SHASH_DESC_ON_STACK(shash, tfm); shash->tfm = tfm; + pr_devel("calculating the boot-aggregate based on TPM bank: %04x\n", + d.alg_id); + rc = crypto_shash_init(shash); if (rc != 0) return rc; @@ -675,7 +686,8 @@ static int __init ima_calc_boot_aggregate_tfm(char *digest, for (i = TPM_PCR0; i < TPM_PCR8; i++) { ima_pcrread(i, &d); /* now accumulate with current aggregate */ - rc = crypto_shash_update(shash, d.digest, TPM_DIGEST_SIZE); + rc = crypto_shash_update(shash, d.digest, + crypto_shash_digestsize(tfm)); } if (!rc) crypto_shash_final(shash, digest); @@ -685,14 +697,37 @@ static int __init ima_calc_boot_aggregate_tfm(char *digest, int __init ima_calc_boot_aggregate(struct ima_digest_data *hash) { struct crypto_shash *tfm; - int rc; + u16 crypto_id, alg_id; + int rc, i, bank_idx = -1; + + for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++) { + crypto_id = ima_tpm_chip->allocated_banks[i].crypto_id; + if (crypto_id == hash->algo) { + bank_idx = i; + break; + } + + if (crypto_id == HASH_ALGO_SHA256) + bank_idx = i; + + if (bank_idx == -1 && crypto_id == HASH_ALGO_SHA1) + bank_idx = i; + } + + if (bank_idx == -1) { + pr_err("No suitable TPM algorithm for boot aggregate\n"); + return 0; + } + + hash->algo = ima_tpm_chip->allocated_banks[bank_idx].crypto_id; tfm = ima_alloc_tfm(hash->algo); if (IS_ERR(tfm)) return PTR_ERR(tfm); hash->length = crypto_shash_digestsize(tfm); - rc = 
ima_calc_boot_aggregate_tfm(hash->digest, tfm); + alg_id = ima_tpm_chip->allocated_banks[bank_idx].alg_id; + rc = ima_calc_boot_aggregate_tfm(hash->digest, alg_id, tfm); ima_free_tfm(tfm); diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c index 567468188a61..fc1e1002b48d 100644 --- a/security/integrity/ima/ima_init.c +++ b/security/integrity/ima/ima_init.c @@ -25,7 +25,7 @@ struct tpm_chip *ima_tpm_chip; /* Add the boot aggregate to the IMA measurement list and extend * the PCR register. * - * Calculate the boot aggregate, a SHA1 over tpm registers 0-7, + * Calculate the boot aggregate, a hash over tpm registers 0-7, * assuming a TPM chip exists, and zeroes if the TPM chip does not * exist. Add the boot aggregate measurement to the measurement * list and extend the PCR register. @@ -49,15 +49,27 @@ static int __init ima_add_boot_aggregate(void) int violation = 0; struct { struct ima_digest_data hdr; - char digest[TPM_DIGEST_SIZE]; + char digest[TPM_MAX_DIGEST_SIZE]; } hash; memset(iint, 0, sizeof(*iint)); memset(&hash, 0, sizeof(hash)); iint->ima_hash = &hash.hdr; - iint->ima_hash->algo = HASH_ALGO_SHA1; - iint->ima_hash->length = SHA1_DIGEST_SIZE; + iint->ima_hash->algo = ima_hash_algo; + iint->ima_hash->length = hash_digest_size[ima_hash_algo]; + /* + * With TPM 2.0 hash agility, TPM chips could support multiple TPM + * PCR banks, allowing firmware to configure and enable different + * banks. The SHA1 bank is not necessarily enabled. + * + * Use the same hash algorithm for reading the TPM PCRs as for + * calculating the boot aggregate digest. Preference is given to + * the configured IMA default hash algorithm. Otherwise, use the + * TCG required banks - SHA256 for TPM 2.0, SHA1 for TPM 1.2. + * Ultimately select SHA1 also for TPM 2.0 if the SHA256 PCR bank + * is not found. 
+ */ if (ima_tpm_chip) { result = ima_calc_boot_aggregate(&hash.hdr); if (result < 0) { From e144d6b265415ddbdc54b3f17f4f95133effa5a8 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 25 Mar 2020 11:47:07 +0100 Subject: [PATCH 002/427] ima: Evaluate error in init_ima() Evaluate error in init_ima() before register_blocking_lsm_notifier() and return if not zero. Cc: stable@vger.kernel.org # 5.3.x Fixes: b16942455193 ("ima: use the lsm policy update notifier") Signed-off-by: Roberto Sassu Reviewed-by: James Morris Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 9d0abedeae77..f96f151294e6 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -792,6 +792,9 @@ static int __init init_ima(void) error = ima_init(); } + if (error) + return error; + error = register_blocking_lsm_notifier(&ima_lsm_policy_notifier); if (error) pr_warn("Couldn't register LSM notifier, error %d\n", error); From 7ca79645a1f8837c3850b881a2c0b43cfba5dc36 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 25 Mar 2020 11:47:08 +0100 Subject: [PATCH 003/427] ima: Store template digest directly in ima_template_entry In preparation for the patch that calculates a digest for each allocated PCR bank, this patch passes to ima_calc_field_array_hash() the ima_template_entry structure, so that digests can be directly stored in that structure instead of ima_digest_data. 
Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima.h | 3 +-- security/integrity/ima/ima_api.c | 12 +----------- security/integrity/ima/ima_crypto.c | 18 +++++++----------- 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 64317d95363e..a2dfe24e04c7 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -138,8 +138,7 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash); int ima_calc_buffer_hash(const void *buf, loff_t len, struct ima_digest_data *hash); int ima_calc_field_array_hash(struct ima_field_data *field_data, - struct ima_template_desc *desc, int num_fields, - struct ima_digest_data *hash); + struct ima_template_entry *entry); int __init ima_calc_boot_aggregate(struct ima_digest_data *hash); void ima_add_violation(struct file *file, const unsigned char *filename, struct integrity_iint_cache *iint, diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index f6bc00914aa5..2ef5a40c7ca5 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -96,26 +96,16 @@ int ima_store_template(struct ima_template_entry *entry, static const char audit_cause[] = "hashing_error"; char *template_name = entry->template_desc->name; int result; - struct { - struct ima_digest_data hdr; - char digest[TPM_DIGEST_SIZE]; - } hash; if (!violation) { - int num_fields = entry->template_desc->num_fields; - - /* this function uses default algo */ - hash.hdr.algo = HASH_ALGO_SHA1; result = ima_calc_field_array_hash(&entry->template_data[0], - entry->template_desc, - num_fields, &hash.hdr); + entry); if (result < 0) { integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, template_name, op, audit_cause, result, 0); return result; } - memcpy(entry->digest, hash.hdr.digest, hash.hdr.length); } entry->pcr = pcr; result = ima_add_template_entry(entry, violation, op, inode, filename); diff 
--git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 8e445a671225..03d73a4009ab 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -464,18 +464,16 @@ out: * Calculate the hash of template data */ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data, - struct ima_template_desc *td, - int num_fields, - struct ima_digest_data *hash, + struct ima_template_entry *entry, struct crypto_shash *tfm) { SHASH_DESC_ON_STACK(shash, tfm); + struct ima_template_desc *td = entry->template_desc; + int num_fields = entry->template_desc->num_fields; int rc, i; shash->tfm = tfm; - hash->length = crypto_shash_digestsize(tfm); - rc = crypto_shash_init(shash); if (rc != 0) return rc; @@ -504,24 +502,22 @@ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data, } if (!rc) - rc = crypto_shash_final(shash, hash->digest); + rc = crypto_shash_final(shash, entry->digest); return rc; } int ima_calc_field_array_hash(struct ima_field_data *field_data, - struct ima_template_desc *desc, int num_fields, - struct ima_digest_data *hash) + struct ima_template_entry *entry) { struct crypto_shash *tfm; int rc; - tfm = ima_alloc_tfm(hash->algo); + tfm = ima_alloc_tfm(HASH_ALGO_SHA1); if (IS_ERR(tfm)) return PTR_ERR(tfm); - rc = ima_calc_field_array_hash_tfm(field_data, desc, num_fields, - hash, tfm); + rc = ima_calc_field_array_hash_tfm(field_data, entry, tfm); ima_free_tfm(tfm); From aa724fe18a8a8285d0071c3bfc932efb090d142d Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 25 Mar 2020 11:47:09 +0100 Subject: [PATCH 004/427] ima: Switch to dynamically allocated buffer for template digests This patch dynamically allocates the array of tpm_digest structures in ima_alloc_init_template() and ima_restore_template_data(). 
The size of the array is equal to the number of PCR banks plus ima_extra_slots, to make room for SHA1 and the IMA default hash algorithm, when PCR banks with those algorithms are not allocated. Calculating the SHA1 digest is mandatory, as SHA1 still remains the default hash algorithm for the measurement list. When IMA supports the Crypto Agile format, the remaining digests will also be provided. The position in the measurement entry array of the SHA1 digest is stored in the ima_sha1_idx global variable and is determined at IMA initialization time. Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima.h | 6 +++++- security/integrity/ima/ima_api.c | 10 ++++++++++ security/integrity/ima/ima_crypto.c | 10 +++++++++- security/integrity/ima/ima_fs.c | 4 ++-- security/integrity/ima/ima_queue.c | 10 ++++++---- security/integrity/ima/ima_template.c | 15 +++++++++++++-- 6 files changed, 45 insertions(+), 10 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index a2dfe24e04c7..2a7ed68e6414 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -45,11 +45,15 @@ enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8 }; #define IMA_TEMPLATE_IMA_NAME "ima" #define IMA_TEMPLATE_IMA_FMT "d|n" +#define NR_BANKS(chip) ((chip != NULL) ? 
chip->nr_allocated_banks : 0) + /* current content of the policy */ extern int ima_policy_flag; /* set during initialization */ extern int ima_hash_algo; +extern int ima_sha1_idx __ro_after_init; +extern int ima_extra_slots __ro_after_init; extern int ima_appraise; extern struct tpm_chip *ima_tpm_chip; @@ -92,7 +96,7 @@ struct ima_template_desc { struct ima_template_entry { int pcr; - u8 digest[TPM_DIGEST_SIZE]; /* sha1 or md5 measurement hash */ + struct tpm_digest *digests; struct ima_template_desc *template_desc; /* template descriptor */ u32 template_data_len; struct ima_field_data template_data[0]; /* template related data */ diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index 2ef5a40c7ca5..78e0b0a7723e 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -27,6 +27,7 @@ void ima_free_template_entry(struct ima_template_entry *entry) for (i = 0; i < entry->template_desc->num_fields; i++) kfree(entry->template_data[i].data); + kfree(entry->digests); kfree(entry); } @@ -38,6 +39,7 @@ int ima_alloc_init_template(struct ima_event_data *event_data, struct ima_template_desc *desc) { struct ima_template_desc *template_desc; + struct tpm_digest *digests; int i, result = 0; if (desc) @@ -50,6 +52,14 @@ int ima_alloc_init_template(struct ima_event_data *event_data, if (!*entry) return -ENOMEM; + digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, + sizeof(*digests), GFP_NOFS); + if (!digests) { + result = -ENOMEM; + goto out; + } + + (*entry)->digests = digests; (*entry)->template_desc = template_desc; for (i = 0; i < template_desc->num_fields; i++) { const struct ima_template_field *field = diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 03d73a4009ab..fe02eb28b32b 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -57,6 +57,13 @@ MODULE_PARM_DESC(ahash_bufsize, "Maximum ahash buffer size"); static struct 
crypto_shash *ima_shash_tfm; static struct crypto_ahash *ima_ahash_tfm; +int ima_sha1_idx __ro_after_init; +/* + * Additional number of slots reserved, as needed, for SHA1 + * and IMA default algo. + */ +int ima_extra_slots __ro_after_init = 1; + int __init ima_init_crypto(void) { long rc; @@ -502,7 +509,8 @@ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data, } if (!rc) - rc = crypto_shash_final(shash, entry->digest); + rc = crypto_shash_final(shash, + entry->digests[ima_sha1_idx].digest); return rc; } diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index a71e822a6e92..8b030a1c5e0d 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -150,7 +150,7 @@ int ima_measurements_show(struct seq_file *m, void *v) ima_putc(m, &pcr, sizeof(e->pcr)); /* 2nd: template digest */ - ima_putc(m, e->digest, TPM_DIGEST_SIZE); + ima_putc(m, e->digests[ima_sha1_idx].digest, TPM_DIGEST_SIZE); /* 3rd: template name size */ namelen = !ima_canonical_fmt ? 
strlen(template_name) : @@ -233,7 +233,7 @@ static int ima_ascii_measurements_show(struct seq_file *m, void *v) seq_printf(m, "%2d ", e->pcr); /* 2nd: SHA1 template hash */ - ima_print_digest(m, e->digest, TPM_DIGEST_SIZE); + ima_print_digest(m, e->digests[ima_sha1_idx].digest, TPM_DIGEST_SIZE); /* 3th: template name */ seq_printf(m, " %s", template_name); diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c index 8753212ddb18..49db71c200b4 100644 --- a/security/integrity/ima/ima_queue.c +++ b/security/integrity/ima/ima_queue.c @@ -55,7 +55,8 @@ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value, key = ima_hash_key(digest_value); rcu_read_lock(); hlist_for_each_entry_rcu(qe, &ima_htable.queue[key], hnext) { - rc = memcmp(qe->entry->digest, digest_value, TPM_DIGEST_SIZE); + rc = memcmp(qe->entry->digests[ima_sha1_idx].digest, + digest_value, TPM_DIGEST_SIZE); if ((rc == 0) && (qe->entry->pcr == pcr)) { ret = qe; break; @@ -75,7 +76,7 @@ static int get_binary_runtime_size(struct ima_template_entry *entry) int size = 0; size += sizeof(u32); /* pcr */ - size += sizeof(entry->digest); + size += TPM_DIGEST_SIZE; size += sizeof(int); /* template name size field */ size += strlen(entry->template_desc->name); size += sizeof(entry->template_data_len); @@ -107,7 +108,7 @@ static int ima_add_digest_entry(struct ima_template_entry *entry, atomic_long_inc(&ima_htable.len); if (update_htable) { - key = ima_hash_key(entry->digest); + key = ima_hash_key(entry->digests[ima_sha1_idx].digest); hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); } @@ -171,7 +172,8 @@ int ima_add_template_entry(struct ima_template_entry *entry, int violation, mutex_lock(&ima_extend_list_mutex); if (!violation) { - memcpy(digest, entry->digest, sizeof(digest)); + memcpy(digest, entry->digests[ima_sha1_idx].digest, + sizeof(digest)); if (ima_lookup_digest_entry(digest, entry->pcr)) { audit_cause = "hash_exists"; result = -EEXIST; diff --git 
a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index 062d9ad49afb..de84252e65e9 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -301,6 +301,7 @@ static int ima_restore_template_data(struct ima_template_desc *template_desc, int template_data_size, struct ima_template_entry **entry) { + struct tpm_digest *digests; int ret = 0; int i; @@ -309,11 +310,21 @@ static int ima_restore_template_data(struct ima_template_desc *template_desc, if (!*entry) return -ENOMEM; + digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, + sizeof(*digests), GFP_NOFS); + if (!digests) { + kfree(*entry); + return -ENOMEM; + } + + (*entry)->digests = digests; + ret = ima_parse_buf(template_data, template_data + template_data_size, NULL, template_desc->num_fields, (*entry)->template_data, NULL, NULL, ENFORCE_FIELDS | ENFORCE_BUFEND, "template data"); if (ret < 0) { + kfree((*entry)->digests); kfree(*entry); return ret; } @@ -445,8 +456,8 @@ int ima_restore_measurement_list(loff_t size, void *buf) if (ret < 0) break; - memcpy(entry->digest, hdr[HDR_DIGEST].data, - hdr[HDR_DIGEST].len); + memcpy(entry->digests[ima_sha1_idx].digest, + hdr[HDR_DIGEST].data, hdr[HDR_DIGEST].len); entry->pcr = !ima_canonical_fmt ? *(hdr[HDR_PCR].data) : le32_to_cpu(*(hdr[HDR_PCR].data)); ret = ima_restore_measurement_entry(entry); From 6d94809af6b0830c4dfcad661535a5939bcb8a7d Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 25 Mar 2020 11:52:48 +0100 Subject: [PATCH 005/427] ima: Allocate and initialize tfm for each PCR bank This patch creates a crypto_shash structure for each allocated PCR bank and for SHA1 if a bank with that algorithm is not currently allocated. 
Reported-by: kbuild test robot Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_crypto.c | 145 +++++++++++++++++++++++----- 1 file changed, 119 insertions(+), 26 deletions(-) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index fe02eb28b32b..ab1c05ad1314 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -57,14 +57,21 @@ MODULE_PARM_DESC(ahash_bufsize, "Maximum ahash buffer size"); static struct crypto_shash *ima_shash_tfm; static struct crypto_ahash *ima_ahash_tfm; +struct ima_algo_desc { + struct crypto_shash *tfm; + enum hash_algo algo; +}; + int ima_sha1_idx __ro_after_init; /* * Additional number of slots reserved, as needed, for SHA1 * and IMA default algo. */ -int ima_extra_slots __ro_after_init = 1; +int ima_extra_slots __ro_after_init; -int __init ima_init_crypto(void) +static struct ima_algo_desc *ima_algo_array; + +static int __init ima_init_ima_crypto(void) { long rc; @@ -83,26 +90,121 @@ int __init ima_init_crypto(void) static struct crypto_shash *ima_alloc_tfm(enum hash_algo algo) { struct crypto_shash *tfm = ima_shash_tfm; - int rc; + int rc, i; if (algo < 0 || algo >= HASH_ALGO__LAST) algo = ima_hash_algo; - if (algo != ima_hash_algo) { - tfm = crypto_alloc_shash(hash_algo_name[algo], 0, 0); - if (IS_ERR(tfm)) { - rc = PTR_ERR(tfm); - pr_err("Can not allocate %s (reason: %d)\n", - hash_algo_name[algo], rc); - } + if (algo == ima_hash_algo) + return tfm; + + for (i = 0; i < NR_BANKS(ima_tpm_chip) + ima_extra_slots; i++) + if (ima_algo_array[i].tfm && ima_algo_array[i].algo == algo) + return ima_algo_array[i].tfm; + + tfm = crypto_alloc_shash(hash_algo_name[algo], 0, 0); + if (IS_ERR(tfm)) { + rc = PTR_ERR(tfm); + pr_err("Can not allocate %s (reason: %d)\n", + hash_algo_name[algo], rc); } return tfm; } +int __init ima_init_crypto(void) +{ + enum hash_algo algo; + long rc; + int i; + + rc = ima_init_ima_crypto(); + if (rc) + return 
rc; + + ima_sha1_idx = -1; + + for (i = 0; i < NR_BANKS(ima_tpm_chip); i++) { + algo = ima_tpm_chip->allocated_banks[i].crypto_id; + if (algo == HASH_ALGO_SHA1) + ima_sha1_idx = i; + } + + if (ima_sha1_idx < 0) + ima_sha1_idx = NR_BANKS(ima_tpm_chip) + ima_extra_slots++; + + ima_algo_array = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, + sizeof(*ima_algo_array), GFP_KERNEL); + if (!ima_algo_array) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < NR_BANKS(ima_tpm_chip); i++) { + algo = ima_tpm_chip->allocated_banks[i].crypto_id; + ima_algo_array[i].algo = algo; + + /* unknown TPM algorithm */ + if (algo == HASH_ALGO__LAST) + continue; + + if (algo == ima_hash_algo) { + ima_algo_array[i].tfm = ima_shash_tfm; + continue; + } + + ima_algo_array[i].tfm = ima_alloc_tfm(algo); + if (IS_ERR(ima_algo_array[i].tfm)) { + if (algo == HASH_ALGO_SHA1) { + rc = PTR_ERR(ima_algo_array[i].tfm); + ima_algo_array[i].tfm = NULL; + goto out_array; + } + + ima_algo_array[i].tfm = NULL; + } + } + + if (ima_sha1_idx >= NR_BANKS(ima_tpm_chip)) { + if (ima_hash_algo == HASH_ALGO_SHA1) { + ima_algo_array[ima_sha1_idx].tfm = ima_shash_tfm; + } else { + ima_algo_array[ima_sha1_idx].tfm = + ima_alloc_tfm(HASH_ALGO_SHA1); + if (IS_ERR(ima_algo_array[ima_sha1_idx].tfm)) { + rc = PTR_ERR(ima_algo_array[ima_sha1_idx].tfm); + goto out_array; + } + } + + ima_algo_array[ima_sha1_idx].algo = HASH_ALGO_SHA1; + } + + return 0; +out_array: + for (i = 0; i < NR_BANKS(ima_tpm_chip) + ima_extra_slots; i++) { + if (!ima_algo_array[i].tfm || + ima_algo_array[i].tfm == ima_shash_tfm) + continue; + + crypto_free_shash(ima_algo_array[i].tfm); + } +out: + crypto_free_shash(ima_shash_tfm); + return rc; +} + static void ima_free_tfm(struct crypto_shash *tfm) { - if (tfm != ima_shash_tfm) - crypto_free_shash(tfm); + int i; + + if (tfm == ima_shash_tfm) + return; + + for (i = 0; i < NR_BANKS(ima_tpm_chip) + ima_extra_slots; i++) + if (ima_algo_array[i].tfm == tfm) + return; + + crypto_free_shash(tfm); } /** 
@@ -472,14 +574,14 @@ out: */ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data, struct ima_template_entry *entry, - struct crypto_shash *tfm) + int tfm_idx) { - SHASH_DESC_ON_STACK(shash, tfm); + SHASH_DESC_ON_STACK(shash, ima_algo_array[tfm_idx].tfm); struct ima_template_desc *td = entry->template_desc; int num_fields = entry->template_desc->num_fields; int rc, i; - shash->tfm = tfm; + shash->tfm = ima_algo_array[tfm_idx].tfm; rc = crypto_shash_init(shash); if (rc != 0) @@ -509,8 +611,7 @@ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data, } if (!rc) - rc = crypto_shash_final(shash, - entry->digests[ima_sha1_idx].digest); + rc = crypto_shash_final(shash, entry->digests[tfm_idx].digest); return rc; } @@ -518,17 +619,9 @@ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data, int ima_calc_field_array_hash(struct ima_field_data *field_data, struct ima_template_entry *entry) { - struct crypto_shash *tfm; int rc; - tfm = ima_alloc_tfm(HASH_ALGO_SHA1); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - rc = ima_calc_field_array_hash_tfm(field_data, entry, tfm); - - ima_free_tfm(tfm); - + rc = ima_calc_field_array_hash_tfm(field_data, entry, ima_sha1_idx); return rc; } From 1ea973df6e2166d1a576cabe5d08925d3261ff9d Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 25 Mar 2020 11:53:50 +0100 Subject: [PATCH 006/427] ima: Calculate and extend PCR with digests in ima_template_entry This patch modifies ima_calc_field_array_hash() to calculate a template digest for each allocated PCR bank and SHA1. It also passes the tpm_digest array of the template entry to ima_pcr_extend() or in case of a violation, the pre-initialized digests array filled with 0xff. Padding with zeros is still done if the mapping between TPM algorithm ID and crypto ID is unknown. This patch calculates again the template digest when a measurement list is restored. 
Copying only the SHA1 digest (due to the limitation of the current measurement list format) is not sufficient, as hash collision detection will be done on the digest calculated with the IMA default hash algorithm. Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_crypto.c | 29 +++++++++++++++++++++++++- security/integrity/ima/ima_queue.c | 30 ++++++++++++++++----------- security/integrity/ima/ima_template.c | 14 +++++++++++-- 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index ab1c05ad1314..a94972d3f929 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -619,9 +619,36 @@ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data, int ima_calc_field_array_hash(struct ima_field_data *field_data, struct ima_template_entry *entry) { - int rc; + u16 alg_id; + int rc, i; rc = ima_calc_field_array_hash_tfm(field_data, entry, ima_sha1_idx); + if (rc) + return rc; + + entry->digests[ima_sha1_idx].alg_id = TPM_ALG_SHA1; + + for (i = 0; i < NR_BANKS(ima_tpm_chip) + ima_extra_slots; i++) { + if (i == ima_sha1_idx) + continue; + + if (i < NR_BANKS(ima_tpm_chip)) { + alg_id = ima_tpm_chip->allocated_banks[i].alg_id; + entry->digests[i].alg_id = alg_id; + } + + /* for unmapped TPM algorithms digest is still a padded SHA1 */ + if (!ima_algo_array[i].tfm) { + memcpy(entry->digests[i].digest, + entry->digests[ima_sha1_idx].digest, + TPM_DIGEST_SIZE); + continue; + } + + rc = ima_calc_field_array_hash_tfm(field_data, entry, i); + if (rc) + return rc; + } return rc; } diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c index 49db71c200b4..82a9ca43b989 100644 --- a/security/integrity/ima/ima_queue.c +++ b/security/integrity/ima/ima_queue.c @@ -135,18 +135,14 @@ unsigned long ima_get_binary_runtime_size(void) return binary_runtime_size + sizeof(struct ima_kexec_hdr); }; 
-static int ima_pcr_extend(const u8 *hash, int pcr) +static int ima_pcr_extend(struct tpm_digest *digests_arg, int pcr) { int result = 0; - int i; if (!ima_tpm_chip) return result; - for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++) - memcpy(digests[i].digest, hash, TPM_DIGEST_SIZE); - - result = tpm_pcr_extend(ima_tpm_chip, pcr, digests); + result = tpm_pcr_extend(ima_tpm_chip, pcr, digests_arg); if (result != 0) pr_err("Error Communicating to TPM chip, result: %d\n", result); return result; @@ -164,7 +160,8 @@ int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename) { - u8 digest[TPM_DIGEST_SIZE]; + u8 *digest = entry->digests[ima_sha1_idx].digest; + struct tpm_digest *digests_arg = entry->digests; const char *audit_cause = "hash_added"; char tpm_audit_cause[AUDIT_CAUSE_LEN_MAX]; int audit_info = 1; @@ -172,8 +169,6 @@ int ima_add_template_entry(struct ima_template_entry *entry, int violation, mutex_lock(&ima_extend_list_mutex); if (!violation) { - memcpy(digest, entry->digests[ima_sha1_idx].digest, - sizeof(digest)); if (ima_lookup_digest_entry(digest, entry->pcr)) { audit_cause = "hash_exists"; result = -EEXIST; @@ -189,9 +184,9 @@ int ima_add_template_entry(struct ima_template_entry *entry, int violation, } if (violation) /* invalidate pcr */ - memset(digest, 0xff, sizeof(digest)); + digests_arg = digests; - tpmresult = ima_pcr_extend(digest, entry->pcr); + tpmresult = ima_pcr_extend(digests_arg, entry->pcr); if (tpmresult != 0) { snprintf(tpm_audit_cause, AUDIT_CAUSE_LEN_MAX, "TPM_error(%d)", tpmresult); @@ -217,6 +212,8 @@ int ima_restore_measurement_entry(struct ima_template_entry *entry) int __init ima_init_digests(void) { + u16 digest_size; + u16 crypto_id; int i; if (!ima_tpm_chip) @@ -227,8 +224,17 @@ int __init ima_init_digests(void) if (!digests) return -ENOMEM; - for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++) + for (i = 0; i < 
ima_tpm_chip->nr_allocated_banks; i++) { digests[i].alg_id = ima_tpm_chip->allocated_banks[i].alg_id; + digest_size = ima_tpm_chip->allocated_banks[i].digest_size; + crypto_id = ima_tpm_chip->allocated_banks[i].crypto_id; + + /* for unmapped TPM algorithms digest is still a padded SHA1 */ + if (crypto_id == HASH_ALGO__LAST) + digest_size = SHA1_DIGEST_SIZE; + + memset(digests[i].digest, 0xff, digest_size); + } return 0; } diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index de84252e65e9..5a2def40a733 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -357,6 +357,7 @@ static int ima_restore_template_data(struct ima_template_desc *template_desc, int ima_restore_measurement_list(loff_t size, void *buf) { char template_name[MAX_TEMPLATE_NAME_LEN]; + unsigned char zero[TPM_DIGEST_SIZE] = { 0 }; struct ima_kexec_hdr *khdr = buf; struct ima_field_data hdr[HDR__LAST] = { @@ -456,8 +457,17 @@ int ima_restore_measurement_list(loff_t size, void *buf) if (ret < 0) break; - memcpy(entry->digests[ima_sha1_idx].digest, - hdr[HDR_DIGEST].data, hdr[HDR_DIGEST].len); + if (memcmp(hdr[HDR_DIGEST].data, zero, sizeof(zero))) { + ret = ima_calc_field_array_hash( + &entry->template_data[0], + entry); + if (ret < 0) { + pr_err("cannot calculate template digest\n"); + ret = -EINVAL; + break; + } + } + entry->pcr = !ima_canonical_fmt ? *(hdr[HDR_PCR].data) : le32_to_cpu(*(hdr[HDR_PCR].data)); ret = ima_restore_measurement_entry(entry); From 2592677c0486e64a08e0b930a7dfa6fbf77e6fc1 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 25 Mar 2020 11:54:24 +0100 Subject: [PATCH 007/427] ima: Use ima_hash_algo for collision detection in the measurement list Before calculating a digest for each PCR bank, collisions were detected with a SHA1 digest. This patch includes ima_hash_algo among the algorithms used to calculate the template digest and checks collisions on that digest. 
The position in the measurement entry array of the template digest calculated with the IMA default hash algorithm is stored in the ima_hash_algo_idx global variable and is determined at IMA initialization time. Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima.h | 1 + security/integrity/ima/ima_crypto.c | 19 ++++++++++++++++++- security/integrity/ima/ima_queue.c | 8 ++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 2a7ed68e6414..467dfdbea25c 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -53,6 +53,7 @@ extern int ima_policy_flag; /* set during initialization */ extern int ima_hash_algo; extern int ima_sha1_idx __ro_after_init; +extern int ima_hash_algo_idx __ro_after_init; extern int ima_extra_slots __ro_after_init; extern int ima_appraise; extern struct tpm_chip *ima_tpm_chip; diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index a94972d3f929..5201f5ec2ce4 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -63,6 +63,7 @@ struct ima_algo_desc { }; int ima_sha1_idx __ro_after_init; +int ima_hash_algo_idx __ro_after_init; /* * Additional number of slots reserved, as needed, for SHA1 * and IMA default algo. 
@@ -122,15 +123,25 @@ int __init ima_init_crypto(void) return rc; ima_sha1_idx = -1; + ima_hash_algo_idx = -1; for (i = 0; i < NR_BANKS(ima_tpm_chip); i++) { algo = ima_tpm_chip->allocated_banks[i].crypto_id; if (algo == HASH_ALGO_SHA1) ima_sha1_idx = i; + + if (algo == ima_hash_algo) + ima_hash_algo_idx = i; } - if (ima_sha1_idx < 0) + if (ima_sha1_idx < 0) { ima_sha1_idx = NR_BANKS(ima_tpm_chip) + ima_extra_slots++; + if (ima_hash_algo == HASH_ALGO_SHA1) + ima_hash_algo_idx = ima_sha1_idx; + } + + if (ima_hash_algo_idx < 0) + ima_hash_algo_idx = NR_BANKS(ima_tpm_chip) + ima_extra_slots++; ima_algo_array = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, sizeof(*ima_algo_array), GFP_KERNEL); @@ -179,6 +190,12 @@ int __init ima_init_crypto(void) ima_algo_array[ima_sha1_idx].algo = HASH_ALGO_SHA1; } + if (ima_hash_algo_idx >= NR_BANKS(ima_tpm_chip) && + ima_hash_algo_idx != ima_sha1_idx) { + ima_algo_array[ima_hash_algo_idx].tfm = ima_shash_tfm; + ima_algo_array[ima_hash_algo_idx].algo = ima_hash_algo; + } + return 0; out_array: for (i = 0; i < NR_BANKS(ima_tpm_chip) + ima_extra_slots; i++) { diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c index 82a9ca43b989..fb4ec270f620 100644 --- a/security/integrity/ima/ima_queue.c +++ b/security/integrity/ima/ima_queue.c @@ -55,8 +55,8 @@ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value, key = ima_hash_key(digest_value); rcu_read_lock(); hlist_for_each_entry_rcu(qe, &ima_htable.queue[key], hnext) { - rc = memcmp(qe->entry->digests[ima_sha1_idx].digest, - digest_value, TPM_DIGEST_SIZE); + rc = memcmp(qe->entry->digests[ima_hash_algo_idx].digest, + digest_value, hash_digest_size[ima_hash_algo]); if ((rc == 0) && (qe->entry->pcr == pcr)) { ret = qe; break; @@ -108,7 +108,7 @@ static int ima_add_digest_entry(struct ima_template_entry *entry, atomic_long_inc(&ima_htable.len); if (update_htable) { - key = ima_hash_key(entry->digests[ima_sha1_idx].digest); + key = 
ima_hash_key(entry->digests[ima_hash_algo_idx].digest); hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); } @@ -160,7 +160,7 @@ int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename) { - u8 *digest = entry->digests[ima_sha1_idx].digest; + u8 *digest = entry->digests[ima_hash_algo_idx].digest; struct tpm_digest *digests_arg = entry->digests; const char *audit_cause = "hash_added"; char tpm_audit_cause[AUDIT_CAUSE_LEN_MAX]; From 05f099a7d0a73114c6eb3e6a359ea97563b47031 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 17 Apr 2020 15:36:05 +0800 Subject: [PATCH 008/427] dma-debug: make __dma_entry_alloc_check_leak() static Fix the following sparse warning: kernel/dma/debug.c:659:6: warning: symbol '__dma_entry_alloc_check_leak' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 9e1777c81f55..36c962a86bf2 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -656,7 +656,7 @@ static struct dma_debug_entry *__dma_entry_alloc(void) return entry; } -void __dma_entry_alloc_check_leak(void) +static void __dma_entry_alloc_check_leak(void) { u32 tmp = nr_total_entries % nr_prealloc_entries; From e860c299ac0d738b44ff91693f11e63080a29698 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:52 -0700 Subject: [PATCH 009/427] dma-remap: separate DMA atomic pools from direct remap code DMA atomic pools will be needed beyond only CONFIG_DMA_DIRECT_REMAP so separate them out into their own file. This also adds a new Kconfig option that can be subsequently used for options, such as CONFIG_AMD_MEM_ENCRYPT, that will utilize the coherent pools but do not have a dependency on direct remapping. For this patch alone, there is no functional change introduced. 
Reviewed-by: Christoph Hellwig Signed-off-by: David Rientjes [hch: fixup copyrights and remove unused includes] Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 6 ++- kernel/dma/Makefile | 1 + kernel/dma/pool.c | 123 ++++++++++++++++++++++++++++++++++++++++++++ kernel/dma/remap.c | 121 +------------------------------------------ 4 files changed, 130 insertions(+), 121 deletions(-) create mode 100644 kernel/dma/pool.c diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 4c103a24e380..d006668c0027 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -79,10 +79,14 @@ config DMA_REMAP select DMA_NONCOHERENT_MMAP bool -config DMA_DIRECT_REMAP +config DMA_COHERENT_POOL bool select DMA_REMAP +config DMA_DIRECT_REMAP + bool + select DMA_COHERENT_POOL + config DMA_CMA bool "DMA Contiguous Memory Allocator" depends on HAVE_DMA_CONTIGUOUS && CMA diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index d237cf3dc181..370f63344e9c 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o +obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_DMA_REMAP) += remap.o diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c new file mode 100644 index 000000000000..3df5d9d39922 --- /dev/null +++ b/kernel/dma/pool.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012 ARM Ltd. 
+ * Copyright (C) 2020 Google LLC + */ +#include +#include +#include +#include +#include +#include + +static struct gen_pool *atomic_pool __ro_after_init; + +#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K +static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; + +static int __init early_coherent_pool(char *p) +{ + atomic_pool_size = memparse(p, &p); + return 0; +} +early_param("coherent_pool", early_coherent_pool); + +static gfp_t dma_atomic_pool_gfp(void) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + return GFP_DMA; + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + return GFP_DMA32; + return GFP_KERNEL; +} + +static int __init dma_atomic_pool_init(void) +{ + unsigned int pool_size_order = get_order(atomic_pool_size); + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; + struct page *page; + void *addr; + int ret; + + if (dev_get_cma_area(NULL)) + page = dma_alloc_from_contiguous(NULL, nr_pages, + pool_size_order, false); + else + page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); + if (!page) + goto out; + + arch_dma_prep_coherent(page, atomic_pool_size); + + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); + if (!atomic_pool) + goto free_page; + + addr = dma_common_contiguous_remap(page, atomic_pool_size, + pgprot_dmacoherent(PAGE_KERNEL), + __builtin_return_address(0)); + if (!addr) + goto destroy_genpool; + + ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, + page_to_phys(page), atomic_pool_size, -1); + if (ret) + goto remove_mapping; + gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); + + pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", + atomic_pool_size / 1024); + return 0; + +remove_mapping: + dma_common_free_remap(addr, atomic_pool_size); +destroy_genpool: + gen_pool_destroy(atomic_pool); + atomic_pool = NULL; +free_page: + if (!dma_release_from_contiguous(NULL, page, nr_pages)) + __free_pages(page, pool_size_order); +out: + pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent 
allocation\n", + atomic_pool_size / 1024); + return -ENOMEM; +} +postcore_initcall(dma_atomic_pool_init); + +bool dma_in_atomic_pool(void *start, size_t size) +{ + if (unlikely(!atomic_pool)) + return false; + + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); +} + +void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) +{ + unsigned long val; + void *ptr = NULL; + + if (!atomic_pool) { + WARN(1, "coherent pool not initialised!\n"); + return NULL; + } + + val = gen_pool_alloc(atomic_pool, size); + if (val) { + phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); + + *ret_page = pfn_to_page(__phys_to_pfn(phys)); + ptr = (void *)val; + memset(ptr, 0, size); + } + + return ptr; +} + +bool dma_free_from_pool(void *start, size_t size) +{ + if (!dma_in_atomic_pool(start, size)) + return false; + gen_pool_free(atomic_pool, (unsigned long)start, size); + return true; +} diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d14cbc83986a..f7b402849891 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -1,13 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2012 ARM Ltd. 
* Copyright (c) 2014 The Linux Foundation */ -#include -#include -#include -#include -#include +#include #include #include @@ -97,117 +92,3 @@ void dma_common_free_remap(void *cpu_addr, size_t size) unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); vunmap(cpu_addr); } - -#ifdef CONFIG_DMA_DIRECT_REMAP -static struct gen_pool *atomic_pool __ro_after_init; - -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; - -static int __init early_coherent_pool(char *p) -{ - atomic_pool_size = memparse(p, &p); - return 0; -} -early_param("coherent_pool", early_coherent_pool); - -static gfp_t dma_atomic_pool_gfp(void) -{ - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; - struct page *page; - void *addr; - int ret; - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); - else - page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); - if (!page) - goto out; - - arch_dma_prep_coherent(page, atomic_pool_size); - - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) - goto free_page; - - addr = dma_common_contiguous_remap(page, atomic_pool_size, - pgprot_dmacoherent(PAGE_KERNEL), - __builtin_return_address(0)); - if (!addr) - goto destroy_genpool; - - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); - if (ret) - goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); - - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); - return 0; - -remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); -destroy_genpool: - 
gen_pool_destroy(atomic_pool); - atomic_pool = NULL; -free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); -out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); - return -ENOMEM; -} -postcore_initcall(dma_atomic_pool_init); - -bool dma_in_atomic_pool(void *start, size_t size) -{ - if (unlikely(!atomic_pool)) - return false; - - return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); -} - -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) -{ - unsigned long val; - void *ptr = NULL; - - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); - return NULL; - } - - val = gen_pool_alloc(atomic_pool, size); - if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); - - *ret_page = pfn_to_page(__phys_to_pfn(phys)); - ptr = (void *)val; - memset(ptr, 0, size); - } - - return ptr; -} - -bool dma_free_from_pool(void *start, size_t size) -{ - if (!dma_in_atomic_pool(start, size)) - return false; - gen_pool_free(atomic_pool, (unsigned long)start, size); - return true; -} -#endif /* CONFIG_DMA_DIRECT_REMAP */ From c84dc6e68a1d2464e050d9694be4e4ff49e32bfd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:55 -0700 Subject: [PATCH 010/427] dma-pool: add additional coherent pools to map to gfp mask The single atomic pool is allocated from the lowest zone possible since it is guaranteed to be applicable for any DMA allocation. Devices may allocate through the DMA API but not have a strict reliance on GFP_DMA memory. Since the atomic pool will be used for all non-blockable allocations, returning all memory from ZONE_DMA may unnecessarily deplete the zone. Provision for multiple atomic pools that will map to the optimal gfp mask of the device. When allocating non-blockable memory, determine the optimal gfp mask of the device and use the appropriate atomic pool. 
The coherent DMA mask will remain the same between allocation and free and, thus, memory will be freed to the same atomic pool it was allocated from. __dma_atomic_pool_init() will be changed to return struct gen_pool * later once dynamic expansion is added. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- drivers/iommu/dma-iommu.c | 5 +- include/linux/dma-direct.h | 2 + include/linux/dma-mapping.h | 6 +- kernel/dma/direct.c | 12 ++-- kernel/dma/pool.c | 120 +++++++++++++++++++++++------------- 5 files changed, 91 insertions(+), 54 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index ba128d1cdaee..4959f5df21bd 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -952,7 +952,7 @@ static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) /* Non-coherent atomic allocation? Easy */ if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(cpu_addr, alloc_size)) + dma_free_from_pool(dev, cpu_addr, alloc_size)) return; if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) { @@ -1035,7 +1035,8 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !gfpflags_allow_blocking(gfp) && !coherent) - cpu_addr = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + cpu_addr = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, + gfp); else cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs); if (!cpu_addr) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 24b8684aa21d..136f984df0d9 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -67,6 +67,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, } u64 dma_direct_get_required_mask(struct device *dev); +gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_mask); void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned 
long attrs); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 330ad58fbf4d..b43116a6405d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -630,9 +630,9 @@ void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller); void dma_common_free_remap(void *cpu_addr, size_t size); -bool dma_in_atomic_pool(void *start, size_t size); -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags); -bool dma_free_from_pool(void *start, size_t size); +void *dma_alloc_from_pool(struct device *dev, size_t size, + struct page **ret_page, gfp_t flags); +bool dma_free_from_pool(struct device *dev, void *start, size_t size); int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8f4bbdaf965e..a834ee22f8ff 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -45,8 +45,8 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_limit) +gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_limit) { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); @@ -89,8 +89,8 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; - gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_limit); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_limit); page = dma_alloc_contiguous(dev, alloc_size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, alloc_size); @@ -128,7 +128,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if 
(IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs) && !gfpflags_allow_blocking(gfp)) { - ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); if (!ret) return NULL; goto done; @@ -212,7 +212,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, } if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; if (force_dma_unencrypted(dev)) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 3df5d9d39922..db4f89ac5f5f 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -10,7 +10,9 @@ #include #include -static struct gen_pool *atomic_pool __ro_after_init; +static struct gen_pool *atomic_pool_dma __ro_after_init; +static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static struct gen_pool *atomic_pool_kernel __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -22,89 +24,119 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); -static gfp_t dma_atomic_pool_gfp(void) +static int __init __dma_atomic_pool_init(struct gen_pool **pool, + size_t pool_size, gfp_t gfp) { - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; + const unsigned int order = get_order(pool_size); + const unsigned long nr_pages = pool_size >> PAGE_SHIFT; struct page *page; void *addr; int ret; if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); + page = dma_alloc_from_contiguous(NULL, nr_pages, order, false); else - page = 
alloc_pages(dma_atomic_pool_gfp(), pool_size_order); + page = alloc_pages(gfp, order); if (!page) goto out; - arch_dma_prep_coherent(page, atomic_pool_size); + arch_dma_prep_coherent(page, pool_size); - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) + *pool = gen_pool_create(PAGE_SHIFT, -1); + if (!*pool) goto free_page; - addr = dma_common_contiguous_remap(page, atomic_pool_size, + addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) goto destroy_genpool; - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); + ret = gen_pool_add_virt(*pool, (unsigned long)addr, page_to_phys(page), + pool_size, -1); if (ret) goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); + gen_pool_set_algo(*pool, gen_pool_first_fit_order_align, NULL); - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + pool_size >> 10, &gfp); return 0; remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); + dma_common_free_remap(addr, pool_size); destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; + gen_pool_destroy(*pool); + *pool = NULL; free_page: if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); + __free_pages(page, order); out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); + pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", + pool_size >> 10, &gfp); return -ENOMEM; } + +static int __init dma_atomic_pool_init(void) +{ + int ret = 0; + int err; + + ret = __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, + GFP_KERNEL); + if (IS_ENABLED(CONFIG_ZONE_DMA)) { + err = __dma_atomic_pool_init(&atomic_pool_dma, + atomic_pool_size, GFP_DMA); 
+ if (!ret && err) + ret = err; + } + if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + err = __dma_atomic_pool_init(&atomic_pool_dma32, + atomic_pool_size, GFP_DMA32); + if (!ret && err) + ret = err; + } + return ret; +} postcore_initcall(dma_atomic_pool_init); -bool dma_in_atomic_pool(void *start, size_t size) +static inline struct gen_pool *dev_to_pool(struct device *dev) { - if (unlikely(!atomic_pool)) - return false; + u64 phys_mask; + gfp_t gfp; - return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); + gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA) + return atomic_pool_dma; + if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32) + return atomic_pool_dma32; + return atomic_pool_kernel; } -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) +static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size) { + struct gen_pool *pool = dev_to_pool(dev); + + if (unlikely(!pool)) + return false; + return gen_pool_has_addr(pool, (unsigned long)start, size); +} + +void *dma_alloc_from_pool(struct device *dev, size_t size, + struct page **ret_page, gfp_t flags) +{ + struct gen_pool *pool = dev_to_pool(dev); unsigned long val; void *ptr = NULL; - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); + if (!pool) { + WARN(1, "%pGg atomic pool not initialised!\n", &flags); return NULL; } - val = gen_pool_alloc(atomic_pool, size); + val = gen_pool_alloc(pool, size); if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); + phys_addr_t phys = gen_pool_virt_to_phys(pool, val); *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; @@ -114,10 +146,12 @@ void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) return ptr; } -bool dma_free_from_pool(void *start, size_t size) +bool dma_free_from_pool(struct device *dev, void *start, size_t size) { - if (!dma_in_atomic_pool(start, size)) + struct 
gen_pool *pool = dev_to_pool(dev); + + if (!dma_in_atomic_pool(dev, start, size)) return false; - gen_pool_free(atomic_pool, (unsigned long)start, size); + gen_pool_free(pool, (unsigned long)start, size); return true; } From a6cec3fdbd72e9f975f2db2bbfd1847831478195 Mon Sep 17 00:00:00 2001 From: Ani Sinha Date: Tue, 21 Apr 2020 08:57:50 +0530 Subject: [PATCH 011/427] PCI: pciehp: Remove unused EMI() and HP_SUPR_RM() macros EMI() and HP_SUPR_RM() are unused, so remove them. Link: https://lore.kernel.org/r/1587439673-39652-1-git-send-email-ani@anisinha.ca Signed-off-by: Ani Sinha Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- drivers/pci/hotplug/pciehp.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index ae44f46d1bf3..4fd200d8b0a9 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -148,8 +148,6 @@ struct controller { #define MRL_SENS(ctrl) ((ctrl)->slot_cap & PCI_EXP_SLTCAP_MRLSP) #define ATTN_LED(ctrl) ((ctrl)->slot_cap & PCI_EXP_SLTCAP_AIP) #define PWR_LED(ctrl) ((ctrl)->slot_cap & PCI_EXP_SLTCAP_PIP) -#define HP_SUPR_RM(ctrl) ((ctrl)->slot_cap & PCI_EXP_SLTCAP_HPS) -#define EMI(ctrl) ((ctrl)->slot_cap & PCI_EXP_SLTCAP_EIP) #define NO_CMD_CMPL(ctrl) ((ctrl)->slot_cap & PCI_EXP_SLTCAP_NCCS) #define PSN(ctrl) (((ctrl)->slot_cap & PCI_EXP_SLTCAP_PSN) >> 19) From 7d5b10fcb81e511ddf79c1c6b7f6efb282f80680 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 6 Apr 2020 15:42:01 -0400 Subject: [PATCH 012/427] PCI/P2PDMA: Add AMD Zen Raven and Renoir Root Ports to whitelist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the hardware architect, pre-Zen parts support p2p writes and Zen parts support both p2p reads and writes. Add entries for Zen parts Raven (0x15d0) and Renoir (0x1630). 
Link: https://lore.kernel.org/r/20200406194201.846411-1-alexander.deucher@amd.com Signed-off-by: Alex Deucher Signed-off-by: Bjorn Helgaas Acked-by: Christian König Acked-by: Huang Rui --- drivers/pci/p2pdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index b73b10bce0df..e8e444eeb1cd 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -282,6 +282,8 @@ static const struct pci_p2pdma_whitelist_entry { } pci_p2pdma_whitelist[] = { /* AMD ZEN */ {PCI_VENDOR_ID_AMD, 0x1450, 0}, + {PCI_VENDOR_ID_AMD, 0x15d0, 0}, + {PCI_VENDOR_ID_AMD, 0x1630, 0}, /* Intel Xeon E5/Core i7 */ {PCI_VENDOR_ID_INTEL, 0x3c00, REQ_SAME_HOST_BRIDGE}, From c3aaf086701d05a82c8156ee8620af41e5a7d6fe Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 7 Apr 2020 18:23:15 -0500 Subject: [PATCH 013/427] PCI/PM: Call .bridge_d3() hook only if non-NULL 26ad34d510a8 ("PCI / ACPI: Whitelist D3 for more PCIe hotplug ports") added the struct pci_platform_pm_ops.bridge_d3() function pointer and platform_pci_bridge_d3() to use it. The .bridge_d3() op is implemented by acpi_pci_platform_pm, but not by mid_pci_platform_pm. We don't expect platform_pci_bridge_d3() to be called on Intel MID platforms, but nothing in the code itself would prevent that. Check the .bridge_d3() pointer for NULL before calling it. Fixes: 26ad34d510a8 ("PCI / ACPI: Whitelist D3 for more PCIe hotplug ports") Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- drivers/pci/pci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 595fcf59843f..dfa7ec008963 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -868,7 +868,9 @@ static inline bool platform_pci_need_resume(struct pci_dev *dev) static inline bool platform_pci_bridge_d3(struct pci_dev *dev) { - return pci_platform_pm ? 
pci_platform_pm->bridge_d3(dev) : false; + if (pci_platform_pm && pci_platform_pm->bridge_d3) + return pci_platform_pm->bridge_d3(dev); + return false; } /** From 94b84ac1979b3b70b80a6d499d0c397057a6c7b3 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 8 Apr 2020 18:10:54 -0500 Subject: [PATCH 014/427] PCI: dra7xx: Don't select CONFIG_PCI_DRA7XX_HOST by default Drivers should not be selected by default because that bloats the kernel for people who don't need them. Enable CONFIG_PCI_DRA7XX_HOST by default only if SOC_DRA7XX. Signed-off-by: Bjorn Helgaas Cc: Kishon Vijay Abraham I Cc: linux-omap@vger.kernel.org --- drivers/pci/controller/dwc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 03dcaf65d159..55d93b4bac58 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -26,7 +26,7 @@ config PCI_DRA7XX_HOST depends on OF && HAS_IOMEM && TI_PIPE3 select PCIE_DW_HOST select PCI_DRA7XX - default y + default y if SOC_DRA7XX help Enables support for the PCIe controller in the DRA7xx SoC to work in host mode. There are two instances of PCIe controller in DRA7xx. From fbedda4e4dc5a96d613dc056a82c3f14942f7998 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 8 Apr 2020 18:08:53 -0500 Subject: [PATCH 015/427] PCI: keystone: Don't select CONFIG_PCI_KEYSTONE_HOST by default Drivers should not be selected by default because that bloats the kernel for people who don't need them. Remove the "default y" for CONFIG_PCI_KEYSTONE_HOST. 
Signed-off-by: Bjorn Helgaas Cc: Murali Karicheri Cc: linux-arm-kernel@lists.infradead.org --- drivers/pci/controller/dwc/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 55d93b4bac58..ce13fd62d161 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -111,7 +111,6 @@ config PCI_KEYSTONE_HOST depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST select PCI_KEYSTONE - default y help Enables support for the PCIe controller in the Keystone SoC to work in host mode. The PCI controller on Keystone is based on From 8c8ff55b4da7c614ca159a7c04099d17b37ada25 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 8 Apr 2020 18:13:34 -0500 Subject: [PATCH 016/427] PCI/AER: Don't select CONFIG_PCIEAER by default PCIe Advanced Error Reporting (AER) is optional and there's no need for it to be selected by default. Remove the "default y" for CONFIG_PCIEAER. Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Cc: Russell Currey Cc: Sam Bobroff Cc: Oliver O'Halloran --- drivers/pci/pcie/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig index 66386811cfde..9cd31331aee9 100644 --- a/drivers/pci/pcie/Kconfig +++ b/drivers/pci/pcie/Kconfig @@ -25,7 +25,6 @@ config PCIEAER bool "PCI Express Advanced Error Reporting support" depends on PCIEPORTBUS select RAS - default y help This enables PCI Express Root Port Advanced Error Reporting (AER) driver support. Error reporting messages sent to Root From adc9fbcd7d04a711b765e8f7f5c2e07cbbac0f20 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 16 Apr 2020 16:51:14 -0500 Subject: [PATCH 017/427] PCI: Use of_node_name_eq() for node name comparisons Convert string compares of DT node names to use of_node_name_eq() helper instead. This removes direct access to the node name pointer. 
Link: https://lore.kernel.org/r/20200416215114.7715-1-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman --- drivers/pci/hotplug/rpaphp_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index 6504869efabc..9887c9de08c3 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -435,7 +435,7 @@ static int rpaphp_drc_add_slot(struct device_node *dn) */ int rpaphp_add_slot(struct device_node *dn) { - if (!dn->name || strcmp(dn->name, "pci")) + if (!of_node_name_eq(dn, "pci")) return 0; if (of_find_property(dn, "ibm,drc-info", NULL)) From af03958da0678c3162ae534829cabf9f67f0d950 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 15 Apr 2020 17:38:32 -0700 Subject: [PATCH 018/427] PCI/EDR: Log only ACPI_NOTIFY_DISCONNECT_RECOVER events Previously we logged *all* ACPI SYSTEM-level events, which may include lots of non-EDR events. Move the message so we only log those related to EDR. 
Link: https://lore.kernel.org/r/01afb4e01efbe455de0c445bef6cf3ffc59340d2.1586996350.git.sathyanarayanan.kuppuswamy@linux.intel.com [bhelgaas: drop the pci_dbg() of all events since ACPI can log those already] Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/edr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pcie/edr.c b/drivers/pci/pcie/edr.c index 594622a6cb16..a6b9b479b97a 100644 --- a/drivers/pci/pcie/edr.c +++ b/drivers/pci/pcie/edr.c @@ -148,11 +148,11 @@ static void edr_handle_event(acpi_handle handle, u32 event, void *data) pci_ers_result_t estate = PCI_ERS_RESULT_DISCONNECT; u16 status; - pci_info(pdev, "ACPI event %#x received\n", event); - if (event != ACPI_NOTIFY_DISCONNECT_RECOVER) return; + pci_info(pdev, "EDR event received\n"); + /* Locate the port which issued EDR event */ edev = acpi_dpc_port_get(pdev); if (!edev) { From 54adadf9b08571fb8b11dc9d0d3a2ddd39825efd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 20 Apr 2020 12:09:58 +0200 Subject: [PATCH 019/427] dma-pool: dynamically expanding atomic pools When an atomic pool becomes fully depleted because it is now relied upon for all non-blocking allocations through the DMA API, allow background expansion of each pool by a kworker. When an atomic pool has less than the default size of memory left, kick off a kworker to dynamically expand the pool in the background. The pool is doubled in size, up to MAX_ORDER-1. If memory cannot be allocated at the requested order, smaller allocation(s) are attempted. This allows the default size to be kept quite low when one or more of the atomic pools is not used. Allocations for lowmem should also use GFP_KERNEL for the benefits of reclaim, so use GFP_KERNEL | GFP_DMA and GFP_KERNEL | GFP_DMA32 for lowmem allocations. This also allows __dma_atomic_pool_init() to return a pointer to the pool to make initialization cleaner. 
Also switch over some node ids to the more appropriate NUMA_NO_NODE. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 122 +++++++++++++++++++++++++++++++--------------- 1 file changed, 84 insertions(+), 38 deletions(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index db4f89ac5f5f..ffe866c2c034 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -9,13 +9,17 @@ #include #include #include +#include static struct gen_pool *atomic_pool_dma __ro_after_init; static struct gen_pool *atomic_pool_dma32 __ro_after_init; static struct gen_pool *atomic_pool_kernel __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; +static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; + +/* Dynamic background expansion when the atomic pool is near capacity */ +static struct work_struct atomic_pool_work; static int __init early_coherent_pool(char *p) { @@ -24,76 +28,116 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); -static int __init __dma_atomic_pool_init(struct gen_pool **pool, - size_t pool_size, gfp_t gfp) +static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, + gfp_t gfp) { - const unsigned int order = get_order(pool_size); - const unsigned long nr_pages = pool_size >> PAGE_SHIFT; + unsigned int order; struct page *page; void *addr; - int ret; + int ret = -ENOMEM; - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, order, false); - else - page = alloc_pages(gfp, order); + /* Cannot allocate larger than MAX_ORDER-1 */ + order = min(get_order(pool_size), MAX_ORDER-1); + + do { + pool_size = 1 << (PAGE_SHIFT + order); + + if (dev_get_cma_area(NULL)) + page = dma_alloc_from_contiguous(NULL, 1 << order, + order, false); + else + page = alloc_pages(gfp, order); + } while (!page && order-- > 0); if (!page) goto out; arch_dma_prep_coherent(page, 
pool_size); - *pool = gen_pool_create(PAGE_SHIFT, -1); - if (!*pool) - goto free_page; - addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) - goto destroy_genpool; + goto free_page; - ret = gen_pool_add_virt(*pool, (unsigned long)addr, page_to_phys(page), - pool_size, -1); + ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page), + pool_size, NUMA_NO_NODE); if (ret) goto remove_mapping; - gen_pool_set_algo(*pool, gen_pool_first_fit_order_align, NULL); - pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", - pool_size >> 10, &gfp); return 0; remove_mapping: dma_common_free_remap(addr, pool_size); -destroy_genpool: - gen_pool_destroy(*pool); - *pool = NULL; free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) + if (!dma_release_from_contiguous(NULL, page, 1 << order)) __free_pages(page, order); out: - pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", - pool_size >> 10, &gfp); - return -ENOMEM; + return ret; +} + +static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp) +{ + if (pool && gen_pool_avail(pool) < atomic_pool_size) + atomic_pool_expand(pool, gen_pool_size(pool), gfp); +} + +static void atomic_pool_work_fn(struct work_struct *work) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + atomic_pool_resize(atomic_pool_dma, + GFP_KERNEL | GFP_DMA); + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + atomic_pool_resize(atomic_pool_dma32, + GFP_KERNEL | GFP_DMA32); + atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL); +} + +static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size, + gfp_t gfp) +{ + struct gen_pool *pool; + int ret; + + pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE); + if (!pool) + return NULL; + + gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL); + + ret = atomic_pool_expand(pool, pool_size, gfp); + if (ret) { + gen_pool_destroy(pool); + pr_err("DMA: failed to allocate %zu KiB %pGg pool 
for atomic allocation\n", + pool_size >> 10, &gfp); + return NULL; + } + + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + gen_pool_size(pool) >> 10, &gfp); + return pool; } static int __init dma_atomic_pool_init(void) { int ret = 0; - int err; - ret = __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, - GFP_KERNEL); + INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); + + atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL); + if (!atomic_pool_kernel) + ret = -ENOMEM; if (IS_ENABLED(CONFIG_ZONE_DMA)) { - err = __dma_atomic_pool_init(&atomic_pool_dma, - atomic_pool_size, GFP_DMA); - if (!ret && err) - ret = err; + atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA); + if (!atomic_pool_dma) + ret = -ENOMEM; } if (IS_ENABLED(CONFIG_ZONE_DMA32)) { - err = __dma_atomic_pool_init(&atomic_pool_dma32, - atomic_pool_size, GFP_DMA32); - if (!ret && err) - ret = err; + atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA32); + if (!atomic_pool_dma32) + ret = -ENOMEM; } return ret; } @@ -142,6 +186,8 @@ void *dma_alloc_from_pool(struct device *dev, size_t size, ptr = (void *)val; memset(ptr, 0, size); } + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); return ptr; } From 76a19940bd62a81148c303f3df6d0cee9ae4b509 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:58 -0700 Subject: [PATCH 020/427] dma-direct: atomic allocations must come from atomic coherent pools When a device requires unencrypted memory and the context does not allow blocking, memory must be returned from the atomic coherent pools. This avoids the remap when CONFIG_DMA_DIRECT_REMAP is not enabled and the config only requires CONFIG_DMA_COHERENT_POOL. This will be used for CONFIG_AMD_MEM_ENCRYPT in a subsequent patch. Keep all memory in these pools unencrypted. 
When set_memory_decrypted() fails, this prohibits the memory from being added. If adding memory to the genpool fails, and set_memory_encrypted() subsequently fails, there is no alternative other than leaking the memory. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 46 ++++++++++++++++++++++++++++++++++++++------- kernel/dma/pool.c | 27 +++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a834ee22f8ff..0a4881e59aa7 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -76,6 +76,39 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } +/* + * Decrypting memory is allowed to block, so if this device requires + * unencrypted memory it must come from atomic pools. + */ +static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp, + unsigned long attrs) +{ + if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return false; + if (gfpflags_allow_blocking(gfp)) + return false; + if (force_dma_unencrypted(dev)) + return true; + if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return false; + if (dma_alloc_need_uncached(dev, attrs)) + return true; + return false; +} + +static inline bool dma_should_free_from_pool(struct device *dev, + unsigned long attrs) +{ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return true; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) + return false; + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return true; + return false; +} + struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { @@ -125,9 +158,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs) && - !gfpflags_allow_blocking(gfp)) { + if 
(dma_should_alloc_from_pool(dev, gfp, attrs)) { ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); if (!ret) return NULL; @@ -204,6 +235,11 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (dma_should_free_from_pool(dev, attrs) && + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) + return; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ @@ -211,10 +247,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) - return; - if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index ffe866c2c034..c8d61b3a7bd6 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -53,22 +54,42 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, arch_dma_prep_coherent(page, pool_size); +#ifdef CONFIG_DMA_DIRECT_REMAP addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) goto free_page; - +#else + addr = page_to_virt(page); +#endif + /* + * Memory in the atomic DMA pools must be unencrypted, the pools do not + * shrink so no re-encryption occurs in dma_direct_free_pages(). 
+ */ + ret = set_memory_decrypted((unsigned long)page_to_virt(page), + 1 << order); + if (ret) + goto remove_mapping; ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page), pool_size, NUMA_NO_NODE); if (ret) - goto remove_mapping; + goto encrypt_mapping; return 0; +encrypt_mapping: + ret = set_memory_encrypted((unsigned long)page_to_virt(page), + 1 << order); + if (WARN_ON_ONCE(ret)) { + /* Decrypt succeeded but encrypt failed, purposely leak */ + goto out; + } remove_mapping: +#ifdef CONFIG_DMA_DIRECT_REMAP dma_common_free_remap(addr, pool_size); -free_page: +#endif +free_page: __maybe_unused if (!dma_release_from_contiguous(NULL, page, 1 << order)) __free_pages(page, order); out: From 2edc5bb3c5cc42131438460a50b7b16905c81c2a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:59 -0700 Subject: [PATCH 021/427] dma-pool: add pool sizes to debugfs The atomic DMA pools can dynamically expand based on non-blocking allocations that need to use it. Export the sizes of each of these pools, in bytes, through debugfs for measurement. Suggested-by: Christoph Hellwig Signed-off-by: David Rientjes [hch: remove the !CONFIG_DEBUG_FS stubs] Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index c8d61b3a7bd6..dde6de7f8e83 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -3,6 +3,7 @@ * Copyright (C) 2012 ARM Ltd. 
* Copyright (C) 2020 Google LLC */ +#include #include #include #include @@ -13,8 +14,11 @@ #include static struct gen_pool *atomic_pool_dma __ro_after_init; +static unsigned long pool_size_dma; static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static unsigned long pool_size_dma32; static struct gen_pool *atomic_pool_kernel __ro_after_init; +static unsigned long pool_size_kernel; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -29,6 +33,29 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); +static void __init dma_atomic_pool_debugfs_init(void) +{ + struct dentry *root; + + root = debugfs_create_dir("dma_pools", NULL); + if (IS_ERR_OR_NULL(root)) + return; + + debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma); + debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32); + debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel); +} + +static void dma_atomic_pool_size_add(gfp_t gfp, size_t size) +{ + if (gfp & __GFP_DMA) + pool_size_dma += size; + else if (gfp & __GFP_DMA32) + pool_size_dma32 += size; + else + pool_size_kernel += size; +} + static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, gfp_t gfp) { @@ -76,6 +103,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, if (ret) goto encrypt_mapping; + dma_atomic_pool_size_add(gfp, pool_size); return 0; encrypt_mapping: @@ -160,6 +188,8 @@ static int __init dma_atomic_pool_init(void) if (!atomic_pool_dma32) ret = -ENOMEM; } + + dma_atomic_pool_debugfs_init(); return ret; } postcore_initcall(dma_atomic_pool_init); From 82fef0ad811fb5976cf36ccc3d2c3bc0195dfb72 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:05:01 -0700 Subject: [PATCH 022/427] x86/mm: unencrypted non-blocking DMA allocations use coherent pools When CONFIG_AMD_MEM_ENCRYPT is enabled and a device requires unencrypted DMA, 
all non-blocking allocations must originate from the atomic DMA coherent pools. Select CONFIG_DMA_COHERENT_POOL for CONFIG_AMD_MEM_ENCRYPT. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d6104ea8af0..2bf2222819d3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1520,6 +1520,7 @@ config X86_CPA_STATISTICS config AMD_MEM_ENCRYPT bool "AMD Secure Memory Encryption (SME) support" depends on X86_64 && CPU_SUP_AMD + select DMA_COHERENT_POOL select DYNAMIC_PHYSICAL_MASK select ARCH_USE_MEMREMAP_PROT select ARCH_HAS_FORCE_DMA_UNENCRYPTED From 1d659236fb43c4d2b37af7a4309681e834e9ec9a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:05:02 -0700 Subject: [PATCH 023/427] dma-pool: scale the default DMA coherent pool size with memory capacity When AMD memory encryption is enabled, some devices may use more than 256KB/sec from the atomic pools. It would be more appropriate to scale the default size based on memory capacity unless the coherent_pool option is used on the kernel command line. This provides a slight optimization on initial expansion and is deemed appropriate due to the increased reliance on the atomic pools. Note that the default size of 128KB per pool will normally be larger than the single coherent pool implementation since there are now up to three coherent pools (DMA, DMA32, and kernel). Note that even prior to this patch, coherent_pool= for sizes larger than 1 << (PAGE_SHIFT + MAX_ORDER-1) can fail. With new dynamic expansion support, this would be trivially extensible to allow even larger initial sizes. 
Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index dde6de7f8e83..35bb51c31fff 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -20,8 +20,8 @@ static unsigned long pool_size_dma32; static struct gen_pool *atomic_pool_kernel __ro_after_init; static unsigned long pool_size_kernel; -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; +/* Size can be defined by the coherent_pool command line */ +static size_t atomic_pool_size; /* Dynamic background expansion when the atomic pool is near capacity */ static struct work_struct atomic_pool_work; @@ -170,6 +170,16 @@ static int __init dma_atomic_pool_init(void) { int ret = 0; + /* + * If coherent_pool was not used on the command line, default the pool + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. + */ + if (!atomic_pool_size) { + atomic_pool_size = max(totalram_pages() >> PAGE_SHIFT, 1UL) * + SZ_128K; + atomic_pool_size = min_t(size_t, atomic_pool_size, + 1 << (PAGE_SHIFT + MAX_ORDER-1)); + } INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, From 298f3db6ee690259927b105d5ad1079563361323 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 23 Apr 2020 09:31:31 -0700 Subject: [PATCH 024/427] dma-contiguous: fix comment for dma_release_from_contiguous Commit 90ae409f9eb3 ("dma-direct: fix zone selection after an unaddressable CMA allocation") changed the logic in dma_release_from_contiguous to remove the normal pages fallback path, but did not update the comment. Fix that. 
Signed-off-by: Peter Collingbourne Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 8bc6f2d670f9..15bc5026c485 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -222,8 +222,8 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, * @gfp: Allocation flags. * * This function allocates contiguous memory buffer for specified device. It - * first tries to use device specific contiguous memory area if available or - * the default global one, then tries a fallback allocation of normal pages. + * tries to use device specific contiguous memory area if available, or the + * default global one. * * Note that it byapss one-page size of allocations from the global area as * the addresses within one page are always contiguous, so there is no need From c100beb9ccfb98e2474586a4006483cbf770c823 Mon Sep 17 00:00:00 2001 From: Alexandru Gagniuc Date: Mon, 27 Apr 2020 18:25:13 -0500 Subject: [PATCH 025/427] PCI/AER: Use only _OSC to determine AER ownership Per the PCI Firmware spec, r3.2, sec 4.5.1, the OS can request control of AER via bit 3 of the _OSC Control Field. In the returned value of the Control Field: The firmware sets [bit 3] to 1 to grant control over PCI Express Advanced Error Reporting. ... after control is transferred to the operating system, firmware must not modify the Advanced Error Reporting Capability. If control of this feature was requested and denied or was not requested, firmware returns this bit set to 0. Previously the pci_root driver looked at the HEST FIRMWARE_FIRST bit to determine whether to request ownership of the AER Capability. This was based on ACPI spec v6.3, sec 18.3.2.4, and similar sections, which say things like: Bit [0] - FIRMWARE_FIRST: If set, indicates that system firmware will handle errors from this source first. 
Bit [1] - GLOBAL: If set, indicates that the settings contained in this structure apply globally to all PCI Express Devices. These ACPI references don't say anything about ownership of the AER Capability. Remove use of the FIRMWARE_FIRST bit and rely only on the _OSC bit to determine whether we have control of the AER Capability. Link: https://lore.kernel.org/r/20181115231605.24352-1-mr.nuke.me@gmail.com/ v1 Link: https://lore.kernel.org/r/20190326172343.28946-1-mr.nuke.me@gmail.com/ v2 Link: https://lore.kernel.org/r/67af2931705bed9a588b5a39d369cb70b9942190.1587925636.git.sathyanarayanan.kuppuswamy@linux.intel.com [bhelgaas: commit log, note: Alex posted this identical patch 18 months ago, and I failed to apply it then, so I made him the author, added links to his postings, and added his Signed-off-by] Signed-off-by: Alexandru Gagniuc Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Reviewed-by: Jon Derrick --- drivers/acpi/pci_root.c | 9 ++------- drivers/pci/pcie/aer.c | 26 +------------------------- include/linux/pci-acpi.h | 6 ------ 3 files changed, 3 insertions(+), 38 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index ac8ad6cb82aa..9e235c1a75ff 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -483,13 +483,8 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, if (IS_ENABLED(CONFIG_HOTPLUG_PCI_SHPC)) control |= OSC_PCI_SHPC_NATIVE_HP_CONTROL; - if (pci_aer_available()) { - if (aer_acpi_firmware_first()) - dev_info(&device->dev, - "PCIe AER handled by firmware\n"); - else - control |= OSC_PCI_EXPRESS_AER_CONTROL; - } + if (pci_aer_available()) + control |= OSC_PCI_EXPRESS_AER_CONTROL; /* * Per the Downstream Port Containment Related Enhancements ECN to diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index f4274d301235..efc26773cc6d 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -318,30 +318,6 @@ int 
pcie_aer_get_firmware_first(struct pci_dev *dev) aer_set_firmware_first(dev); return dev->__aer_firmware_first; } - -static bool aer_firmware_first; - -/** - * aer_acpi_firmware_first - Check if APEI should control AER. - */ -bool aer_acpi_firmware_first(void) -{ - static bool parsed = false; - struct aer_hest_parse_info info = { - .pci_dev = NULL, /* Check all PCIe devices */ - .firmware_first = 0, - }; - - if (pcie_ports_native) - return false; - - if (!parsed) { - apei_hest_parse(aer_hest_parse, &info); - aer_firmware_first = info.firmware_first; - parsed = true; - } - return aer_firmware_first; -} #endif #define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \ @@ -1523,7 +1499,7 @@ static struct pcie_port_service_driver aerdriver = { */ int __init pcie_aer_init(void) { - if (!pci_aer_available() || aer_acpi_firmware_first()) + if (!pci_aer_available()) return -ENXIO; return pcie_port_service_register(&aerdriver); } diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 2d155bfb8fbf..11c98875538a 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -125,10 +125,4 @@ static inline void acpi_pci_add_bus(struct pci_bus *bus) { } static inline void acpi_pci_remove_bus(struct pci_bus *bus) { } #endif /* CONFIG_ACPI */ -#ifdef CONFIG_ACPI_APEI -extern bool aer_acpi_firmware_first(void); -#else -static inline bool aer_acpi_firmware_first(void) { return false; } -#endif - #endif /* _PCI_ACPI_H_ */ From 0b104773b4f72ccd8af98a2f1efe69b174c344d3 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 9 Apr 2020 17:49:21 -0600 Subject: [PATCH 026/427] PCI: Constify struct pci_ecam_ops struct pci_ecam_ops is typically DT match table data which is defined to be const. It's also best practice for ops structs to be const. Ideally, we'd make struct pci_ops const as well, but that becomes pretty invasive, so for now we just cast it where needed. 
Link: https://lore.kernel.org/r/20200409234923.21598-2-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas Acked-by: Catalin Marinas Cc: Catalin Marinas Cc: Will Deacon Cc: Lorenzo Pieralisi Cc: Andrew Murray Cc: Bjorn Helgaas Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Jonathan Chocron Cc: Zhou Wang Cc: Robert Richter Cc: Toan Le Cc: Marc Gonzalez Cc: Mans Rullgard Cc: linux-acpi@vger.kernel.org --- arch/arm64/kernel/pci.c | 4 ++-- drivers/acpi/pci_mcfg.c | 8 ++++---- drivers/pci/controller/dwc/pcie-al.c | 2 +- drivers/pci/controller/dwc/pcie-hisi.c | 8 ++++---- drivers/pci/controller/pci-host-common.c | 6 +++--- drivers/pci/controller/pci-host-generic.c | 4 ++-- drivers/pci/controller/pci-thunder-ecam.c | 2 +- drivers/pci/controller/pci-thunder-pem.c | 4 ++-- drivers/pci/controller/pci-xgene.c | 4 ++-- drivers/pci/controller/pcie-tango.c | 2 +- drivers/pci/ecam.c | 6 +++--- include/linux/pci-acpi.h | 2 +- include/linux/pci-ecam.h | 22 +++++++++++----------- 13 files changed, 37 insertions(+), 37 deletions(-) diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 570988c7a7ff..1006ed2d7c60 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -117,7 +117,7 @@ pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) struct device *dev = &root->device->dev; struct resource *bus_res = &root->secondary; u16 seg = root->segment; - struct pci_ecam_ops *ecam_ops; + const struct pci_ecam_ops *ecam_ops; struct resource cfgres; struct acpi_device *adev; struct pci_config_window *cfg; @@ -185,7 +185,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) root_ops->release_info = pci_acpi_generic_release_info; root_ops->prepare_resources = pci_acpi_root_prepare_resources; - root_ops->pci_ops = &ri->cfg->ops->pci_ops; + root_ops->pci_ops = (struct pci_ops *)&ri->cfg->ops->pci_ops; bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg); if (!bus) return NULL; diff --git 
a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c index 6b347d9920cc..54b36b7ad47d 100644 --- a/drivers/acpi/pci_mcfg.c +++ b/drivers/acpi/pci_mcfg.c @@ -29,7 +29,7 @@ struct mcfg_fixup { u32 oem_revision; u16 segment; struct resource bus_range; - struct pci_ecam_ops *ops; + const struct pci_ecam_ops *ops; struct resource cfgres; }; @@ -165,7 +165,7 @@ static int pci_mcfg_quirk_matches(struct mcfg_fixup *f, u16 segment, static void pci_mcfg_apply_quirks(struct acpi_pci_root *root, struct resource *cfgres, - struct pci_ecam_ops **ecam_ops) + const struct pci_ecam_ops **ecam_ops) { #ifdef CONFIG_PCI_QUIRKS u16 segment = root->segment; @@ -191,9 +191,9 @@ static void pci_mcfg_apply_quirks(struct acpi_pci_root *root, static LIST_HEAD(pci_mcfg_list); int pci_mcfg_lookup(struct acpi_pci_root *root, struct resource *cfgres, - struct pci_ecam_ops **ecam_ops) + const struct pci_ecam_ops **ecam_ops) { - struct pci_ecam_ops *ops = &pci_generic_ecam_ops; + const struct pci_ecam_ops *ops = &pci_generic_ecam_ops; struct resource *bus_res = &root->secondary; u16 seg = root->segment; struct mcfg_entry *e; diff --git a/drivers/pci/controller/dwc/pcie-al.c b/drivers/pci/controller/dwc/pcie-al.c index 1eeda2f6371f..270868f3859a 100644 --- a/drivers/pci/controller/dwc/pcie-al.c +++ b/drivers/pci/controller/dwc/pcie-al.c @@ -80,7 +80,7 @@ static int al_pcie_init(struct pci_config_window *cfg) return 0; } -struct pci_ecam_ops al_pcie_ops = { +const struct pci_ecam_ops al_pcie_ops = { .bus_shift = 20, .init = al_pcie_init, .pci_ops = { diff --git a/drivers/pci/controller/dwc/pcie-hisi.c b/drivers/pci/controller/dwc/pcie-hisi.c index 6d9e1b2b8f7b..90017045334d 100644 --- a/drivers/pci/controller/dwc/pcie-hisi.c +++ b/drivers/pci/controller/dwc/pcie-hisi.c @@ -104,7 +104,7 @@ static int hisi_pcie_init(struct pci_config_window *cfg) return 0; } -struct pci_ecam_ops hisi_pcie_ops = { +const struct pci_ecam_ops hisi_pcie_ops = { .bus_shift = 20, .init = hisi_pcie_init, .pci_ops = { @@ 
-362,7 +362,7 @@ static int hisi_pcie_platform_init(struct pci_config_window *cfg) return 0; } -struct pci_ecam_ops hisi_pcie_platform_ops = { +const struct pci_ecam_ops hisi_pcie_platform_ops = { .bus_shift = 20, .init = hisi_pcie_platform_init, .pci_ops = { @@ -375,11 +375,11 @@ struct pci_ecam_ops hisi_pcie_platform_ops = { static const struct of_device_id hisi_pcie_almost_ecam_of_match[] = { { .compatible = "hisilicon,hip06-pcie-ecam", - .data = (void *) &hisi_pcie_platform_ops, + .data = &hisi_pcie_platform_ops, }, { .compatible = "hisilicon,hip07-pcie-ecam", - .data = (void *) &hisi_pcie_platform_ops, + .data = &hisi_pcie_platform_ops, }, {}, }; diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c index 250a3fc80ec6..f6d5dc068488 100644 --- a/drivers/pci/controller/pci-host-common.c +++ b/drivers/pci/controller/pci-host-common.c @@ -19,7 +19,7 @@ static void gen_pci_unmap_cfg(void *ptr) } static struct pci_config_window *gen_pci_init(struct device *dev, - struct list_head *resources, struct pci_ecam_ops *ops) + struct list_head *resources, const struct pci_ecam_ops *ops) { int err; struct resource cfgres; @@ -55,7 +55,7 @@ err_out: } int pci_host_common_probe(struct platform_device *pdev, - struct pci_ecam_ops *ops) + const struct pci_ecam_ops *ops) { struct device *dev = &pdev->dev; struct pci_host_bridge *bridge; @@ -82,7 +82,7 @@ int pci_host_common_probe(struct platform_device *pdev, bridge->dev.parent = dev; bridge->sysdata = cfg; bridge->busnr = cfg->busr.start; - bridge->ops = &ops->pci_ops; + bridge->ops = (struct pci_ops *)&ops->pci_ops; bridge->map_irq = of_irq_parse_and_map_pci; bridge->swizzle_irq = pci_common_swizzle; diff --git a/drivers/pci/controller/pci-host-generic.c b/drivers/pci/controller/pci-host-generic.c index 75a2fb930d4b..7e9a7c0833b1 100644 --- a/drivers/pci/controller/pci-host-generic.c +++ b/drivers/pci/controller/pci-host-generic.c @@ -15,7 +15,7 @@ #include #include -static struct 
pci_ecam_ops gen_pci_cfg_cam_bus_ops = { +static const struct pci_ecam_ops gen_pci_cfg_cam_bus_ops = { .bus_shift = 16, .pci_ops = { .map_bus = pci_ecam_map_bus, @@ -49,7 +49,7 @@ static void __iomem *pci_dw_ecam_map_bus(struct pci_bus *bus, return pci_ecam_map_bus(bus, devfn, where); } -static struct pci_ecam_ops pci_dw_ecam_bus_ops = { +static const struct pci_ecam_ops pci_dw_ecam_bus_ops = { .bus_shift = 20, .pci_ops = { .map_bus = pci_dw_ecam_map_bus, diff --git a/drivers/pci/controller/pci-thunder-ecam.c b/drivers/pci/controller/pci-thunder-ecam.c index 32d1d7b81ef4..c3fdd3e6b21c 100644 --- a/drivers/pci/controller/pci-thunder-ecam.c +++ b/drivers/pci/controller/pci-thunder-ecam.c @@ -345,7 +345,7 @@ static int thunder_ecam_config_write(struct pci_bus *bus, unsigned int devfn, return pci_generic_config_write(bus, devfn, where, size, val); } -struct pci_ecam_ops pci_thunder_ecam_ops = { +const struct pci_ecam_ops pci_thunder_ecam_ops = { .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, diff --git a/drivers/pci/controller/pci-thunder-pem.c b/drivers/pci/controller/pci-thunder-pem.c index 9491e266b1ea..2e792707ceab 100644 --- a/drivers/pci/controller/pci-thunder-pem.c +++ b/drivers/pci/controller/pci-thunder-pem.c @@ -403,7 +403,7 @@ static int thunder_pem_acpi_init(struct pci_config_window *cfg) return thunder_pem_init(dev, cfg, res_pem); } -struct pci_ecam_ops thunder_pem_ecam_ops = { +const struct pci_ecam_ops thunder_pem_ecam_ops = { .bus_shift = 24, .init = thunder_pem_acpi_init, .pci_ops = { @@ -440,7 +440,7 @@ static int thunder_pem_platform_init(struct pci_config_window *cfg) return thunder_pem_init(dev, cfg, res_pem); } -static struct pci_ecam_ops pci_thunder_pem_ops = { +static const struct pci_ecam_ops pci_thunder_pem_ops = { .bus_shift = 24, .init = thunder_pem_platform_init, .pci_ops = { diff --git a/drivers/pci/controller/pci-xgene.c b/drivers/pci/controller/pci-xgene.c index de195fd430dc..d1efa8ffbae1 100644 --- 
a/drivers/pci/controller/pci-xgene.c +++ b/drivers/pci/controller/pci-xgene.c @@ -256,7 +256,7 @@ static int xgene_v1_pcie_ecam_init(struct pci_config_window *cfg) return xgene_pcie_ecam_init(cfg, XGENE_PCIE_IP_VER_1); } -struct pci_ecam_ops xgene_v1_pcie_ecam_ops = { +const struct pci_ecam_ops xgene_v1_pcie_ecam_ops = { .bus_shift = 16, .init = xgene_v1_pcie_ecam_init, .pci_ops = { @@ -271,7 +271,7 @@ static int xgene_v2_pcie_ecam_init(struct pci_config_window *cfg) return xgene_pcie_ecam_init(cfg, XGENE_PCIE_IP_VER_2); } -struct pci_ecam_ops xgene_v2_pcie_ecam_ops = { +const struct pci_ecam_ops xgene_v2_pcie_ecam_ops = { .bus_shift = 16, .init = xgene_v2_pcie_ecam_init, .pci_ops = { diff --git a/drivers/pci/controller/pcie-tango.c b/drivers/pci/controller/pcie-tango.c index 21a208da3f59..3b2b10906fdd 100644 --- a/drivers/pci/controller/pcie-tango.c +++ b/drivers/pci/controller/pcie-tango.c @@ -207,7 +207,7 @@ static int smp8759_config_write(struct pci_bus *bus, unsigned int devfn, return ret; } -static struct pci_ecam_ops smp8759_ecam_ops = { +static const struct pci_ecam_ops smp8759_ecam_ops = { .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c index 1a81af0ba961..1b05ff627859 100644 --- a/drivers/pci/ecam.c +++ b/drivers/pci/ecam.c @@ -26,7 +26,7 @@ static const bool per_bus_mapping = !IS_ENABLED(CONFIG_64BIT); */ struct pci_config_window *pci_ecam_create(struct device *dev, struct resource *cfgres, struct resource *busr, - struct pci_ecam_ops *ops) + const struct pci_ecam_ops *ops) { struct pci_config_window *cfg; unsigned int bus_range, bus_range_max, bsz; @@ -145,7 +145,7 @@ void __iomem *pci_ecam_map_bus(struct pci_bus *bus, unsigned int devfn, } /* ECAM ops */ -struct pci_ecam_ops pci_generic_ecam_ops = { +const struct pci_ecam_ops pci_generic_ecam_ops = { .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, @@ -156,7 +156,7 @@ struct pci_ecam_ops pci_generic_ecam_ops = { #if 
defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) /* ECAM ops for 32-bit access only (non-compliant) */ -struct pci_ecam_ops pci_32b_ops = { +const struct pci_ecam_ops pci_32b_ops = { .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 2d155bfb8fbf..81f5535ca1b5 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -27,7 +27,7 @@ extern phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle); struct pci_ecam_ops; extern int pci_mcfg_lookup(struct acpi_pci_root *root, struct resource *cfgres, - struct pci_ecam_ops **ecam_ops); + const struct pci_ecam_ops **ecam_ops); static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index a73164c85e78..6c21dd0901ab 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -29,7 +29,7 @@ struct pci_config_window { struct resource res; struct resource busr; void *priv; - struct pci_ecam_ops *ops; + const struct pci_ecam_ops *ops; union { void __iomem *win; /* 64-bit single mapping */ void __iomem **winp; /* 32-bit per-bus mapping */ @@ -40,29 +40,29 @@ struct pci_config_window { /* create and free pci_config_window */ struct pci_config_window *pci_ecam_create(struct device *dev, struct resource *cfgres, struct resource *busr, - struct pci_ecam_ops *ops); + const struct pci_ecam_ops *ops); void pci_ecam_free(struct pci_config_window *cfg); /* map_bus when ->sysdata is an instance of pci_config_window */ void __iomem *pci_ecam_map_bus(struct pci_bus *bus, unsigned int devfn, int where); /* default ECAM ops */ -extern struct pci_ecam_ops pci_generic_ecam_ops; +extern const struct pci_ecam_ops pci_generic_ecam_ops; #if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) -extern struct pci_ecam_ops pci_32b_ops; /* 32-bit accesses only */ -extern struct pci_ecam_ops hisi_pcie_ops; /* HiSilicon */ -extern struct pci_ecam_ops 
thunder_pem_ecam_ops; /* Cavium ThunderX 1.x & 2.x */ -extern struct pci_ecam_ops pci_thunder_ecam_ops; /* Cavium ThunderX 1.x */ -extern struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 */ -extern struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */ -extern struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ +extern const struct pci_ecam_ops pci_32b_ops; /* 32-bit accesses only */ +extern const struct pci_ecam_ops hisi_pcie_ops; /* HiSilicon */ +extern const struct pci_ecam_ops thunder_pem_ecam_ops; /* Cavium ThunderX 1.x & 2.x */ +extern const struct pci_ecam_ops pci_thunder_ecam_ops; /* Cavium ThunderX 1.x */ +extern const struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 */ +extern const struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */ +extern const struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ #endif #ifdef CONFIG_PCI_HOST_COMMON /* for DT-based PCI controllers that support ECAM */ int pci_host_common_probe(struct platform_device *pdev, - struct pci_ecam_ops *ops); + const struct pci_ecam_ops *ops); int pci_host_common_remove(struct platform_device *pdev); #endif #endif From 0c59c06a7c90390c3985c9acd58a73320781c15e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 9 Apr 2020 17:49:22 -0600 Subject: [PATCH 027/427] PCI: host-generic: Support building as modules Enable building host-generic and its host-common dependency as a module. 
Link: https://lore.kernel.org/r/20200409234923.21598-3-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Lorenzo Pieralisi Acked-by: Will Deacon Acked-by: Bjorn Helgaas Cc: Lorenzo Pieralisi Cc: Andrew Murray Cc: Bjorn Helgaas Cc: Will Deacon Cc: linux-pci@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org --- drivers/pci/controller/Kconfig | 4 ++-- drivers/pci/controller/pci-host-common.c | 5 +++++ drivers/pci/controller/pci-host-generic.c | 7 +++++-- drivers/pci/ecam.c | 4 ++++ drivers/pci/setup-bus.c | 1 + include/linux/pci-ecam.h | 2 +- 6 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 91bfdb784829..416a53414728 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -62,11 +62,11 @@ config PCIE_RCAR Say Y here if you want PCIe controller support on R-Car SoCs. config PCI_HOST_COMMON - bool + tristate select PCI_ECAM config PCI_HOST_GENERIC - bool "Generic PCI host controller" + tristate "Generic PCI host controller" depends on OF select PCI_HOST_COMMON select IRQ_DOMAIN diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c index f6d5dc068488..6d15bc12e726 100644 --- a/drivers/pci/controller/pci-host-common.c +++ b/drivers/pci/controller/pci-host-common.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -95,6 +96,7 @@ int pci_host_common_probe(struct platform_device *pdev, platform_set_drvdata(pdev, bridge->bus); return 0; } +EXPORT_SYMBOL_GPL(pci_host_common_probe); int pci_host_common_remove(struct platform_device *pdev) { @@ -107,3 +109,6 @@ int pci_host_common_remove(struct platform_device *pdev) return 0; } +EXPORT_SYMBOL_GPL(pci_host_common_remove); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/controller/pci-host-generic.c b/drivers/pci/controller/pci-host-generic.c index 7e9a7c0833b1..fd8cff61de14 100644 --- a/drivers/pci/controller/pci-host-generic.c +++ 
b/drivers/pci/controller/pci-host-generic.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -76,6 +77,7 @@ static const struct of_device_id gen_pci_of_match[] = { { }, }; +MODULE_DEVICE_TABLE(of, gen_pci_of_match); static int gen_pci_probe(struct platform_device *pdev) { @@ -92,9 +94,10 @@ static struct platform_driver gen_pci_driver = { .driver = { .name = "pci-host-generic", .of_match_table = gen_pci_of_match, - .suppress_bind_attrs = true, }, .probe = gen_pci_probe, .remove = pci_host_common_remove, }; -builtin_platform_driver(gen_pci_driver); +module_platform_driver(gen_pci_driver); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c index 1b05ff627859..8f065a42fc1a 100644 --- a/drivers/pci/ecam.c +++ b/drivers/pci/ecam.c @@ -101,6 +101,7 @@ err_exit: pci_ecam_free(cfg); return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(pci_ecam_create); void pci_ecam_free(struct pci_config_window *cfg) { @@ -121,6 +122,7 @@ void pci_ecam_free(struct pci_config_window *cfg) release_resource(&cfg->res); kfree(cfg); } +EXPORT_SYMBOL_GPL(pci_ecam_free); /* * Function to implement the pci_ops ->map_bus method @@ -143,6 +145,7 @@ void __iomem *pci_ecam_map_bus(struct pci_bus *bus, unsigned int devfn, base = cfg->win + (busn << cfg->ops->bus_shift); return base + (devfn << devfn_shift) + where; } +EXPORT_SYMBOL_GPL(pci_ecam_map_bus); /* ECAM ops */ const struct pci_ecam_ops pci_generic_ecam_ops = { @@ -153,6 +156,7 @@ const struct pci_ecam_ops pci_generic_ecam_ops = { .write = pci_generic_config_write, } }; +EXPORT_SYMBOL_GPL(pci_generic_ecam_ops); #if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) /* ECAM ops for 32-bit access only (non-compliant) */ diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index bbcef1a053ab..5b35f7fc2ace 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -26,6 +26,7 @@ #include "pci.h" unsigned int pci_flags; +EXPORT_SYMBOL_GPL(pci_flags); struct pci_dev_resource { 
struct list_head list; diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 6c21dd0901ab..fd0edb8b8a00 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -59,7 +59,7 @@ extern const struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x extern const struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ #endif -#ifdef CONFIG_PCI_HOST_COMMON +#if IS_ENABLED(CONFIG_PCI_HOST_COMMON) /* for DT-based PCI controllers that support ECAM */ int pci_host_common_probe(struct platform_device *pdev, const struct pci_ecam_ops *ops); From d09ddd8190fbdc07696bf34b548ae15aa1816714 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Apr 2020 18:22:56 +0200 Subject: [PATCH 028/427] PCI: Allow pci_resize_resource() for devices on root bus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When resizing a BAR, pci_reassign_bridge_resources() is invoked to bring the bridge windows of parent bridges in line with the new BAR assignment. This assumes the device whose BAR is being resized lives on a subordinate bus, but this is not necessarily the case. A device may live on the root bus, in which case dev->bus->self is NULL, and passing a NULL pci_dev pointer to pci_reassign_bridge_resources() will cause it to crash. So let's make the call to pci_reassign_bridge_resources() conditional on whether dev->bus->self is non-NULL in the first place. 
Fixes: 8bb705e3e79d84e7 ("PCI: Add pci_resize_resource() for resizing BARs") Link: https://lore.kernel.org/r/20200421162256.26887-1-ardb@kernel.org Signed-off-by: Ard Biesheuvel Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König --- drivers/pci/setup-res.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index d8ca40a97693..d21fa04fa44d 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -439,10 +439,11 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size) res->end = res->start + pci_rebar_size_to_bytes(size) - 1; /* Check if the new config works by trying to assign everything. */ - ret = pci_reassign_bridge_resources(dev->bus->self, res->flags); - if (ret) - goto error_resize; - + if (dev->bus->self) { + ret = pci_reassign_bridge_resources(dev->bus->self, res->flags); + if (ret) + goto error_resize; + } return 0; error_resize: From bf7116204657cabf6f74b5ade32424d04f480338 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 20 Apr 2020 08:52:27 +0200 Subject: [PATCH 029/427] PCI: dwc: Clean up computing of msix_tbl Commit 6f5e193bfb55 ("PCI: dwc: Fix dw_pcie_ep_raise_msix_irq() to get correct MSI-X table address") overcomplicated the computation of the msix_tbl address. Simplify it as it's simply the addr + offset. Provided addr is (void *) already. objdump -d shows no difference after this patch. 
Link: https://lore.kernel.org/r/20200420065227.4920-1-jslaby@suse.cz Signed-off-by: Jiri Slaby Signed-off-by: Lorenzo Pieralisi Cc: Kishon Vijay Abraham I Cc: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-designware-ep.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 1cdcbd102ce8..c815d36905b6 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -433,7 +433,6 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no, struct dw_pcie *pci = to_dw_pcie_from_ep(ep); struct pci_epf_msix_tbl *msix_tbl; struct pci_epc *epc = ep->epc; - struct pci_epf_bar *epf_bar; u32 reg, msg_data, vec_ctrl; unsigned int aligned_offset; u32 tbl_offset; @@ -446,10 +445,7 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no, bir = (tbl_offset & PCI_MSIX_TABLE_BIR); tbl_offset &= PCI_MSIX_TABLE_OFFSET; - epf_bar = ep->epf_bar[bir]; - msix_tbl = epf_bar->addr; - msix_tbl = (struct pci_epf_msix_tbl *)((char *)msix_tbl + tbl_offset); - + msix_tbl = ep->epf_bar[bir]->addr + tbl_offset; msg_addr = msix_tbl[(interrupt_num - 1)].msg_addr; msg_data = msix_tbl[(interrupt_num - 1)].msg_data; vec_ctrl = msix_tbl[(interrupt_num - 1)].vector_ctrl; From 819482a96790d25fca97ec5c752400edbef05753 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Thu, 23 Apr 2020 11:18:03 +0800 Subject: [PATCH 030/427] PCI: dwc: Make hisi_pcie_platform_ops static Fix the following sparse warning: drivers/pci/controller/dwc/pcie-hisi.c:365:21: warning: symbol 'hisi_pcie_platform_ops' was not declared. Should it be static? 
Link: https://lore.kernel.org/r/1587611883-26960-1-git-send-email-zou_wei@huawei.com Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: Lorenzo Pieralisi Reviewed-by: Zhou Wang --- drivers/pci/controller/dwc/pcie-hisi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-hisi.c b/drivers/pci/controller/dwc/pcie-hisi.c index 6d9e1b2b8f7b..11f5ff711d4d 100644 --- a/drivers/pci/controller/dwc/pcie-hisi.c +++ b/drivers/pci/controller/dwc/pcie-hisi.c @@ -362,7 +362,7 @@ static int hisi_pcie_platform_init(struct pci_config_window *cfg) return 0; } -struct pci_ecam_ops hisi_pcie_platform_ops = { +static struct pci_ecam_ops hisi_pcie_platform_ops = { .bus_shift = 20, .init = hisi_pcie_platform_init, .pci_ops = { From bca718988b9008d0d5f504e2d318178fc84958c1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 18 Apr 2020 10:16:37 +0200 Subject: [PATCH 031/427] PCI: v3-semi: Fix a memory leak in v3_pci_probe() error handling paths If we fail somewhere in 'v3_pci_probe()', we need to free 'host'. Use the managed version of 'pci_alloc_host_bridge()' to do that easily. Managed resources are already widely used in this driver.
Link: https://lore.kernel.org/r/20200418081637.1585-1-christophe.jaillet@wanadoo.fr Fixes: 68a15eb7bd0c ("PCI: v3-semi: Add V3 Semiconductor PCI host driver") Signed-off-by: Christophe JAILLET [lorenzo.pieralisi@arm.com: commit log] Signed-off-by: Lorenzo Pieralisi Acked-by: Linus Walleij --- drivers/pci/controller/pci-v3-semi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-v3-semi.c b/drivers/pci/controller/pci-v3-semi.c index bd05221f5a22..ddcb4571a79b 100644 --- a/drivers/pci/controller/pci-v3-semi.c +++ b/drivers/pci/controller/pci-v3-semi.c @@ -720,7 +720,7 @@ static int v3_pci_probe(struct platform_device *pdev) int irq; int ret; - host = pci_alloc_host_bridge(sizeof(*v3)); + host = devm_pci_alloc_host_bridge(dev, sizeof(*v3)); if (!host) return -ENOMEM; From 0e86d981f9b7252e9716c5137cd8e4d9ad8ef32f Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Wed, 22 Apr 2020 16:24:47 +0900 Subject: [PATCH 032/427] PCI: endpoint: functions/pci-epf-test: Fix DMA channel release When unbinding pci_epf_test, pci_epf_test_clean_dma_chan() is called in pci_epf_test_unbind() even though epf_test->dma_supported is false. As a result, dma_release_channel() will trigger a NULL pointer dereference because dma_chan is not set. Avoid calling dma_release_channel() if epf_test->dma_supported is false. 
Link: https://lore.kernel.org/r/1587540287-10458-1-git-send-email-hayashi.kunihiko@socionext.com Fixes: 5ebf3fc59bd2 ("PCI: endpoint: functions/pci-epf-test: Add DMA support to transfer data") Signed-off-by: Kunihiko Hayashi [lorenzo.pieralisi@arm.com: commit log] Signed-off-by: Lorenzo Pieralisi Acked-by: Kishon Vijay Abraham I --- drivers/pci/endpoint/functions/pci-epf-test.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index 60330f3e3751..c89a9561439f 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -187,6 +187,9 @@ static int pci_epf_test_init_dma_chan(struct pci_epf_test *epf_test) */ static void pci_epf_test_clean_dma_chan(struct pci_epf_test *epf_test) { + if (!epf_test->dma_supported) + return; + dma_release_channel(epf_test->dma_chan); epf_test->dma_chan = NULL; } From c96efe26569204555705076368f8ef7565d06fa6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 27 Mar 2020 13:45:56 +0000 Subject: [PATCH 033/427] PCI: altera: Clean up indentation issue on a return statement A return statement is indented incorrectly; remove the extraneous space.
Link: https://lore.kernel.org/r/20200327134556.265411-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pcie-altera.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-altera.c b/drivers/pci/controller/pcie-altera.c index b447c3e4abad..24cb1c331058 100644 --- a/drivers/pci/controller/pcie-altera.c +++ b/drivers/pci/controller/pcie-altera.c @@ -193,7 +193,7 @@ static bool altera_pcie_valid_device(struct altera_pcie *pcie, if (bus->number == pcie->root_bus_nr && dev > 0) return false; - return true; + return true; } static int tlp_read_packet(struct altera_pcie *pcie, u32 *value) From f187b6974f6dfbeba4aafda972cc37f27d091b73 Mon Sep 17 00:00:00 2001 From: Sean Fu Date: Wed, 29 Apr 2020 12:04:13 +0800 Subject: [PATCH 034/427] workqueue: Use IS_ERR and PTR_ERR instead of PTR_ERR_OR_ZERO. Replace inline function PTR_ERR_OR_ZERO with IS_ERR and PTR_ERR to remove redundant parameter definitions and checks. Reduce code size. 
Before: text data bss dec hex filename 47510 5979 840 54329 d439 kernel/workqueue.o After: text data bss dec hex filename 47474 5979 840 54293 d415 kernel/workqueue.o Signed-off-by: Sean Fu Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 891ccad5f271..ddf0537dce14 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4197,7 +4197,6 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; - int ret; if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; @@ -4208,10 +4207,9 @@ static int init_rescuer(struct workqueue_struct *wq) rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); - ret = PTR_ERR_OR_ZERO(rescuer->task); - if (ret) { + if (IS_ERR(rescuer->task)) { kfree(rescuer); - return ret; + return PTR_ERR(rescuer->task); } wq->rescuer = rescuer; From b2f75a41eaa6bfc4aa6f6a1faefbf21c2c8d1588 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 9 Apr 2020 17:49:23 -0600 Subject: [PATCH 035/427] PCI: host-generic: Eliminate pci_host_common_probe wrappers Most ECAM host drivers are just different pci_ecam_ops which can be DT match table data. That's already the case in some cases, but let's do that for all the ECAM drivers. Then we can use of_device_get_match_data() in pci_host_common_probe() and eliminate the probe wrapper functions and use pci_host_common_probe() directly for probe. 
Link: https://lore.kernel.org/r/20200409234923.21598-4-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas Cc: Zhou Wang Cc: Lorenzo Pieralisi Cc: Andrew Murray Cc: Bjorn Helgaas Cc: Will Deacon Cc: Robert Richter Cc: Marc Gonzalez Cc: Mans Rullgard Cc: linux-pci@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org --- drivers/pci/controller/dwc/pcie-hisi.c | 11 +---------- drivers/pci/controller/pci-host-common.c | 9 +++++++-- drivers/pci/controller/pci-host-generic.c | 15 +-------------- drivers/pci/controller/pci-thunder-ecam.c | 12 +++++------- drivers/pci/controller/pci-thunder-pem.c | 12 +++++------- drivers/pci/controller/pcie-tango.c | 7 +++++-- include/linux/pci-ecam.h | 3 +-- 7 files changed, 25 insertions(+), 44 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-hisi.c b/drivers/pci/controller/dwc/pcie-hisi.c index 90017045334d..0ba50fb473b1 100644 --- a/drivers/pci/controller/dwc/pcie-hisi.c +++ b/drivers/pci/controller/dwc/pcie-hisi.c @@ -332,15 +332,6 @@ static struct platform_driver hisi_pcie_driver = { }; builtin_platform_driver(hisi_pcie_driver); -static int hisi_pcie_almost_ecam_probe(struct platform_device *pdev) -{ - struct device *dev = &pdev->dev; - struct pci_ecam_ops *ops; - - ops = (struct pci_ecam_ops *)of_device_get_match_data(dev); - return pci_host_common_probe(pdev, ops); -} - static int hisi_pcie_platform_init(struct pci_config_window *cfg) { struct device *dev = cfg->parent; @@ -385,7 +376,7 @@ static const struct of_device_id hisi_pcie_almost_ecam_of_match[] = { }; static struct platform_driver hisi_pcie_almost_ecam_driver = { - .probe = hisi_pcie_almost_ecam_probe, + .probe = pci_host_common_probe, .driver = { .name = "hisi-pcie-almost-ecam", .of_match_table = hisi_pcie_almost_ecam_of_match, diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c index 6d15bc12e726..953de57f6c57 100644 --- a/drivers/pci/controller/pci-host-common.c 
+++ b/drivers/pci/controller/pci-host-common.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -55,15 +56,19 @@ err_out: return ERR_PTR(err); } -int pci_host_common_probe(struct platform_device *pdev, - const struct pci_ecam_ops *ops) +int pci_host_common_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct pci_host_bridge *bridge; struct pci_config_window *cfg; struct list_head resources; + const struct pci_ecam_ops *ops; int ret; + ops = of_device_get_match_data(&pdev->dev); + if (!ops) + return -ENODEV; + bridge = devm_pci_alloc_host_bridge(dev, 0); if (!bridge) return -ENOMEM; diff --git a/drivers/pci/controller/pci-host-generic.c b/drivers/pci/controller/pci-host-generic.c index fd8cff61de14..b51977abfdf1 100644 --- a/drivers/pci/controller/pci-host-generic.c +++ b/drivers/pci/controller/pci-host-generic.c @@ -11,8 +11,6 @@ #include #include #include -#include -#include #include #include @@ -79,23 +77,12 @@ static const struct of_device_id gen_pci_of_match[] = { }; MODULE_DEVICE_TABLE(of, gen_pci_of_match); -static int gen_pci_probe(struct platform_device *pdev) -{ - const struct of_device_id *of_id; - struct pci_ecam_ops *ops; - - of_id = of_match_node(gen_pci_of_match, pdev->dev.of_node); - ops = (struct pci_ecam_ops *)of_id->data; - - return pci_host_common_probe(pdev, ops); -} - static struct platform_driver gen_pci_driver = { .driver = { .name = "pci-host-generic", .of_match_table = gen_pci_of_match, }, - .probe = gen_pci_probe, + .probe = pci_host_common_probe, .remove = pci_host_common_remove, }; module_platform_driver(gen_pci_driver); diff --git a/drivers/pci/controller/pci-thunder-ecam.c b/drivers/pci/controller/pci-thunder-ecam.c index c3fdd3e6b21c..7e8835fee5f7 100644 --- a/drivers/pci/controller/pci-thunder-ecam.c +++ b/drivers/pci/controller/pci-thunder-ecam.c @@ -357,22 +357,20 @@ const struct pci_ecam_ops pci_thunder_ecam_ops = { #ifdef CONFIG_PCI_HOST_THUNDER_ECAM static const struct 
of_device_id thunder_ecam_of_match[] = { - { .compatible = "cavium,pci-host-thunder-ecam" }, + { + .compatible = "cavium,pci-host-thunder-ecam", + .data = &pci_thunder_ecam_ops, + }, { }, }; -static int thunder_ecam_probe(struct platform_device *pdev) -{ - return pci_host_common_probe(pdev, &pci_thunder_ecam_ops); -} - static struct platform_driver thunder_ecam_driver = { .driver = { .name = KBUILD_MODNAME, .of_match_table = thunder_ecam_of_match, .suppress_bind_attrs = true, }, - .probe = thunder_ecam_probe, + .probe = pci_host_common_probe, }; builtin_platform_driver(thunder_ecam_driver); diff --git a/drivers/pci/controller/pci-thunder-pem.c b/drivers/pci/controller/pci-thunder-pem.c index 2e792707ceab..3f847969143e 100644 --- a/drivers/pci/controller/pci-thunder-pem.c +++ b/drivers/pci/controller/pci-thunder-pem.c @@ -451,22 +451,20 @@ static const struct pci_ecam_ops pci_thunder_pem_ops = { }; static const struct of_device_id thunder_pem_of_match[] = { - { .compatible = "cavium,pci-host-thunder-pem" }, + { + .compatible = "cavium,pci-host-thunder-pem", + .data = &pci_thunder_pem_ops, + }, { }, }; -static int thunder_pem_probe(struct platform_device *pdev) -{ - return pci_host_common_probe(pdev, &pci_thunder_pem_ops); -} - static struct platform_driver thunder_pem_driver = { .driver = { .name = KBUILD_MODNAME, .of_match_table = thunder_pem_of_match, .suppress_bind_attrs = true, }, - .probe = thunder_pem_probe, + .probe = pci_host_common_probe, }; builtin_platform_driver(thunder_pem_driver); diff --git a/drivers/pci/controller/pcie-tango.c b/drivers/pci/controller/pcie-tango.c index 3b2b10906fdd..c13367c30fc6 100644 --- a/drivers/pci/controller/pcie-tango.c +++ b/drivers/pci/controller/pcie-tango.c @@ -295,11 +295,14 @@ static int tango_pcie_probe(struct platform_device *pdev) spin_lock_init(&pcie->used_msi_lock); irq_set_chained_handler_and_data(virq, tango_msi_isr, pcie); - return pci_host_common_probe(pdev, &smp8759_ecam_ops); + return 
pci_host_common_probe(pdev); } static const struct of_device_id tango_pcie_ids[] = { - { .compatible = "sigma,smp8759-pcie" }, + { + .compatible = "sigma,smp8759-pcie", + .data = &smp8759_ecam_ops, + }, { }, }; diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index fd0edb8b8a00..1af5cb02ef7f 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -61,8 +61,7 @@ extern const struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ #if IS_ENABLED(CONFIG_PCI_HOST_COMMON) /* for DT-based PCI controllers that support ECAM */ -int pci_host_common_probe(struct platform_device *pdev, - const struct pci_ecam_ops *ops); +int pci_host_common_probe(struct platform_device *pdev); int pci_host_common_remove(struct platform_device *pdev); #endif #endif From 2b9f217433e31d125fb697ca7974d3de3ecc3e92 Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Fri, 4 Oct 2019 14:29:41 +0100 Subject: [PATCH 036/427] PCI: rcar: Fix incorrect programming of OB windows The outbound windows (PCIEPAUR(x), PCIEPALR(x)) describe a mapping between a CPU address (which is determined by the window number 'x') and a programmed PCI address - thus allowing the controller to translate CPU accesses into PCI accesses. However the existing code incorrectly writes the CPU address - let's fix this by writing the PCI address instead. For memory transactions, existing DT users describe a 1:1 identity mapping and thus this change should have no effect. However the same isn't true for I/O.
Link: https://lore.kernel.org/r/20191004132941.6660-1-andrew.murray@arm.com Fixes: c25da4778803 ("PCI: rcar: Add Renesas R-Car PCIe driver") Tested-by: Marek Vasut Signed-off-by: Andrew Murray Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Vasut --- drivers/pci/controller/pcie-rcar.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/pcie-rcar.c b/drivers/pci/controller/pcie-rcar.c index 759c6542c5c8..1bae6a4abaae 100644 --- a/drivers/pci/controller/pcie-rcar.c +++ b/drivers/pci/controller/pcie-rcar.c @@ -333,11 +333,12 @@ static struct pci_ops rcar_pcie_ops = { }; static void rcar_pcie_setup_window(int win, struct rcar_pcie *pcie, - struct resource *res) + struct resource_entry *window) { /* Setup PCIe address space mappings for each resource */ resource_size_t size; resource_size_t res_start; + struct resource *res = window->res; u32 mask; rcar_pci_write_reg(pcie, 0x00000000, PCIEPTCTLR(win)); @@ -351,9 +352,9 @@ static void rcar_pcie_setup_window(int win, struct rcar_pcie *pcie, rcar_pci_write_reg(pcie, mask << 7, PCIEPAMR(win)); if (res->flags & IORESOURCE_IO) - res_start = pci_pio_to_address(res->start); + res_start = pci_pio_to_address(res->start) - window->offset; else - res_start = res->start; + res_start = res->start - window->offset; rcar_pci_write_reg(pcie, upper_32_bits(res_start), PCIEPAUR(win)); rcar_pci_write_reg(pcie, lower_32_bits(res_start) & ~0x7F, @@ -382,7 +383,7 @@ static int rcar_pcie_setup(struct list_head *resource, struct rcar_pcie *pci) switch (resource_type(res)) { case IORESOURCE_IO: case IORESOURCE_MEM: - rcar_pcie_setup_window(i, pci, res); + rcar_pcie_setup_window(i, pci, win); i++; break; case IORESOURCE_BUS: From ce351636c67f75a9f282ab69283be231d2c8e845 Mon Sep 17 00:00:00 2001 From: Kazufumi Ikeda Date: Sat, 14 Mar 2020 20:12:32 +0100 Subject: [PATCH 037/427] PCI: rcar: Add suspend/resume Add suspend/resume support for rcar. 
The resume handler reprograms the hardware based on the software state kept in specific device structures, so there is no need to save registers on suspend. Link: https://lore.kernel.org/r/20200314191232.3122290-1-marek.vasut@gmail.com Link: https://lore.kernel.org/r/20200426123148.56051-1-marek.vasut@gmail.com Signed-off-by: Kazufumi Ikeda Signed-off-by: Gaku Inami Signed-off-by: Marek Vasut Signed-off-by: Lorenzo Pieralisi Cc: Geert Uytterhoeven Cc: Phil Edworthy Cc: Simon Horman Cc: Wolfram Sang Cc: linux-renesas-soc@vger.kernel.org --- drivers/pci/controller/pcie-rcar.c | 90 +++++++++++++++++++++++++----- 1 file changed, 75 insertions(+), 15 deletions(-) diff --git a/drivers/pci/controller/pcie-rcar.c b/drivers/pci/controller/pcie-rcar.c index 1bae6a4abaae..59e55f56e386 100644 --- a/drivers/pci/controller/pcie-rcar.c +++ b/drivers/pci/controller/pcie-rcar.c @@ -153,6 +153,7 @@ struct rcar_pcie { int root_bus_nr; struct clk *bus_clk; struct rcar_msi msi; + int (*phy_init_fn)(struct rcar_pcie *pcie); }; static void rcar_pci_write_reg(struct rcar_pcie *pcie, u32 val, @@ -453,6 +454,32 @@ done: (macsr & LINK_SPEED) == LINK_SPEED_5_0GTS ? 
"5" : "2.5"); } +static void rcar_pcie_hw_enable(struct rcar_pcie *pci) +{ + struct resource_entry *win; + LIST_HEAD(res); + int i = 0; + + /* Try setting 5 GT/s link speed */ + rcar_pcie_force_speedup(pci); + + /* Setup PCI resources */ + resource_list_for_each_entry(win, &pci->resources) { + struct resource *res = win->res; + + if (!res->flags) + continue; + + switch (resource_type(res)) { + case IORESOURCE_IO: + case IORESOURCE_MEM: + rcar_pcie_setup_window(i, pci, win); + i++; + break; + } + } +} + static int rcar_pcie_enable(struct rcar_pcie *pcie) { struct device *dev = pcie->dev; @@ -892,11 +919,25 @@ static void rcar_pcie_unmap_msi(struct rcar_pcie *pcie) irq_domain_remove(msi->domain); } +static void rcar_pcie_hw_enable_msi(struct rcar_pcie *pcie) +{ + struct rcar_msi *msi = &pcie->msi; + unsigned long base; + + /* setup MSI data target */ + base = virt_to_phys((void *)msi->pages); + + rcar_pci_write_reg(pcie, lower_32_bits(base) | MSIFE, PCIEMSIALR); + rcar_pci_write_reg(pcie, upper_32_bits(base), PCIEMSIAUR); + + /* enable all MSI interrupts */ + rcar_pci_write_reg(pcie, 0xffffffff, PCIEMSIIER); +} + static int rcar_pcie_enable_msi(struct rcar_pcie *pcie) { struct device *dev = pcie->dev; struct rcar_msi *msi = &pcie->msi; - phys_addr_t base; int err, i; mutex_init(&msi->lock); @@ -935,17 +976,7 @@ static int rcar_pcie_enable_msi(struct rcar_pcie *pcie) /* setup MSI data target */ msi->pages = __get_free_pages(GFP_KERNEL, 0); - if (!msi->pages) { - err = -ENOMEM; - goto err; - } - base = virt_to_phys((void *)msi->pages); - - rcar_pci_write_reg(pcie, lower_32_bits(base) | MSIFE, PCIEMSIALR); - rcar_pci_write_reg(pcie, upper_32_bits(base), PCIEMSIAUR); - - /* enable all MSI interrupts */ - rcar_pci_write_reg(pcie, 0xffffffff, PCIEMSIIER); + rcar_pcie_hw_enable_msi(pcie); return 0; @@ -1117,7 +1148,6 @@ static int rcar_pcie_probe(struct platform_device *pdev) struct rcar_pcie *pcie; u32 data; int err; - int (*phy_init_fn)(struct rcar_pcie *); struct 
pci_host_bridge *bridge; bridge = pci_alloc_host_bridge(sizeof(*pcie)); @@ -1157,8 +1187,8 @@ static int rcar_pcie_probe(struct platform_device *pdev) if (err) goto err_clk_disable; - phy_init_fn = of_device_get_match_data(dev); - err = phy_init_fn(pcie); + pcie->phy_init_fn = of_device_get_match_data(dev); + err = pcie->phy_init_fn(pcie); if (err) { dev_err(dev, "failed to init PCIe PHY\n"); goto err_clk_disable; @@ -1220,6 +1250,35 @@ err_free_bridge: return err; } +static int __maybe_unused rcar_pcie_resume(struct device *dev) +{ + struct rcar_pcie *pcie = dev_get_drvdata(dev); + unsigned int data; + int err; + + err = rcar_pcie_parse_map_dma_ranges(pcie); + if (err) + return 0; + + /* Failure to get a link might just be that no cards are inserted */ + err = pcie->phy_init_fn(pcie); + if (err) { + dev_info(dev, "PCIe link down\n"); + return 0; + } + + data = rcar_pci_read_reg(pcie, MACSR); + dev_info(dev, "PCIe x%d: link up\n", (data >> 20) & 0x3f); + + /* Enable MSI */ + if (IS_ENABLED(CONFIG_PCI_MSI)) + rcar_pcie_hw_enable_msi(pcie); + + rcar_pcie_hw_enable(pcie); + + return 0; +} + static int rcar_pcie_resume_noirq(struct device *dev) { struct rcar_pcie *pcie = dev_get_drvdata(dev); @@ -1235,6 +1294,7 @@ static int rcar_pcie_resume_noirq(struct device *dev) } static const struct dev_pm_ops rcar_pcie_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(NULL, rcar_pcie_resume) .resume_noirq = rcar_pcie_resume_noirq, }; From b24a0c16f738a68cd8c2997edf8ebe3d8951df93 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Fri, 10 Apr 2020 18:30:20 +0900 Subject: [PATCH 038/427] dt-bindings: pci: rcar: add r8a77961 support Add support for r8a77961 (R-Car M3-W+). To avoid confusion between R-Car M3-W (R8A77960) and R-Car M3-W+ (R8A77961), this patch also updates the comment of "renesas,pcie-r8a7796". 
Link: https://lore.kernel.org/r/1586511020-31833-1-git-send-email-yoshihiro.shimoda.uh@renesas.com Signed-off-by: Yoshihiro Shimoda Signed-off-by: Lorenzo Pieralisi Reviewed-by: Geert Uytterhoeven Acked-by: Rob Herring --- Documentation/devicetree/bindings/pci/rcar-pci.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pci/rcar-pci.txt b/Documentation/devicetree/bindings/pci/rcar-pci.txt index 12702c8c46ce..1041c44a614f 100644 --- a/Documentation/devicetree/bindings/pci/rcar-pci.txt +++ b/Documentation/devicetree/bindings/pci/rcar-pci.txt @@ -11,7 +11,8 @@ compatible: "renesas,pcie-r8a7743" for the R8A7743 SoC; "renesas,pcie-r8a7791" for the R8A7791 SoC; "renesas,pcie-r8a7793" for the R8A7793 SoC; "renesas,pcie-r8a7795" for the R8A7795 SoC; - "renesas,pcie-r8a7796" for the R8A7796 SoC; + "renesas,pcie-r8a7796" for the R8A77960 SoC; + "renesas,pcie-r8a77961" for the R8A77961 SoC; "renesas,pcie-r8a77980" for the R8A77980 SoC; "renesas,pcie-r8a77990" for the R8A77990 SoC; "renesas,pcie-rcar-gen2" for a generic R-Car Gen2 or From 781c036b678c9ae978efb5f89311d0b5e6748b10 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Mon, 13 Apr 2020 12:24:22 +0800 Subject: [PATCH 039/427] ext4: remove unnecessary test_opt for DIOREAD_NOLOCK The DIOREAD_NOLOCK flag has already been cleared by the time test_opt is called, which makes the check meaningless, so remove the unnecessary test_opt for DIOREAD_NOLOCK.
Signed-off-by: Kaixu Xia Link: https://lore.kernel.org/r/1586751862-19437-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..79e07e69cef9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3971,17 +3971,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); + /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); goto failed_mount; } - if (test_opt(sb, DIOREAD_NOLOCK)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dioread_nolock"); - goto failed_mount; - } if (test_opt(sb, DAX)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); From 66ff14e59e8a30690755b08bc3042359703fb07a Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Wed, 6 May 2020 01:34:21 +0800 Subject: [PATCH 040/427] PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges 7d715a6c1ae5 ("PCI: add PCI Express ASPM support") added the ability for Linux to enable ASPM, but for some undocumented reason, it didn't enable ASPM on links where the downstream component is a PCIe-to-PCI/PCI-X Bridge. Remove this exclusion so we can enable ASPM on these links. The Dell OptiPlex 7080 mentioned in the bugzilla has a TI XIO2001 PCIe-to-PCI Bridge. Enabling ASPM on the link leading to it allows the Intel SoC to enter deeper Package C-states, which is a significant power savings. 
[bhelgaas: commit log] Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=207571 Link: https://lore.kernel.org/r/20200505173423.26968-1-kai.heng.feng@canonical.com Signed-off-by: Kai-Heng Feng Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- drivers/pci/pcie/aspm.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 2378ed692534..b17e5ffd31b1 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -628,16 +628,6 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) /* Setup initial capable state. Will be updated later */ link->aspm_capable = link->aspm_support; - /* - * If the downstream component has pci bridge function, don't - * do ASPM for now. - */ - list_for_each_entry(child, &linkbus->devices, bus_list) { - if (pci_pcie_type(child) == PCI_EXP_TYPE_PCI_BRIDGE) { - link->aspm_disable = ASPM_STATE_ALL; - break; - } - } /* Get and check endpoint acceptable latencies */ list_for_each_entry(child, &linkbus->devices, bus_list) { From 1129d31b55d509f15e72dc68e4b5c3a4d7b4da8d Mon Sep 17 00:00:00 2001 From: Krzysztof Struczynski Date: Tue, 28 Apr 2020 09:30:10 +0200 Subject: [PATCH 041/427] ima: Fix ima digest hash table key calculation Function hash_long() accepts unsigned long, while currently only one byte is passed from ima_hash_key(), which calculates a key for ima_htable. Given that hashing the digest does not give clear benefits compared to using the digest itself, remove hash_long() and return the modulus calculated on the first two bytes of the digest with the number of slots. Also reduce the depth of the hash table by doubling the number of slots. 
Cc: stable@vger.kernel.org Fixes: 3323eec921ef ("integrity: IMA as an integrity service provider") Co-developed-by: Roberto Sassu Signed-off-by: Roberto Sassu Signed-off-by: Krzysztof Struczynski Acked-by: David.Laight@aculab.com (big endian system concerns) Signed-off-by: Mimi Zohar --- security/integrity/ima/ima.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 467dfdbea25c..02796473238b 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -36,7 +36,7 @@ enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8 }; #define IMA_DIGEST_SIZE SHA1_DIGEST_SIZE #define IMA_EVENT_NAME_LEN_MAX 255 -#define IMA_HASH_BITS 9 +#define IMA_HASH_BITS 10 #define IMA_MEASURE_HTABLE_SIZE (1 << IMA_HASH_BITS) #define IMA_TEMPLATE_FIELD_ID_MAX_LEN 16 @@ -179,9 +179,10 @@ struct ima_h_table { }; extern struct ima_h_table ima_htable; -static inline unsigned long ima_hash_key(u8 *digest) +static inline unsigned int ima_hash_key(u8 *digest) { - return hash_long(*digest, IMA_HASH_BITS); + /* there is no point in taking a hash of part of a digest */ + return (digest[0] | digest[1] << 8) % IMA_MEASURE_HTABLE_SIZE; } #define __ima_hooks(hook) \ From 6ee28442a465ab4c4be45e3b15015af24b1ba906 Mon Sep 17 00:00:00 2001 From: Krzysztof Struczynski Date: Mon, 27 Apr 2020 12:28:58 +0200 Subject: [PATCH 042/427] ima: Remove redundant policy rule set in add_rules() Function ima_appraise_flag() returns the flag to be set in temp_ima_appraise depending on the hook identifier passed as an argument. It is not necessary to set the flag again for the POLICY_CHECK hook. 
Signed-off-by: Krzysztof Struczynski Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_policy.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index c334e0dc6083..ea9b991f0232 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -643,11 +643,8 @@ static void add_rules(struct ima_rule_entry *entries, int count, list_add_tail(&entry->list, &ima_policy_rules); } - if (entries[i].action == APPRAISE) { + if (entries[i].action == APPRAISE) temp_ima_appraise |= ima_appraise_flag(entries[i].func); - if (entries[i].func == POLICY_CHECK) - temp_ima_appraise |= IMA_APPRAISE_POLICY; - } } } From b59fda449cf07f2db3be3a67142e6c000f5e8d79 Mon Sep 17 00:00:00 2001 From: Krzysztof Struczynski Date: Mon, 27 Apr 2020 12:28:59 +0200 Subject: [PATCH 043/427] ima: Set again build_ima_appraise variable After adding the new add_rule() function in commit c52657d93b05 ("ima: refactor ima_init_policy()"), all appraisal flags are added to the temp_ima_appraise variable. Revert to the previous behavior instead of removing build_ima_appraise, to benefit from the protection offered by __ro_after_init. The mentioned commit introduced a bug, as it makes all the flags modifiable, while build_ima_appraise flags can be protected with __ro_after_init. 
Cc: stable@vger.kernel.org # 5.0.x Fixes: c52657d93b05 ("ima: refactor ima_init_policy()") Co-developed-by: Roberto Sassu Signed-off-by: Roberto Sassu Signed-off-by: Krzysztof Struczynski Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_policy.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index ea9b991f0232..ef7f68cc935e 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -643,8 +643,14 @@ static void add_rules(struct ima_rule_entry *entries, int count, list_add_tail(&entry->list, &ima_policy_rules); } - if (entries[i].action == APPRAISE) - temp_ima_appraise |= ima_appraise_flag(entries[i].func); + if (entries[i].action == APPRAISE) { + if (entries != build_appraise_rules) + temp_ima_appraise |= + ima_appraise_flag(entries[i].func); + else + build_ima_appraise |= + ima_appraise_flag(entries[i].func); + } } } From 0c4395fb2aa77341269ea619c5419ea48171883f Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 14 Apr 2020 10:01:31 +0200 Subject: [PATCH 044/427] evm: Fix possible memory leak in evm_calc_hmac_or_hash() Don't immediately return if the signature is portable and security.ima is not present. Just set error so that memory allocated is freed before returning from evm_calc_hmac_or_hash(). 
Fixes: 50b977481fce9 ("EVM: Add support for portable signature format") Signed-off-by: Roberto Sassu Cc: stable@vger.kernel.org Signed-off-by: Mimi Zohar --- security/integrity/evm/evm_crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index 35682852ddea..499ea01b2edc 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -241,7 +241,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry, /* Portable EVM signatures must include an IMA hash */ if (type == EVM_XATTR_PORTABLE_DIGSIG && !ima_present) - return -EPERM; + error = -EPERM; out: kfree(xattr_value); kfree(desc); From 62a7f3009a460001eb46984395280dd900bc4ef4 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 8 May 2020 14:53:40 +0800 Subject: [PATCH 045/427] serial: 8250_pci: Move Pericom IDs to pci_ids.h Move the IDs to pci_ids.h so it can be used by next patch. Link: https://lore.kernel.org/r/20200508065343.32751-1-kai.heng.feng@canonical.com Signed-off-by: Kai-Heng Feng Signed-off-by: Bjorn Helgaas Acked-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org --- drivers/tty/serial/8250/8250_pci.c | 6 ------ include/linux/pci_ids.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index 0804469ff052..1a74d511b02a 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -1869,12 +1869,6 @@ pci_moxa_setup(struct serial_private *priv, #define PCIE_DEVICE_ID_WCH_CH384_4S 0x3470 #define PCIE_DEVICE_ID_WCH_CH382_2S 0x3253 -#define PCI_VENDOR_ID_PERICOM 0x12D8 -#define PCI_DEVICE_ID_PERICOM_PI7C9X7951 0x7951 -#define PCI_DEVICE_ID_PERICOM_PI7C9X7952 0x7952 -#define PCI_DEVICE_ID_PERICOM_PI7C9X7954 0x7954 -#define PCI_DEVICE_ID_PERICOM_PI7C9X7958 0x7958 - #define PCI_VENDOR_ID_ACCESIO 0x494f #define PCI_DEVICE_ID_ACCESIO_PCIE_COM_2SDB 0x1051 #define 
PCI_DEVICE_ID_ACCESIO_MPCIE_COM_2S 0x1053 diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 1dfc4e1dcb94..9a57e6717e5c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1832,6 +1832,12 @@ #define PCI_VENDOR_ID_NVIDIA_SGS 0x12d2 #define PCI_DEVICE_ID_NVIDIA_SGS_RIVA128 0x0018 +#define PCI_VENDOR_ID_PERICOM 0x12D8 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7951 0x7951 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7952 0x7952 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7954 0x7954 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7958 0x7958 + #define PCI_SUBVENDOR_ID_CHASE_PCIFAST 0x12E0 #define PCI_SUBDEVICE_ID_CHASE_PCIFAST4 0x0031 #define PCI_SUBDEVICE_ID_CHASE_PCIFAST8 0x0021 From 68f5fc4ea9ddf9f77720d568144219c4e6452cde Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 8 May 2020 14:53:41 +0800 Subject: [PATCH 046/427] PCI: Avoid Pericom USB controller OHCI/EHCI PME# defect Both Pericom OHCI and EHCI devices advertise PME# support from all power states: 06:00.0 USB controller [0c03]: Pericom Semiconductor PI7C9X442SL USB OHCI Controller [12d8:400e] (rev 01) (prog-if 10 [OHCI]) Subsystem: Pericom Semiconductor PI7C9X442SL USB OHCI Controller [12d8:400e] Capabilities: [80] Power Management version 3 Flags: PMEClk- DSI- D1+ D2+ AuxCurrent=375mA PME(D0+,D1+,D2+,D3hot+,D3cold+) 06:00.2 USB controller [0c03]: Pericom Semiconductor PI7C9X442SL USB EHCI Controller [12d8:400f] (rev 01) (prog-if 20 [EHCI]) Subsystem: Pericom Semiconductor PI7C9X442SL USB EHCI Controller [12d8:400f] Capabilities: [80] Power Management version 3 Flags: PMEClk- DSI- D1+ D2+ AuxCurrent=375mA PME(D0+,D1+,D2+,D3hot+,D3cold+) But testing shows that it's unreliable: there is a 20% chance PME# won't be asserted when a USB device is plugged. Remove PME support for both devices to make USB plugging work reliably. 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=205981 Link: https://lore.kernel.org/r/20200508065343.32751-2-kai.heng.feng@canonical.com Signed-off-by: Kai-Heng Feng Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org --- drivers/pci/quirks.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 28c9a2409c50..7b4a98d0f0fd 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5567,3 +5567,16 @@ static void pci_fixup_no_d0_pme(struct pci_dev *dev) dev->pme_support &= ~(PCI_PM_CAP_PME_D0 >> PCI_PM_CAP_PME_SHIFT); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ASMEDIA, 0x2142, pci_fixup_no_d0_pme); + +/* + * Device [12d8:0x400e] and [12d8:0x400f] + * These devices advertise PME# support in all power states but don't + * reliably assert it. + */ +static void pci_fixup_no_pme(struct pci_dev *dev) +{ + pci_info(dev, "PME# is unreliable, disabling it\n"); + dev->pme_support = 0; +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_PERICOM, 0x400e, pci_fixup_no_pme); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_PERICOM, 0x400f, pci_fixup_no_pme); From 63605f1cfcc56bcb25c48bbee75a679d85ba7675 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 14 Apr 2020 12:25:12 +0200 Subject: [PATCH 047/427] PCI: tegra: Fix reporting GPIO error value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Error code is stored in rp->reset_gpio and not in err variable. 
Link: https://lore.kernel.org/r/20200414102512.27506-1-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Thierry Reding Acked-by: Rob Herring --- drivers/pci/controller/pci-tegra.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c index 3e64ba6a36a8..e3e917243e10 100644 --- a/drivers/pci/controller/pci-tegra.c +++ b/drivers/pci/controller/pci-tegra.c @@ -2219,8 +2219,8 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) if (PTR_ERR(rp->reset_gpio) == -ENOENT) { rp->reset_gpio = NULL; } else { - dev_err(dev, "failed to get reset GPIO: %d\n", - err); + dev_err(dev, "failed to get reset GPIO: %ld\n", + PTR_ERR(rp->reset_gpio)); return PTR_ERR(rp->reset_gpio); } } From a18f4b6ea50b81e28bd05381883a531ab345f753 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 7 May 2020 13:33:12 +0100 Subject: [PATCH 048/427] PCI: rcar: Rename pcie-rcar.c to pcie-rcar-host.c This commit renames pcie-rcar.c to pcie-rcar-host.c in preparation for adding support for endpoint mode. CONFIG_PCIE_RCAR is kept so that arm64 defconfig change can be a separate patch. With this patch both config options PCIE_RCAR and PCIE_RCAR_HOST will be available but PCIE_RCAR internally selects PCIE_RCAR_HOST so that bisect builds won't be affected.
Link: https://lore.kernel.org/r/1588854799-13710-2-git-send-email-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Lad Prabhakar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Yoshihiro Shimoda --- drivers/pci/controller/Kconfig | 10 ++++++++++ drivers/pci/controller/Makefile | 2 +- .../pci/controller/{pcie-rcar.c => pcie-rcar-host.c} | 0 3 files changed, 11 insertions(+), 1 deletion(-) rename drivers/pci/controller/{pcie-rcar.c => pcie-rcar-host.c} (100%) diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 91bfdb784829..32dcab3c103f 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -58,8 +58,18 @@ config PCIE_RCAR bool "Renesas R-Car PCIe controller" depends on ARCH_RENESAS || COMPILE_TEST depends on PCI_MSI_IRQ_DOMAIN + select PCIE_RCAR_HOST help Say Y here if you want PCIe controller support on R-Car SoCs. + This option will be removed after arm64 defconfig is updated. + +config PCIE_RCAR_HOST + bool "Renesas R-Car PCIe host controller" + depends on ARCH_RENESAS || COMPILE_TEST + depends on PCI_MSI_IRQ_DOMAIN + help + Say Y here if you want PCIe controller support on R-Car SoCs in host + mode. 
config PCI_HOST_COMMON bool diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index 158c59771824..9dbccb5b24e1 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o obj-$(CONFIG_PCI_RCAR_GEN2) += pci-rcar-gen2.o -obj-$(CONFIG_PCIE_RCAR) += pcie-rcar.o +obj-$(CONFIG_PCIE_RCAR_HOST) += pcie-rcar-host.o obj-$(CONFIG_PCI_HOST_COMMON) += pci-host-common.o obj-$(CONFIG_PCI_HOST_GENERIC) += pci-host-generic.o obj-$(CONFIG_PCIE_XILINX) += pcie-xilinx.o diff --git a/drivers/pci/controller/pcie-rcar.c b/drivers/pci/controller/pcie-rcar-host.c similarity index 100% rename from drivers/pci/controller/pcie-rcar.c rename to drivers/pci/controller/pcie-rcar-host.c From 78a0d7f2f5a31357bce68012d886507b4cf33598 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 7 May 2020 13:33:13 +0100 Subject: [PATCH 049/427] PCI: rcar: Move shareable code to a common file Move shareable code to common file pcie-rcar.c and the #defines to pcie-rcar.h so that the common code can be reused with endpoint driver. There are no functional changes with this patch for the host controller driver. 
Link: https://lore.kernel.org/r/1588854799-13710-3-git-send-email-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Lad Prabhakar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Yoshihiro Shimoda --- drivers/pci/controller/Makefile | 2 +- drivers/pci/controller/pcie-rcar-host.c | 404 +++++++----------------- drivers/pci/controller/pcie-rcar.c | 117 +++++++ drivers/pci/controller/pcie-rcar.h | 131 ++++++++ 4 files changed, 361 insertions(+), 293 deletions(-) create mode 100644 drivers/pci/controller/pcie-rcar.c create mode 100644 drivers/pci/controller/pcie-rcar.h diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index 9dbccb5b24e1..39802ee32946 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o obj-$(CONFIG_PCI_RCAR_GEN2) += pci-rcar-gen2.o -obj-$(CONFIG_PCIE_RCAR_HOST) += pcie-rcar-host.o +obj-$(CONFIG_PCIE_RCAR_HOST) += pcie-rcar.o pcie-rcar-host.o obj-$(CONFIG_PCI_HOST_COMMON) += pci-host-common.o obj-$(CONFIG_PCI_HOST_GENERIC) += pci-host-generic.o obj-$(CONFIG_PCIE_XILINX) += pcie-xilinx.o diff --git a/drivers/pci/controller/pcie-rcar-host.c b/drivers/pci/controller/pcie-rcar-host.c index 59e55f56e386..d210a36561be 100644 --- a/drivers/pci/controller/pcie-rcar-host.c +++ b/drivers/pci/controller/pcie-rcar-host.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * PCIe driver for Renesas R-Car SoCs - * Copyright (C) 2014 Renesas Electronics Europe Ltd + * Copyright (C) 2014-2020 Renesas Electronics Europe Ltd * * Based on: * arch/sh/drivers/pci/pcie-sh7786.c @@ -30,104 +30,7 @@ #include #include -#define PCIECAR 0x000010 -#define PCIECCTLR 0x000018 -#define CONFIG_SEND_ENABLE BIT(31) -#define TYPE0 (0 << 8) -#define TYPE1 BIT(8) -#define PCIECDR 0x000020 -#define PCIEMSR 0x000028 -#define PCIEINTXR 0x000400 -#define PCIEPHYSR 0x0007f0 -#define PHYRDY BIT(0) 
-#define PCIEMSITXR 0x000840 - -/* Transfer control */ -#define PCIETCTLR 0x02000 -#define DL_DOWN BIT(3) -#define CFINIT BIT(0) -#define PCIETSTR 0x02004 -#define DATA_LINK_ACTIVE BIT(0) -#define PCIEERRFR 0x02020 -#define UNSUPPORTED_REQUEST BIT(4) -#define PCIEMSIFR 0x02044 -#define PCIEMSIALR 0x02048 -#define MSIFE BIT(0) -#define PCIEMSIAUR 0x0204c -#define PCIEMSIIER 0x02050 - -/* root port address */ -#define PCIEPRAR(x) (0x02080 + ((x) * 0x4)) - -/* local address reg & mask */ -#define PCIELAR(x) (0x02200 + ((x) * 0x20)) -#define PCIELAMR(x) (0x02208 + ((x) * 0x20)) -#define LAM_PREFETCH BIT(3) -#define LAM_64BIT BIT(2) -#define LAR_ENABLE BIT(1) - -/* PCIe address reg & mask */ -#define PCIEPALR(x) (0x03400 + ((x) * 0x20)) -#define PCIEPAUR(x) (0x03404 + ((x) * 0x20)) -#define PCIEPAMR(x) (0x03408 + ((x) * 0x20)) -#define PCIEPTCTLR(x) (0x0340c + ((x) * 0x20)) -#define PAR_ENABLE BIT(31) -#define IO_SPACE BIT(8) - -/* Configuration */ -#define PCICONF(x) (0x010000 + ((x) * 0x4)) -#define PMCAP(x) (0x010040 + ((x) * 0x4)) -#define EXPCAP(x) (0x010070 + ((x) * 0x4)) -#define VCCAP(x) (0x010100 + ((x) * 0x4)) - -/* link layer */ -#define IDSETR1 0x011004 -#define TLCTLR 0x011048 -#define MACSR 0x011054 -#define SPCHGFIN BIT(4) -#define SPCHGFAIL BIT(6) -#define SPCHGSUC BIT(7) -#define LINK_SPEED (0xf << 16) -#define LINK_SPEED_2_5GTS (1 << 16) -#define LINK_SPEED_5_0GTS (2 << 16) -#define MACCTLR 0x011058 -#define MACCTLR_NFTS_MASK GENMASK(23, 16) /* The name is from SH7786 */ -#define SPEED_CHANGE BIT(24) -#define SCRAMBLE_DISABLE BIT(27) -#define LTSMDIS BIT(31) -#define MACCTLR_INIT_VAL (LTSMDIS | MACCTLR_NFTS_MASK) -#define PMSR 0x01105c -#define MACS2R 0x011078 -#define MACCGSPSETR 0x011084 -#define SPCNGRSN BIT(31) - -/* R-Car H1 PHY */ -#define H1_PCIEPHYADRR 0x04000c -#define WRITE_CMD BIT(16) -#define PHY_ACK BIT(24) -#define RATE_POS 12 -#define LANE_POS 8 -#define ADR_POS 0 -#define H1_PCIEPHYDOUTR 0x040014 - -/* R-Car Gen2 PHY */ -#define 
GEN2_PCIEPHYADDR 0x780 -#define GEN2_PCIEPHYDATA 0x784 -#define GEN2_PCIEPHYCTRL 0x78c - -#define INT_PCI_MSI_NR 32 - -#define RCONF(x) (PCICONF(0) + (x)) -#define RPMCAP(x) (PMCAP(0) + (x)) -#define REXPCAP(x) (EXPCAP(0) + (x)) -#define RVCCAP(x) (VCCAP(0) + (x)) - -#define PCIE_CONF_BUS(b) (((b) & 0xff) << 24) -#define PCIE_CONF_DEV(d) (((d) & 0x1f) << 19) -#define PCIE_CONF_FUNC(f) (((f) & 0x7) << 16) - -#define RCAR_PCI_MAX_RESOURCES 4 -#define MAX_NR_INBOUND_MAPS 6 +#include "pcie-rcar.h" struct rcar_msi { DECLARE_BITMAP(used, INT_PCI_MSI_NR); @@ -145,7 +48,8 @@ static inline struct rcar_msi *to_rcar_msi(struct msi_controller *chip) } /* Structure representing the PCIe interface */ -struct rcar_pcie { +struct rcar_pcie_host { + struct rcar_pcie pcie; struct device *dev; struct phy *phy; void __iomem *base; @@ -153,35 +57,9 @@ struct rcar_pcie { int root_bus_nr; struct clk *bus_clk; struct rcar_msi msi; - int (*phy_init_fn)(struct rcar_pcie *pcie); + int (*phy_init_fn)(struct rcar_pcie_host *host); }; -static void rcar_pci_write_reg(struct rcar_pcie *pcie, u32 val, - unsigned int reg) -{ - writel(val, pcie->base + reg); -} - -static u32 rcar_pci_read_reg(struct rcar_pcie *pcie, unsigned int reg) -{ - return readl(pcie->base + reg); -} - -enum { - RCAR_PCI_ACCESS_READ, - RCAR_PCI_ACCESS_WRITE, -}; - -static void rcar_rmw32(struct rcar_pcie *pcie, int where, u32 mask, u32 data) -{ - unsigned int shift = BITS_PER_BYTE * (where & 3); - u32 val = rcar_pci_read_reg(pcie, where & ~3); - - val &= ~(mask << shift); - val |= data << shift; - rcar_pci_write_reg(pcie, val, where & ~3); -} - static u32 rcar_read_conf(struct rcar_pcie *pcie, int where) { unsigned int shift = BITS_PER_BYTE * (where & 3); @@ -191,10 +69,11 @@ static u32 rcar_read_conf(struct rcar_pcie *pcie, int where) } /* Serialization is provided by 'pci_lock' in drivers/pci/access.c */ -static int rcar_pcie_config_access(struct rcar_pcie *pcie, +static int rcar_pcie_config_access(struct rcar_pcie_host 
*host, unsigned char access_type, struct pci_bus *bus, unsigned int devfn, int where, u32 *data) { + struct rcar_pcie *pcie = &host->pcie; unsigned int dev, func, reg, index; dev = PCI_SLOT(devfn); @@ -226,7 +105,7 @@ static int rcar_pcie_config_access(struct rcar_pcie *pcie, } else { /* Keep an eye out for changes to the root bus number */ if (pci_is_root_bus(bus) && (reg == PCI_PRIMARY_BUS)) - pcie->root_bus_nr = *data & 0xff; + host->root_bus_nr = *data & 0xff; rcar_pci_write_reg(pcie, *data, PCICONF(index)); } @@ -234,7 +113,7 @@ static int rcar_pcie_config_access(struct rcar_pcie *pcie, return PCIBIOS_SUCCESSFUL; } - if (pcie->root_bus_nr < 0) + if (host->root_bus_nr < 0) return PCIBIOS_DEVICE_NOT_FOUND; /* Clear errors */ @@ -245,7 +124,7 @@ static int rcar_pcie_config_access(struct rcar_pcie *pcie, PCIE_CONF_DEV(dev) | PCIE_CONF_FUNC(func) | reg, PCIECAR); /* Enable the configuration access */ - if (bus->parent->number == pcie->root_bus_nr) + if (bus->parent->number == host->root_bus_nr) rcar_pci_write_reg(pcie, CONFIG_SEND_ENABLE | TYPE0, PCIECCTLR); else rcar_pci_write_reg(pcie, CONFIG_SEND_ENABLE | TYPE1, PCIECCTLR); @@ -273,10 +152,10 @@ static int rcar_pcie_config_access(struct rcar_pcie *pcie, static int rcar_pcie_read_conf(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { - struct rcar_pcie *pcie = bus->sysdata; + struct rcar_pcie_host *host = bus->sysdata; int ret; - ret = rcar_pcie_config_access(pcie, RCAR_PCI_ACCESS_READ, + ret = rcar_pcie_config_access(host, RCAR_PCI_ACCESS_READ, bus, devfn, where, val); if (ret != PCIBIOS_SUCCESSFUL) { *val = 0xffffffff; @@ -298,12 +177,12 @@ static int rcar_pcie_read_conf(struct pci_bus *bus, unsigned int devfn, static int rcar_pcie_write_conf(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { - struct rcar_pcie *pcie = bus->sysdata; + struct rcar_pcie_host *host = bus->sysdata; unsigned int shift; u32 data; int ret; - ret = rcar_pcie_config_access(pcie, 
RCAR_PCI_ACCESS_READ, + ret = rcar_pcie_config_access(host, RCAR_PCI_ACCESS_READ, bus, devfn, where, &data); if (ret != PCIBIOS_SUCCESSFUL) return ret; @@ -322,7 +201,7 @@ static int rcar_pcie_write_conf(struct pci_bus *bus, unsigned int devfn, } else data = val; - ret = rcar_pcie_config_access(pcie, RCAR_PCI_ACCESS_WRITE, + ret = rcar_pcie_config_access(host, RCAR_PCI_ACCESS_WRITE, bus, devfn, where, &data); return ret; @@ -333,49 +212,14 @@ static struct pci_ops rcar_pcie_ops = { .write = rcar_pcie_write_conf, }; -static void rcar_pcie_setup_window(int win, struct rcar_pcie *pcie, - struct resource_entry *window) -{ - /* Setup PCIe address space mappings for each resource */ - resource_size_t size; - resource_size_t res_start; - struct resource *res = window->res; - u32 mask; - - rcar_pci_write_reg(pcie, 0x00000000, PCIEPTCTLR(win)); - - /* - * The PAMR mask is calculated in units of 128Bytes, which - * keeps things pretty simple. - */ - size = resource_size(res); - mask = (roundup_pow_of_two(size) / SZ_128) - 1; - rcar_pci_write_reg(pcie, mask << 7, PCIEPAMR(win)); - - if (res->flags & IORESOURCE_IO) - res_start = pci_pio_to_address(res->start) - window->offset; - else - res_start = res->start - window->offset; - - rcar_pci_write_reg(pcie, upper_32_bits(res_start), PCIEPAUR(win)); - rcar_pci_write_reg(pcie, lower_32_bits(res_start) & ~0x7F, - PCIEPALR(win)); - - /* First resource is for IO */ - mask = PAR_ENABLE; - if (res->flags & IORESOURCE_IO) - mask |= IO_SPACE; - - rcar_pci_write_reg(pcie, mask, PCIEPTCTLR(win)); -} - -static int rcar_pcie_setup(struct list_head *resource, struct rcar_pcie *pci) +static int rcar_pcie_setup(struct list_head *resource, + struct rcar_pcie_host *host) { struct resource_entry *win; int i = 0; /* Setup PCI resources */ - resource_list_for_each_entry(win, &pci->resources) { + resource_list_for_each_entry(win, &host->resources) { struct resource *res = win->res; if (!res->flags) @@ -384,11 +228,11 @@ static int 
rcar_pcie_setup(struct list_head *resource, struct rcar_pcie *pci) switch (resource_type(res)) { case IORESOURCE_IO: case IORESOURCE_MEM: - rcar_pcie_setup_window(i, pci, win); + rcar_pcie_set_outbound(&host->pcie, i, win); i++; break; case IORESOURCE_BUS: - pci->root_bus_nr = res->start; + host->root_bus_nr = res->start; break; default: continue; @@ -454,17 +298,18 @@ done: (macsr & LINK_SPEED) == LINK_SPEED_5_0GTS ? "5" : "2.5"); } -static void rcar_pcie_hw_enable(struct rcar_pcie *pci) +static void rcar_pcie_hw_enable(struct rcar_pcie_host *host) { + struct rcar_pcie *pcie = &host->pcie; struct resource_entry *win; LIST_HEAD(res); int i = 0; /* Try setting 5 GT/s link speed */ - rcar_pcie_force_speedup(pci); + rcar_pcie_force_speedup(pcie); /* Setup PCI resources */ - resource_list_for_each_entry(win, &pci->resources) { + resource_list_for_each_entry(win, &host->resources) { struct resource *res = win->res; if (!res->flags) @@ -473,35 +318,36 @@ static void rcar_pcie_hw_enable(struct rcar_pcie *pci) switch (resource_type(res)) { case IORESOURCE_IO: case IORESOURCE_MEM: - rcar_pcie_setup_window(i, pci, win); + rcar_pcie_set_outbound(pcie, i, win); i++; break; } } } -static int rcar_pcie_enable(struct rcar_pcie *pcie) +static int rcar_pcie_enable(struct rcar_pcie_host *host) { + struct pci_host_bridge *bridge = pci_host_bridge_from_priv(host); + struct rcar_pcie *pcie = &host->pcie; struct device *dev = pcie->dev; - struct pci_host_bridge *bridge = pci_host_bridge_from_priv(pcie); struct pci_bus *bus, *child; int ret; /* Try setting 5 GT/s link speed */ rcar_pcie_force_speedup(pcie); - rcar_pcie_setup(&bridge->windows, pcie); + rcar_pcie_setup(&bridge->windows, host); pci_add_flags(PCI_REASSIGN_ALL_BUS); bridge->dev.parent = dev; - bridge->sysdata = pcie; - bridge->busnr = pcie->root_bus_nr; + bridge->sysdata = host; + bridge->busnr = host->root_bus_nr; bridge->ops = &rcar_pcie_ops; bridge->map_irq = of_irq_parse_and_map_pci; bridge->swizzle_irq = 
pci_common_swizzle; if (IS_ENABLED(CONFIG_PCI_MSI)) - bridge->msi = &pcie->msi.chip; + bridge->msi = &host->msi.chip; ret = pci_scan_root_bus_bridge(bridge); if (ret < 0) @@ -563,35 +409,6 @@ static void phy_write_reg(struct rcar_pcie *pcie, phy_wait_for_ack(pcie); } -static int rcar_pcie_wait_for_phyrdy(struct rcar_pcie *pcie) -{ - unsigned int timeout = 10; - - while (timeout--) { - if (rcar_pci_read_reg(pcie, PCIEPHYSR) & PHYRDY) - return 0; - - msleep(5); - } - - return -ETIMEDOUT; -} - -static int rcar_pcie_wait_for_dl(struct rcar_pcie *pcie) -{ - unsigned int timeout = 10000; - - while (timeout--) { - if ((rcar_pci_read_reg(pcie, PCIETSTR) & DATA_LINK_ACTIVE)) - return 0; - - udelay(5); - cpu_relax(); - } - - return -ETIMEDOUT; -} - static int rcar_pcie_hw_init(struct rcar_pcie *pcie) { int err; @@ -662,8 +479,10 @@ static int rcar_pcie_hw_init(struct rcar_pcie *pcie) return 0; } -static int rcar_pcie_phy_init_h1(struct rcar_pcie *pcie) +static int rcar_pcie_phy_init_h1(struct rcar_pcie_host *host) { + struct rcar_pcie *pcie = &host->pcie; + /* Initialize the phy */ phy_write_reg(pcie, 0, 0x42, 0x1, 0x0EC34191); phy_write_reg(pcie, 1, 0x42, 0x1, 0x0EC34180); @@ -685,8 +504,10 @@ static int rcar_pcie_phy_init_h1(struct rcar_pcie *pcie) return 0; } -static int rcar_pcie_phy_init_gen2(struct rcar_pcie *pcie) +static int rcar_pcie_phy_init_gen2(struct rcar_pcie_host *host) { + struct rcar_pcie *pcie = &host->pcie; + /* * These settings come from the R-Car Series, 2nd Generation User's * Manual, section 50.3.1 (2) Initialization of the physical layer. 
@@ -705,17 +526,17 @@ static int rcar_pcie_phy_init_gen2(struct rcar_pcie *pcie) return 0; } -static int rcar_pcie_phy_init_gen3(struct rcar_pcie *pcie) +static int rcar_pcie_phy_init_gen3(struct rcar_pcie_host *host) { int err; - err = phy_init(pcie->phy); + err = phy_init(host->phy); if (err) return err; - err = phy_power_on(pcie->phy); + err = phy_power_on(host->phy); if (err) - phy_exit(pcie->phy); + phy_exit(host->phy); return err; } @@ -758,8 +579,9 @@ static void rcar_msi_free(struct rcar_msi *chip, unsigned long irq) static irqreturn_t rcar_pcie_msi_irq(int irq, void *data) { - struct rcar_pcie *pcie = data; - struct rcar_msi *msi = &pcie->msi; + struct rcar_pcie_host *host = data; + struct rcar_pcie *pcie = &host->pcie; + struct rcar_msi *msi = &host->msi; struct device *dev = pcie->dev; unsigned long reg; @@ -798,7 +620,9 @@ static int rcar_msi_setup_irq(struct msi_controller *chip, struct pci_dev *pdev, struct msi_desc *desc) { struct rcar_msi *msi = to_rcar_msi(chip); - struct rcar_pcie *pcie = container_of(chip, struct rcar_pcie, msi.chip); + struct rcar_pcie_host *host = container_of(chip, struct rcar_pcie_host, + msi.chip); + struct rcar_pcie *pcie = &host->pcie; struct msi_msg msg; unsigned int irq; int hwirq; @@ -827,8 +651,10 @@ static int rcar_msi_setup_irq(struct msi_controller *chip, struct pci_dev *pdev, static int rcar_msi_setup_irqs(struct msi_controller *chip, struct pci_dev *pdev, int nvec, int type) { - struct rcar_pcie *pcie = container_of(chip, struct rcar_pcie, msi.chip); struct rcar_msi *msi = to_rcar_msi(chip); + struct rcar_pcie_host *host = container_of(chip, struct rcar_pcie_host, + msi.chip); + struct rcar_pcie *pcie = &host->pcie; struct msi_desc *desc; struct msi_msg msg; unsigned int irq; @@ -905,9 +731,9 @@ static const struct irq_domain_ops msi_domain_ops = { .map = rcar_msi_map, }; -static void rcar_pcie_unmap_msi(struct rcar_pcie *pcie) +static void rcar_pcie_unmap_msi(struct rcar_pcie_host *host) { - struct rcar_msi *msi 
= &pcie->msi; + struct rcar_msi *msi = &host->msi; int i, irq; for (i = 0; i < INT_PCI_MSI_NR; i++) { @@ -919,9 +745,10 @@ static void rcar_pcie_unmap_msi(struct rcar_pcie *pcie) irq_domain_remove(msi->domain); } -static void rcar_pcie_hw_enable_msi(struct rcar_pcie *pcie) +static void rcar_pcie_hw_enable_msi(struct rcar_pcie_host *host) { - struct rcar_msi *msi = &pcie->msi; + struct rcar_pcie *pcie = &host->pcie; + struct rcar_msi *msi = &host->msi; unsigned long base; /* setup MSI data target */ @@ -934,10 +761,11 @@ static void rcar_pcie_hw_enable_msi(struct rcar_pcie *pcie) rcar_pci_write_reg(pcie, 0xffffffff, PCIEMSIIER); } -static int rcar_pcie_enable_msi(struct rcar_pcie *pcie) +static int rcar_pcie_enable_msi(struct rcar_pcie_host *host) { + struct rcar_pcie *pcie = &host->pcie; struct device *dev = pcie->dev; - struct rcar_msi *msi = &pcie->msi; + struct rcar_msi *msi = &host->msi; int err, i; mutex_init(&msi->lock); @@ -960,7 +788,7 @@ static int rcar_pcie_enable_msi(struct rcar_pcie *pcie) /* Two irqs are for MSI, but they are also used for non-MSI irqs */ err = devm_request_irq(dev, msi->irq1, rcar_pcie_msi_irq, IRQF_SHARED | IRQF_NO_THREAD, - rcar_msi_irq_chip.name, pcie); + rcar_msi_irq_chip.name, host); if (err < 0) { dev_err(dev, "failed to request IRQ: %d\n", err); goto err; @@ -968,7 +796,7 @@ static int rcar_pcie_enable_msi(struct rcar_pcie *pcie) err = devm_request_irq(dev, msi->irq2, rcar_pcie_msi_irq, IRQF_SHARED | IRQF_NO_THREAD, - rcar_msi_irq_chip.name, pcie); + rcar_msi_irq_chip.name, host); if (err < 0) { dev_err(dev, "failed to request IRQ: %d\n", err); goto err; @@ -976,18 +804,19 @@ static int rcar_pcie_enable_msi(struct rcar_pcie *pcie) /* setup MSI data target */ msi->pages = __get_free_pages(GFP_KERNEL, 0); - rcar_pcie_hw_enable_msi(pcie); + rcar_pcie_hw_enable_msi(host); return 0; err: - rcar_pcie_unmap_msi(pcie); + rcar_pcie_unmap_msi(host); return err; } -static void rcar_pcie_teardown_msi(struct rcar_pcie *pcie) +static void 
rcar_pcie_teardown_msi(struct rcar_pcie_host *host) { - struct rcar_msi *msi = &pcie->msi; + struct rcar_pcie *pcie = &host->pcie; + struct rcar_msi *msi = &host->msi; /* Disable all MSI interrupts */ rcar_pci_write_reg(pcie, 0, PCIEMSIIER); @@ -997,18 +826,19 @@ static void rcar_pcie_teardown_msi(struct rcar_pcie *pcie) free_pages(msi->pages, 0); - rcar_pcie_unmap_msi(pcie); + rcar_pcie_unmap_msi(host); } -static int rcar_pcie_get_resources(struct rcar_pcie *pcie) +static int rcar_pcie_get_resources(struct rcar_pcie_host *host) { + struct rcar_pcie *pcie = &host->pcie; struct device *dev = pcie->dev; struct resource res; int err, i; - pcie->phy = devm_phy_optional_get(dev, "pcie"); - if (IS_ERR(pcie->phy)) - return PTR_ERR(pcie->phy); + host->phy = devm_phy_optional_get(dev, "pcie"); + if (IS_ERR(host->phy)) + return PTR_ERR(host->phy); err = of_address_to_resource(dev->of_node, 0, &res); if (err) @@ -1018,10 +848,10 @@ static int rcar_pcie_get_resources(struct rcar_pcie *pcie) if (IS_ERR(pcie->base)) return PTR_ERR(pcie->base); - pcie->bus_clk = devm_clk_get(dev, "pcie_bus"); - if (IS_ERR(pcie->bus_clk)) { + host->bus_clk = devm_clk_get(dev, "pcie_bus"); + if (IS_ERR(host->bus_clk)) { dev_err(dev, "cannot get pcie bus clock\n"); - return PTR_ERR(pcie->bus_clk); + return PTR_ERR(host->bus_clk); } i = irq_of_parse_and_map(dev->of_node, 0); @@ -1030,7 +860,7 @@ static int rcar_pcie_get_resources(struct rcar_pcie *pcie) err = -ENOENT; goto err_irq1; } - pcie->msi.irq1 = i; + host->msi.irq1 = i; i = irq_of_parse_and_map(dev->of_node, 1); if (!i) { @@ -1038,12 +868,12 @@ static int rcar_pcie_get_resources(struct rcar_pcie *pcie) err = -ENOENT; goto err_irq2; } - pcie->msi.irq2 = i; + host->msi.irq2 = i; return 0; err_irq2: - irq_dispose_mapping(pcie->msi.irq1); + irq_dispose_mapping(host->msi.irq1); err_irq1: return err; } @@ -1086,21 +916,8 @@ static int rcar_pcie_inbound_ranges(struct rcar_pcie *pcie, mask = roundup_pow_of_two(size) - 1; mask &= ~0xf; - /* - * Set up 
64-bit inbound regions as the range parser doesn't - * distinguish between 32 and 64-bit types. - */ - rcar_pci_write_reg(pcie, lower_32_bits(pci_addr), - PCIEPRAR(idx)); - rcar_pci_write_reg(pcie, lower_32_bits(cpu_addr), PCIELAR(idx)); - rcar_pci_write_reg(pcie, lower_32_bits(mask) | flags, - PCIELAMR(idx)); - - rcar_pci_write_reg(pcie, upper_32_bits(pci_addr), - PCIEPRAR(idx + 1)); - rcar_pci_write_reg(pcie, upper_32_bits(cpu_addr), - PCIELAR(idx + 1)); - rcar_pci_write_reg(pcie, 0, PCIELAMR(idx + 1)); + rcar_pcie_set_inbound(pcie, cpu_addr, pci_addr, + lower_32_bits(mask) | flags, idx, true); pci_addr += size; cpu_addr += size; @@ -1111,14 +928,14 @@ static int rcar_pcie_inbound_ranges(struct rcar_pcie *pcie, return 0; } -static int rcar_pcie_parse_map_dma_ranges(struct rcar_pcie *pcie) +static int rcar_pcie_parse_map_dma_ranges(struct rcar_pcie_host *host) { - struct pci_host_bridge *bridge = pci_host_bridge_from_priv(pcie); + struct pci_host_bridge *bridge = pci_host_bridge_from_priv(host); struct resource_entry *entry; int index = 0, err = 0; resource_list_for_each_entry(entry, &bridge->dma_ranges) { - err = rcar_pcie_inbound_ranges(pcie, entry, &index); + err = rcar_pcie_inbound_ranges(&host->pcie, entry, &index); if (err) break; } @@ -1145,21 +962,22 @@ static const struct of_device_id rcar_pcie_of_match[] = { static int rcar_pcie_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; + struct rcar_pcie_host *host; struct rcar_pcie *pcie; u32 data; int err; struct pci_host_bridge *bridge; - bridge = pci_alloc_host_bridge(sizeof(*pcie)); + bridge = pci_alloc_host_bridge(sizeof(*host)); if (!bridge) return -ENOMEM; - pcie = pci_host_bridge_priv(bridge); - + host = pci_host_bridge_priv(bridge); + pcie = &host->pcie; pcie->dev = dev; - platform_set_drvdata(pdev, pcie); + platform_set_drvdata(pdev, host); - err = pci_parse_request_of_pci_ranges(dev, &pcie->resources, + err = pci_parse_request_of_pci_ranges(dev, &host->resources, 
&bridge->dma_ranges, NULL); if (err) goto err_free_bridge; @@ -1171,24 +989,24 @@ static int rcar_pcie_probe(struct platform_device *pdev) goto err_pm_disable; } - err = rcar_pcie_get_resources(pcie); + err = rcar_pcie_get_resources(host); if (err < 0) { dev_err(dev, "failed to request resources: %d\n", err); goto err_pm_put; } - err = clk_prepare_enable(pcie->bus_clk); + err = clk_prepare_enable(host->bus_clk); if (err) { dev_err(dev, "failed to enable bus clock: %d\n", err); goto err_unmap_msi_irqs; } - err = rcar_pcie_parse_map_dma_ranges(pcie); + err = rcar_pcie_parse_map_dma_ranges(host); if (err) goto err_clk_disable; - pcie->phy_init_fn = of_device_get_match_data(dev); - err = pcie->phy_init_fn(pcie); + host->phy_init_fn = of_device_get_match_data(dev); + err = host->phy_init_fn(host); if (err) { dev_err(dev, "failed to init PCIe PHY\n"); goto err_clk_disable; @@ -1205,7 +1023,7 @@ static int rcar_pcie_probe(struct platform_device *pdev) dev_info(dev, "PCIe x%d: link up\n", (data >> 20) & 0x3f); if (IS_ENABLED(CONFIG_PCI_MSI)) { - err = rcar_pcie_enable_msi(pcie); + err = rcar_pcie_enable_msi(host); if (err < 0) { dev_err(dev, "failed to enable MSI support: %d\n", @@ -1214,7 +1032,7 @@ static int rcar_pcie_probe(struct platform_device *pdev) } } - err = rcar_pcie_enable(pcie); + err = rcar_pcie_enable(host); if (err) goto err_msi_teardown; @@ -1222,27 +1040,27 @@ static int rcar_pcie_probe(struct platform_device *pdev) err_msi_teardown: if (IS_ENABLED(CONFIG_PCI_MSI)) - rcar_pcie_teardown_msi(pcie); + rcar_pcie_teardown_msi(host); err_phy_shutdown: - if (pcie->phy) { - phy_power_off(pcie->phy); - phy_exit(pcie->phy); + if (host->phy) { + phy_power_off(host->phy); + phy_exit(host->phy); } err_clk_disable: - clk_disable_unprepare(pcie->bus_clk); + clk_disable_unprepare(host->bus_clk); err_unmap_msi_irqs: - irq_dispose_mapping(pcie->msi.irq2); - irq_dispose_mapping(pcie->msi.irq1); + irq_dispose_mapping(host->msi.irq2); + irq_dispose_mapping(host->msi.irq1); 
err_pm_put: pm_runtime_put(dev); err_pm_disable: pm_runtime_disable(dev); - pci_free_resource_list(&pcie->resources); + pci_free_resource_list(&host->resources); err_free_bridge: pci_free_host_bridge(bridge); @@ -1252,16 +1070,17 @@ err_free_bridge: static int __maybe_unused rcar_pcie_resume(struct device *dev) { - struct rcar_pcie *pcie = dev_get_drvdata(dev); + struct rcar_pcie_host *host = dev_get_drvdata(dev); + struct rcar_pcie *pcie = &host->pcie; unsigned int data; int err; - err = rcar_pcie_parse_map_dma_ranges(pcie); + err = rcar_pcie_parse_map_dma_ranges(host); if (err) return 0; /* Failure to get a link might just be that no cards are inserted */ - err = pcie->phy_init_fn(pcie); + err = host->phy_init_fn(host); if (err) { dev_info(dev, "PCIe link down\n"); return 0; @@ -1272,16 +1091,17 @@ static int __maybe_unused rcar_pcie_resume(struct device *dev) /* Enable MSI */ if (IS_ENABLED(CONFIG_PCI_MSI)) - rcar_pcie_hw_enable_msi(pcie); + rcar_pcie_hw_enable_msi(host); - rcar_pcie_hw_enable(pcie); + rcar_pcie_hw_enable(host); return 0; } static int rcar_pcie_resume_noirq(struct device *dev) { - struct rcar_pcie *pcie = dev_get_drvdata(dev); + struct rcar_pcie_host *host = dev_get_drvdata(dev); + struct rcar_pcie *pcie = &host->pcie; if (rcar_pci_read_reg(pcie, PMSR) && !(rcar_pci_read_reg(pcie, PCIETCTLR) & DL_DOWN)) diff --git a/drivers/pci/controller/pcie-rcar.c b/drivers/pci/controller/pcie-rcar.c new file mode 100644 index 000000000000..cf8840d180c3 --- /dev/null +++ b/drivers/pci/controller/pcie-rcar.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCIe driver for Renesas R-Car SoCs + * Copyright (C) 2014-2020 Renesas Electronics Europe Ltd + * + * Author: Phil Edworthy + */ + +#include +#include + +#include "pcie-rcar.h" + +void rcar_pci_write_reg(struct rcar_pcie *pcie, u32 val, unsigned int reg) +{ + writel(val, pcie->base + reg); +} + +u32 rcar_pci_read_reg(struct rcar_pcie *pcie, unsigned int reg) +{ + return readl(pcie->base + reg); 
+} + +void rcar_rmw32(struct rcar_pcie *pcie, int where, u32 mask, u32 data) +{ + unsigned int shift = BITS_PER_BYTE * (where & 3); + u32 val = rcar_pci_read_reg(pcie, where & ~3); + + val &= ~(mask << shift); + val |= data << shift; + rcar_pci_write_reg(pcie, val, where & ~3); +} + +int rcar_pcie_wait_for_phyrdy(struct rcar_pcie *pcie) +{ + unsigned int timeout = 10; + + while (timeout--) { + if (rcar_pci_read_reg(pcie, PCIEPHYSR) & PHYRDY) + return 0; + + msleep(5); + } + + return -ETIMEDOUT; +} + +int rcar_pcie_wait_for_dl(struct rcar_pcie *pcie) +{ + unsigned int timeout = 10000; + + while (timeout--) { + if ((rcar_pci_read_reg(pcie, PCIETSTR) & DATA_LINK_ACTIVE)) + return 0; + + udelay(5); + cpu_relax(); + } + + return -ETIMEDOUT; +} + +void rcar_pcie_set_outbound(struct rcar_pcie *pcie, int win, + struct resource_entry *window) +{ + /* Setup PCIe address space mappings for each resource */ + struct resource *res = window->res; + resource_size_t res_start; + resource_size_t size; + u32 mask; + + rcar_pci_write_reg(pcie, 0x00000000, PCIEPTCTLR(win)); + + /* + * The PAMR mask is calculated in units of 128Bytes, which + * keeps things pretty simple. 
+ */ + size = resource_size(res); + mask = (roundup_pow_of_two(size) / SZ_128) - 1; + rcar_pci_write_reg(pcie, mask << 7, PCIEPAMR(win)); + + if (res->flags & IORESOURCE_IO) + res_start = pci_pio_to_address(res->start) - window->offset; + else + res_start = res->start - window->offset; + + rcar_pci_write_reg(pcie, upper_32_bits(res_start), PCIEPAUR(win)); + rcar_pci_write_reg(pcie, lower_32_bits(res_start) & ~0x7F, + PCIEPALR(win)); + + /* First resource is for IO */ + mask = PAR_ENABLE; + if (res->flags & IORESOURCE_IO) + mask |= IO_SPACE; + + rcar_pci_write_reg(pcie, mask, PCIEPTCTLR(win)); +} + +void rcar_pcie_set_inbound(struct rcar_pcie *pcie, u64 cpu_addr, + u64 pci_addr, u64 flags, int idx, bool host) +{ + /* + * Set up 64-bit inbound regions as the range parser doesn't + * distinguish between 32 and 64-bit types. + */ + if (host) + rcar_pci_write_reg(pcie, lower_32_bits(pci_addr), + PCIEPRAR(idx)); + rcar_pci_write_reg(pcie, lower_32_bits(cpu_addr), PCIELAR(idx)); + rcar_pci_write_reg(pcie, flags, PCIELAMR(idx)); + + if (host) + rcar_pci_write_reg(pcie, upper_32_bits(pci_addr), + PCIEPRAR(idx + 1)); + rcar_pci_write_reg(pcie, upper_32_bits(cpu_addr), PCIELAR(idx + 1)); + rcar_pci_write_reg(pcie, 0, PCIELAMR(idx + 1)); +} diff --git a/drivers/pci/controller/pcie-rcar.h b/drivers/pci/controller/pcie-rcar.h new file mode 100644 index 000000000000..97640e16af58 --- /dev/null +++ b/drivers/pci/controller/pcie-rcar.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PCIe driver for Renesas R-Car SoCs + * Copyright (C) 2014-2020 Renesas Electronics Europe Ltd + * + * Author: Phil Edworthy + */ + +#ifndef _PCIE_RCAR_H +#define _PCIE_RCAR_H + +#define PCIECAR 0x000010 +#define PCIECCTLR 0x000018 +#define CONFIG_SEND_ENABLE BIT(31) +#define TYPE0 (0 << 8) +#define TYPE1 BIT(8) +#define PCIECDR 0x000020 +#define PCIEMSR 0x000028 +#define PCIEINTXR 0x000400 +#define PCIEPHYSR 0x0007f0 +#define PHYRDY BIT(0) +#define PCIEMSITXR 0x000840 + +/* Transfer 
control */ +#define PCIETCTLR 0x02000 +#define DL_DOWN BIT(3) +#define CFINIT BIT(0) +#define PCIETSTR 0x02004 +#define DATA_LINK_ACTIVE BIT(0) +#define PCIEERRFR 0x02020 +#define UNSUPPORTED_REQUEST BIT(4) +#define PCIEMSIFR 0x02044 +#define PCIEMSIALR 0x02048 +#define MSIFE BIT(0) +#define PCIEMSIAUR 0x0204c +#define PCIEMSIIER 0x02050 + +/* root port address */ +#define PCIEPRAR(x) (0x02080 + ((x) * 0x4)) + +/* local address reg & mask */ +#define PCIELAR(x) (0x02200 + ((x) * 0x20)) +#define PCIELAMR(x) (0x02208 + ((x) * 0x20)) +#define LAM_PREFETCH BIT(3) +#define LAM_64BIT BIT(2) +#define LAR_ENABLE BIT(1) + +/* PCIe address reg & mask */ +#define PCIEPALR(x) (0x03400 + ((x) * 0x20)) +#define PCIEPAUR(x) (0x03404 + ((x) * 0x20)) +#define PCIEPAMR(x) (0x03408 + ((x) * 0x20)) +#define PCIEPTCTLR(x) (0x0340c + ((x) * 0x20)) +#define PAR_ENABLE BIT(31) +#define IO_SPACE BIT(8) + +/* Configuration */ +#define PCICONF(x) (0x010000 + ((x) * 0x4)) +#define PMCAP(x) (0x010040 + ((x) * 0x4)) +#define EXPCAP(x) (0x010070 + ((x) * 0x4)) +#define VCCAP(x) (0x010100 + ((x) * 0x4)) + +/* link layer */ +#define IDSETR1 0x011004 +#define TLCTLR 0x011048 +#define MACSR 0x011054 +#define SPCHGFIN BIT(4) +#define SPCHGFAIL BIT(6) +#define SPCHGSUC BIT(7) +#define LINK_SPEED (0xf << 16) +#define LINK_SPEED_2_5GTS (1 << 16) +#define LINK_SPEED_5_0GTS (2 << 16) +#define MACCTLR 0x011058 +#define MACCTLR_NFTS_MASK GENMASK(23, 16) /* The name is from SH7786 */ +#define SPEED_CHANGE BIT(24) +#define SCRAMBLE_DISABLE BIT(27) +#define LTSMDIS BIT(31) +#define MACCTLR_INIT_VAL (LTSMDIS | MACCTLR_NFTS_MASK) +#define PMSR 0x01105c +#define MACS2R 0x011078 +#define MACCGSPSETR 0x011084 +#define SPCNGRSN BIT(31) + +/* R-Car H1 PHY */ +#define H1_PCIEPHYADRR 0x04000c +#define WRITE_CMD BIT(16) +#define PHY_ACK BIT(24) +#define RATE_POS 12 +#define LANE_POS 8 +#define ADR_POS 0 +#define H1_PCIEPHYDOUTR 0x040014 + +/* R-Car Gen2 PHY */ +#define GEN2_PCIEPHYADDR 0x780 +#define GEN2_PCIEPHYDATA 
0x784 +#define GEN2_PCIEPHYCTRL 0x78c + +#define INT_PCI_MSI_NR 32 + +#define RCONF(x) (PCICONF(0) + (x)) +#define RPMCAP(x) (PMCAP(0) + (x)) +#define REXPCAP(x) (EXPCAP(0) + (x)) +#define RVCCAP(x) (VCCAP(0) + (x)) + +#define PCIE_CONF_BUS(b) (((b) & 0xff) << 24) +#define PCIE_CONF_DEV(d) (((d) & 0x1f) << 19) +#define PCIE_CONF_FUNC(f) (((f) & 0x7) << 16) + +#define RCAR_PCI_MAX_RESOURCES 4 +#define MAX_NR_INBOUND_MAPS 6 + +struct rcar_pcie { + struct device *dev; + void __iomem *base; +}; + +enum { + RCAR_PCI_ACCESS_READ, + RCAR_PCI_ACCESS_WRITE, +}; + +void rcar_pci_write_reg(struct rcar_pcie *pcie, u32 val, unsigned int reg); +u32 rcar_pci_read_reg(struct rcar_pcie *pcie, unsigned int reg); +void rcar_rmw32(struct rcar_pcie *pcie, int where, u32 mask, u32 data); +int rcar_pcie_wait_for_phyrdy(struct rcar_pcie *pcie); +int rcar_pcie_wait_for_dl(struct rcar_pcie *pcie); +void rcar_pcie_set_outbound(struct rcar_pcie *pcie, int win, + struct resource_entry *window); +void rcar_pcie_set_inbound(struct rcar_pcie *pcie, u64 cpu_addr, + u64 pci_addr, u64 flags, int idx, bool host); + +#endif From 328263687148bebf0d5daf5d06bcc2a46f3d7b0a Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 7 May 2020 13:33:14 +0100 Subject: [PATCH 050/427] PCI: rcar: Fix calculating mask for PCIEPAMR register The mask value was calculated incorrectly for PCIEPAMR register if the size was less than 128 bytes. Fix this issue by adding a check on size. 
Link: https://lore.kernel.org/r/1588854799-13710-4-git-send-email-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Lad Prabhakar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Yoshihiro Shimoda --- drivers/pci/controller/pcie-rcar.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-rcar.c b/drivers/pci/controller/pcie-rcar.c index cf8840d180c3..7583699ef7b6 100644 --- a/drivers/pci/controller/pcie-rcar.c +++ b/drivers/pci/controller/pcie-rcar.c @@ -76,7 +76,10 @@ void rcar_pcie_set_outbound(struct rcar_pcie *pcie, int win, * keeps things pretty simple. */ size = resource_size(res); - mask = (roundup_pow_of_two(size) / SZ_128) - 1; + if (size > 128) + mask = (roundup_pow_of_two(size) / SZ_128) - 1; + else + mask = 0x0; rcar_pci_write_reg(pcie, mask << 7, PCIEPAMR(win)); if (res->flags & IORESOURCE_IO) From 975cf23e3aa89588cbfc9ad6f2b23bd32af4edc7 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 7 May 2020 13:33:15 +0100 Subject: [PATCH 051/427] PCI: endpoint: Pass page size as argument to pci_epc_mem_init() pci_epc_mem_init() internally used page size equal to *PAGE_SIZE* to manage the address space so instead just pass the page size as an argument to pci_epc_mem_init(). Also make pci_epc_mem_init() a C function instead of a macro function in preparation for adding support for pci-epc-mem core to handle multiple windows.
Link: https://lore.kernel.org/r/1588854799-13710-5-git-send-email-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Lad Prabhakar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Yoshihiro Shimoda Acked-by: Kishon Vijay Abraham I --- drivers/pci/controller/cadence/pcie-cadence-ep.c | 2 +- drivers/pci/controller/pcie-rockchip-ep.c | 2 +- drivers/pci/endpoint/pci-epc-mem.c | 7 +++++++ include/linux/pci-epc.h | 5 ++--- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/pci/controller/cadence/pcie-cadence-ep.c b/drivers/pci/controller/cadence/pcie-cadence-ep.c index 1c173dad67d1..1c15c8352125 100644 --- a/drivers/pci/controller/cadence/pcie-cadence-ep.c +++ b/drivers/pci/controller/cadence/pcie-cadence-ep.c @@ -450,7 +450,7 @@ int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep) epc->max_functions = 1; ret = pci_epc_mem_init(epc, pcie->mem_res->start, - resource_size(pcie->mem_res)); + resource_size(pcie->mem_res), PAGE_SIZE); if (ret < 0) { dev_err(dev, "failed to initialize the memory space\n"); goto err_init; diff --git a/drivers/pci/controller/pcie-rockchip-ep.c b/drivers/pci/controller/pcie-rockchip-ep.c index d743b0a48988..5eaf36629a75 100644 --- a/drivers/pci/controller/pcie-rockchip-ep.c +++ b/drivers/pci/controller/pcie-rockchip-ep.c @@ -615,7 +615,7 @@ static int rockchip_pcie_ep_probe(struct platform_device *pdev) rockchip_pcie_write(rockchip, BIT(0), PCIE_CORE_PHY_FUNC_CFG); err = pci_epc_mem_init(epc, rockchip->mem_res->start, - resource_size(rockchip->mem_res)); + resource_size(rockchip->mem_res), PAGE_SIZE); if (err < 0) { dev_err(dev, "failed to initialize the memory space\n"); goto err_uninit_port; diff --git a/drivers/pci/endpoint/pci-epc-mem.c b/drivers/pci/endpoint/pci-epc-mem.c index abfac1109a13..cdd1d3821249 100644 --- a/drivers/pci/endpoint/pci-epc-mem.c +++ b/drivers/pci/endpoint/pci-epc-mem.c @@ -93,6 +93,13 @@ return ret; } EXPORT_SYMBOL_GPL(__pci_epc_mem_init); +int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t base, + 
size_t size, size_t page_size) +{ + return __pci_epc_mem_init(epc, base, size, page_size); +} +EXPORT_SYMBOL_GPL(pci_epc_mem_init); + /** * pci_epc_mem_exit() - cleanup the pci_epc_mem structure * @epc: the EPC device that invoked pci_epc_mem_exit diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index e0ed9d01f6e5..5bc1de65849e 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -137,9 +137,6 @@ struct pci_epc_features { #define devm_pci_epc_create(dev, ops) \ __devm_pci_epc_create((dev), (ops), THIS_MODULE) -#define pci_epc_mem_init(epc, phys_addr, size) \ - __pci_epc_mem_init((epc), (phys_addr), (size), PAGE_SIZE) - static inline void epc_set_drvdata(struct pci_epc *epc, void *data) { dev_set_drvdata(&epc->dev, data); @@ -195,6 +192,8 @@ unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features struct pci_epc *pci_epc_get(const char *epc_name); void pci_epc_put(struct pci_epc *epc); +int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t base, + size_t size, size_t page_size); int __pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_addr, size_t size, size_t page_size); void pci_epc_mem_exit(struct pci_epc *epc); From 22e21e51ce755399fd42055a3f668ee4af370881 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Thu, 7 May 2020 19:20:20 +0200 Subject: [PATCH 052/427] PCI: brcmstb: Assert fundamental reset on initialization While preparing the driver for upstream this detail was missed. If not asserted during the initialization process, devices connected on the bus will not be made aware of the internal reset happening. This can potentially result in unexpected behavior.
Link: https://lore.kernel.org/r/20200507172020.18000-1-nsaenzjulienne@suse.de Fixes: c0452137034b ("PCI: brcmstb: Add Broadcom STB PCIe host controller driver") Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Lorenzo Pieralisi Acked-by: Florian Fainelli --- drivers/pci/controller/pcie-brcmstb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index 6d79d14527a6..9aa4cdc7557e 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -697,6 +697,7 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie) /* Reset the bridge */ brcm_pcie_bridge_sw_init_set(pcie, 1); + brcm_pcie_perst_set(pcie, 1); usleep_range(100, 200); From b382e4a0a18f4abfd4d53f32a03dd6bb60df758a Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Thu, 7 May 2020 16:15:40 -0400 Subject: [PATCH 053/427] PCI: brcmstb: Don't clk_put() a managed clock clk_put() was being invoked on a clock obtained by devm_clk_get_optional(). 
Link: https://lore.kernel.org/r/20200507201544.43432-2-james.quinlan@broadcom.com Signed-off-by: Jim Quinlan Signed-off-by: Lorenzo Pieralisi Acked-by: Florian Fainelli Acked-by: Nicolas Saenz Julienne --- drivers/pci/controller/pcie-brcmstb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index 9aa4cdc7557e..e847528c072f 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -900,7 +900,6 @@ static void __brcm_pcie_remove(struct brcm_pcie *pcie) brcm_msi_remove(pcie); brcm_pcie_turn_off(pcie); clk_disable_unprepare(pcie->clk); - clk_put(pcie->clk); } static int brcm_pcie_remove(struct platform_device *pdev) From 077a4fa92a615a4d0f86eae68d777b9dd5e5a95b Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Thu, 7 May 2020 16:15:41 -0400 Subject: [PATCH 054/427] PCI: brcmstb: Fix window register offset from 4 to 8 The outbound memory window registers were being referenced with an incorrect stride offset. This probably wasn't noticed previously as there was likely only one such window employed. 
Link: https://lore.kernel.org/r/20200507201544.43432-3-james.quinlan@broadcom.com Fixes: c0452137034b ("PCI: brcmstb: Add Broadcom STB PCIe host controller driver") Signed-off-by: Jim Quinlan Signed-off-by: Lorenzo Pieralisi Acked-by: Florian Fainelli Acked-by: Nicolas Saenz Julienne --- drivers/pci/controller/pcie-brcmstb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index e847528c072f..a4a70532a658 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -54,11 +54,11 @@ #define PCIE_MISC_CPU_2_PCIE_MEM_WIN0_LO 0x400c #define PCIE_MEM_WIN0_LO(win) \ - PCIE_MISC_CPU_2_PCIE_MEM_WIN0_LO + ((win) * 4) + PCIE_MISC_CPU_2_PCIE_MEM_WIN0_LO + ((win) * 8) #define PCIE_MISC_CPU_2_PCIE_MEM_WIN0_HI 0x4010 #define PCIE_MEM_WIN0_HI(win) \ - PCIE_MISC_CPU_2_PCIE_MEM_WIN0_HI + ((win) * 4) + PCIE_MISC_CPU_2_PCIE_MEM_WIN0_HI + ((win) * 8) #define PCIE_MISC_RC_BAR1_CONFIG_LO 0x402c #define PCIE_MISC_RC_BAR1_CONFIG_LO_SIZE_MASK 0x1f From 420c517b1e30faa4a102f884045496a1280eab1c Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Thu, 7 May 2020 16:15:42 -0400 Subject: [PATCH 055/427] dt-bindings: PCI: brcmstb: New prop 'aspm-no-l0s' For various reasons, one may want to disable the ASPM L0s capability. Link: https://lore.kernel.org/r/20200507201544.43432-4-james.quinlan@broadcom.com Signed-off-by: Jim Quinlan Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring --- Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml b/Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml index 77d3e81a437b..8680a0f86c5a 100644 --- a/Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml @@ -56,6 +56,8 @@ properties: description: Indicates usage of spread-spectrum clocking. 
type: boolean + aspm-no-l0s: true + required: - reg - dma-ranges From caab002d5069f8610a6ec1d2addeef21f4f96909 Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Thu, 7 May 2020 16:15:43 -0400 Subject: [PATCH 056/427] PCI: brcmstb: Disable L0s component of ASPM if requested Some informal internal experiments have shown that the BrcmSTB ASPM L0s savings may introduce an undesirable noise signal on some customers' boards. In addition, L0s was found lacking in realized power savings, especially relative to the L1 ASPM component. This is BrcmSTB's experience and may not hold for others. At any rate, if the 'aspm-no-l0s' property is present L0s will be disabled. Link: https://lore.kernel.org/r/20200507201544.43432-5-james.quinlan@broadcom.com Signed-off-by: Jim Quinlan Signed-off-by: Lorenzo Pieralisi Acked-by: Florian Fainelli Acked-by: Nicolas Saenz Julienne --- drivers/pci/controller/pcie-brcmstb.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index a4a70532a658..752f5b331579 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -41,6 +41,9 @@ #define PCIE_RC_CFG_PRIV1_ID_VAL3 0x043c #define PCIE_RC_CFG_PRIV1_ID_VAL3_CLASS_CODE_MASK 0xffffff +#define PCIE_RC_CFG_PRIV1_LINK_CAPABILITY 0x04dc +#define PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK 0xc00 + #define PCIE_RC_DL_MDIO_ADDR 0x1100 #define PCIE_RC_DL_MDIO_WR_DATA 0x1104 #define PCIE_RC_DL_MDIO_RD_DATA 0x1108 @@ -693,7 +696,7 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie) int num_out_wins = 0; u16 nlw, cls, lnksta; int i, ret; - u32 tmp; + u32 tmp, aspm_support; /* Reset the bridge */ brcm_pcie_bridge_sw_init_set(pcie, 1); @@ -804,6 +807,15 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie) num_out_wins++; } + /* Don't advertise L0s capability if 'aspm-no-l0s' */ + aspm_support = PCIE_LINK_STATE_L1; + if (!of_property_read_bool(pcie->np,
"aspm-no-l0s")) + aspm_support |= PCIE_LINK_STATE_L0S; + tmp = readl(base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); + u32p_replace_bits(&tmp, aspm_support, + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK); + writel(tmp, base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); + /* * For config space accesses on the RC, show the right class for * a PCIe-PCIe bridge (the default setting is to be EP mode). From 83cc3508ffaa6e2cd364d29418d35fab6f069b51 Mon Sep 17 00:00:00 2001 From: Wei Hu Date: Thu, 7 May 2020 13:02:11 +0800 Subject: [PATCH 057/427] PCI: hv: Fix the PCI HyperV probe failure path to release resource properly In some error cases in hv_pci_probe(), allocated resources are not freed. Fix this by adding a field to keep track of the high water mark for slots that have resources allocated to them. In case of an error, this high water mark is used to know which slots have resources that must be released. Since slots are numbered starting with zero, a value of -1 indicates no slots have been allocated resources. There may be unused slots in the range between slot 0 and the high water mark slot, but these slots are already ignored by the existing code in the allocate and release loops with the call to get_pcichild_wslot(). 
Link: https://lore.kernel.org/r/20200507050211.10923-1-weh@microsoft.com Signed-off-by: Wei Hu Signed-off-by: Lorenzo Pieralisi Reviewed-by: Michael Kelley --- drivers/pci/controller/pci-hyperv.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index e15022ff63e3..e6fac0187722 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -480,6 +480,9 @@ struct hv_pcibus_device { struct workqueue_struct *wq; + /* Highest slot of child device with resources allocated */ + int wslot_res_allocated; + /* hypercall arg, must not cross page boundary */ struct hv_retarget_device_interrupt retarget_msi_interrupt_params; @@ -2847,7 +2850,7 @@ static int hv_send_resources_allocated(struct hv_device *hdev) struct hv_pci_dev *hpdev; struct pci_packet *pkt; size_t size_res; - u32 wslot; + int wslot; int ret; size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) @@ -2900,6 +2903,8 @@ static int hv_send_resources_allocated(struct hv_device *hdev) comp_pkt.completion_status); break; } + + hbus->wslot_res_allocated = wslot; } kfree(pkt); @@ -2918,10 +2923,10 @@ static int hv_send_resources_released(struct hv_device *hdev) struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); struct pci_child_message pkt; struct hv_pci_dev *hpdev; - u32 wslot; + int wslot; int ret; - for (wslot = 0; wslot < 256; wslot++) { + for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) { hpdev = get_pcichild_wslot(hbus, wslot); if (!hpdev) continue; @@ -2936,8 +2941,12 @@ static int hv_send_resources_released(struct hv_device *hdev) VM_PKT_DATA_INBAND, 0); if (ret) return ret; + + hbus->wslot_res_allocated = wslot - 1; } + hbus->wslot_res_allocated = -1; + return 0; } @@ -3037,6 +3046,7 @@ static int hv_pci_probe(struct hv_device *hdev, if (!hbus) return -ENOMEM; hbus->state = hv_pcibus_init; + hbus->wslot_res_allocated = -1; /* * The PCI bus 
"domain" is what is called "segment" in ACPI and other @@ -3136,7 +3146,7 @@ static int hv_pci_probe(struct hv_device *hdev, ret = hv_pci_allocate_bridge_windows(hbus); if (ret) - goto free_irq_domain; + goto exit_d0; ret = hv_send_resources_allocated(hdev); if (ret) @@ -3154,6 +3164,8 @@ static int hv_pci_probe(struct hv_device *hdev, free_windows: hv_pci_free_bridge_windows(hbus); +exit_d0: + (void) hv_pci_bus_exit(hdev, true); free_irq_domain: irq_domain_remove(hbus->irq_domain); free_fwnode: From c81992e7f4aa19a055dbff5bd6c6d5ff9408f2fb Mon Sep 17 00:00:00 2001 From: Wei Hu Date: Thu, 7 May 2020 13:03:00 +0800 Subject: [PATCH 058/427] PCI: hv: Retry PCI bus D0 entry on invalid device state When kdump is triggered, some PCI devices may have not been shut down cleanly before the kdump kernel starts. This causes the initial attempt to enter D0 state in the kdump kernel to fail with invalid device state returned from Hyper-V host. When this happens, explicitly call hv_pci_bus_exit() and retry to enter the D0 state. 
Link: https://lore.kernel.org/r/20200507050300.10974-1-weh@microsoft.com Signed-off-by: Wei Hu [lorenzo.pieralisi@arm.com: commit log] Signed-off-by: Lorenzo Pieralisi Reviewed-by: Michael Kelley --- drivers/pci/controller/pci-hyperv.c | 40 +++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index e6fac0187722..92092a47d3af 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2739,6 +2739,8 @@ static void hv_free_config_window(struct hv_pcibus_device *hbus) vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH); } +static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs); + /** * hv_pci_enter_d0() - Bring the "bus" into the D0 power state * @hdev: VMBus's tracking struct for this root PCI bus @@ -2751,8 +2753,10 @@ static int hv_pci_enter_d0(struct hv_device *hdev) struct pci_bus_d0_entry *d0_entry; struct hv_pci_compl comp_pkt; struct pci_packet *pkt; + bool retry = true; int ret; +enter_d0_retry: /* * Tell the host that the bus is ready to use, and moved into the * powered-on state. This includes telling the host which region @@ -2779,6 +2783,38 @@ static int hv_pci_enter_d0(struct hv_device *hdev) if (ret) goto exit; + /* + * In certain case (Kdump) the pci device of interest was + * not cleanly shut down and resource is still held on host + * side, the host could return invalid device status. + * We need to explicitly request host to release the resource + * and try to enter D0 again. + */ + if (comp_pkt.completion_status < 0 && retry) { + retry = false; + + dev_err(&hdev->device, "Retrying D0 Entry\n"); + + /* + * Hv_pci_bus_exit() calls hv_send_resource_released() + * to free up resources of its child devices. 
+ * In the kdump kernel we need to set the + * wslot_res_allocated to 255 so it scans all child + * devices to release resources allocated in the + * normal kernel before panic happened. + */ + hbus->wslot_res_allocated = 255; + + ret = hv_pci_bus_exit(hdev, true); + + if (ret == 0) { + kfree(pkt); + goto enter_d0_retry; + } + dev_err(&hdev->device, + "Retrying D0 failed with ret %d\n", ret); + } + if (comp_pkt.completion_status < 0) { dev_err(&hdev->device, "PCI Pass-through VSP failed D0 Entry with status %x\n", @@ -3185,7 +3221,7 @@ free_bus: return ret; } -static int hv_pci_bus_exit(struct hv_device *hdev, bool hibernating) +static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) { struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); struct { @@ -3203,7 +3239,7 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool hibernating) if (hdev->channel->rescind) return 0; - if (!hibernating) { + if (!keep_devs) { /* Delete any children which might still exist. */ dr = kzalloc(sizeof(*dr), GFP_KERNEL); if (dr && hv_pci_start_relations_work(hbus, dr)) From 2aff0d5d61e75660dab30e56ab23f298291d505b Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 6 May 2020 07:21:30 +0200 Subject: [PATCH 059/427] MAINTAINERS: correct typo in new NXP LAYERSCAPE GEN4 Commit 3edeb49525bb ("dt-bindings: PCI: Add NXP Layerscape SoCs PCIe Gen4 controller") includes a new entry in MAINTAINERS, but slipped in a typo in one of the file entries. Hence, since then, ./scripts/get_maintainer.pl --self-test complains: warning: no file matches F: \ drivers/pci/controller/mobibeil/pcie-layerscape-gen4.c Correct the typo in PCI DRIVER FOR NXP LAYERSCAPE GEN4 CONTROLLER. 
Link: https://lore.kernel.org/r/20200506052130.5780-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e64e5db31497..0fd27329e6f7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12941,7 +12941,7 @@ L: linux-pci@vger.kernel.org L: linux-arm-kernel@lists.infradead.org S: Maintained F: Documentation/devicetree/bindings/pci/layerscape-pcie-gen4.txt -F: drivers/pci/controller/mobibeil/pcie-layerscape-gen4.c +F: drivers/pci/controller/mobiveil/pcie-layerscape-gen4.c PCI DRIVER FOR RENESAS R-CAR M: Marek Vasut From b92b36eadf4d7fa4a34f048c2a3bb61a735a885e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 8 May 2020 18:07:40 +0300 Subject: [PATCH 060/427] workqueue: Fix an use after free in init_rescuer() We need to preserve error code before freeing "rescuer". Fixes: f187b6974f6df ("workqueue: Use IS_ERR and PTR_ERR instead of PTR_ERR_OR_ZERO.") Signed-off-by: Dan Carpenter Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ddf0537dce14..10ed8d761e0b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4197,6 +4197,7 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; + int ret; if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; @@ -4208,8 +4209,9 @@ static int init_rescuer(struct workqueue_struct *wq) rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); if (IS_ERR(rescuer->task)) { + ret = PTR_ERR(rescuer->task); kfree(rescuer); - return PTR_ERR(rescuer->task); + return ret; } wq->rescuer = rescuer; From 7fdde0f9a571b1e9a31a839d3e10a3ee46a1145c Mon Sep 17 00:00:00 2001 From: Bryce Willey Date: Sun, 3 May 2020 17:49:26 
-0400 Subject: [PATCH 061/427] Documentation: PCI: Give unique labels to sections Make subsection labels more specific to avoid sphinx warnings. Exact warning: Documentation/PCI/endpoint/pci-endpoint.rst:208: WARNING: duplicate label pci/endpoint/pci-endpoint:other apis, other instance in Documentation/PCI/endpoint/pci-endpoint.rst Link: https://lore.kernel.org/r/20200503214926.23748-1-bryce.steven.willey@gmail.com Signed-off-by: Bryce Willey [lorenzo.pieralisi@arm.com: commit log] Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas Acked-by: Rob Herring --- Documentation/PCI/endpoint/pci-endpoint.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/PCI/endpoint/pci-endpoint.rst b/Documentation/PCI/endpoint/pci-endpoint.rst index 0e2311b5617b..7536be445db8 100644 --- a/Documentation/PCI/endpoint/pci-endpoint.rst +++ b/Documentation/PCI/endpoint/pci-endpoint.rst @@ -78,8 +78,8 @@ by the PCI controller driver. Cleanup the pci_epc_mem structure allocated during pci_epc_mem_init(). -APIs for the PCI Endpoint Function Driver -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +EPC APIs for the PCI Endpoint Function Driver +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This section lists the APIs that the PCI Endpoint core provides to be used by the PCI endpoint function driver. @@ -117,8 +117,8 @@ by the PCI endpoint function driver. The PCI endpoint function driver should use pci_epc_mem_free_addr() to free the memory space allocated using pci_epc_mem_alloc_addr(). -Other APIs -~~~~~~~~~~ +Other EPC APIs +~~~~~~~~~~~~~~ There are other APIs provided by the EPC library. These are used for binding the EPF device with EPC device. pci-ep-cfs.c can be used as reference for @@ -160,8 +160,8 @@ PCI Endpoint Function(EPF) Library The EPF library provides APIs to be used by the function driver and the EPC library to provide endpoint mode functionality. 
-APIs for the PCI Endpoint Function Driver -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +EPF APIs for the PCI Endpoint Function Driver +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This section lists the APIs that the PCI Endpoint core provides to be used by the PCI endpoint function driver. @@ -204,8 +204,8 @@ by the PCI endpoint controller library. The PCI endpoint controller library invokes pci_epf_linkup() when the EPC device has established the connection to the host. -Other APIs -~~~~~~~~~~ +Other EPF APIs +~~~~~~~~~~~~~~ There are other APIs provided by the EPF library. These are used to notify the function driver when the EPF device is bound to the EPC device. From cfc6eea9f6af84e838e28be57b03be5502c4a02e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 Apr 2020 00:33:20 +0900 Subject: [PATCH 062/427] kconfig: do not use OR-assignment for zero-cleared structure The simple assignment is enough because memset() three lines above has zero-cleared the structure. Signed-off-by: Masahiro Yamada --- scripts/kconfig/symbol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index 3dc81397d003..9363e37b8870 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -831,7 +831,7 @@ struct symbol *sym_lookup(const char *name, int flags) memset(symbol, 0, sizeof(*symbol)); symbol->name = new_name; symbol->type = S_UNKNOWN; - symbol->flags |= flags; + symbol->flags = flags; symbol->next = symbol_hash[hash]; symbol_hash[hash] = symbol; From 644a4b6cecc2ae3a8a840bb3606edd99af94e972 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 Apr 2020 00:35:42 +0900 Subject: [PATCH 063/427] kconfig: do not assign a variable in the return statement I am not a big fan of doing assignment in a return statement. Split it into two lines. 
Signed-off-by: Masahiro Yamada --- scripts/kconfig/menu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/kconfig/menu.c b/scripts/kconfig/menu.c index e436ba44c9c5..a5fbd6ccc006 100644 --- a/scripts/kconfig/menu.c +++ b/scripts/kconfig/menu.c @@ -65,7 +65,8 @@ void menu_add_entry(struct symbol *sym) struct menu *menu_add_menu(void) { last_entry_ptr = &current_entry->list; - return current_menu = current_entry; + current_menu = current_entry; + return current_menu; } void menu_end_menu(void) From b7546111a43a0fc75d9de4f7bce2104d6eb9d2b9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 24 Apr 2020 14:49:28 +0900 Subject: [PATCH 064/427] kconfig: tests: remove randconfig test for choice in choice Nesting choice statements does not make any sense. Commit df8df5e4bc37 ("usb: get rid of 'choice' for legacy gadget drivers") got rid of the only usecase. I will turn it into a syntax error. Remove the test in advance. Signed-off-by: Masahiro Yamada --- .../kconfig/tests/rand_nested_choice/Kconfig | 35 ------------------- .../tests/rand_nested_choice/__init__.py | 17 --------- .../tests/rand_nested_choice/expected_stdout0 | 2 -- .../tests/rand_nested_choice/expected_stdout1 | 4 --- .../tests/rand_nested_choice/expected_stdout2 | 5 --- 5 files changed, 63 deletions(-) delete mode 100644 scripts/kconfig/tests/rand_nested_choice/Kconfig delete mode 100644 scripts/kconfig/tests/rand_nested_choice/__init__.py delete mode 100644 scripts/kconfig/tests/rand_nested_choice/expected_stdout0 delete mode 100644 scripts/kconfig/tests/rand_nested_choice/expected_stdout1 delete mode 100644 scripts/kconfig/tests/rand_nested_choice/expected_stdout2 diff --git a/scripts/kconfig/tests/rand_nested_choice/Kconfig b/scripts/kconfig/tests/rand_nested_choice/Kconfig deleted file mode 100644 index 8350de7f732b..000000000000 --- a/scripts/kconfig/tests/rand_nested_choice/Kconfig +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -choice - prompt
"choice" - -config A - bool "A" - -config B - bool "B" - -if B -choice - prompt "sub choice" - -config C - bool "C" - -config D - bool "D" - -if D -choice - prompt "subsub choice" - -config E - bool "E" - -endchoice -endif # D - -endchoice -endif # B - -endchoice diff --git a/scripts/kconfig/tests/rand_nested_choice/__init__.py b/scripts/kconfig/tests/rand_nested_choice/__init__.py deleted file mode 100644 index 9e4b2db53581..000000000000 --- a/scripts/kconfig/tests/rand_nested_choice/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -""" -Set random values recursively in nested choices. - -Kconfig can create a choice-in-choice structure by using 'if' statement. -randconfig should correctly set random choice values. - -Related Linux commit: 3b9a19e08960e5cdad5253998637653e592a3c29 -""" - - -def test(conf): - for i in range(20): - assert conf.randconfig() == 0 - assert (conf.config_contains('expected_stdout0') or - conf.config_contains('expected_stdout1') or - conf.config_contains('expected_stdout2')) diff --git a/scripts/kconfig/tests/rand_nested_choice/expected_stdout0 b/scripts/kconfig/tests/rand_nested_choice/expected_stdout0 deleted file mode 100644 index 05450f3d4eb5..000000000000 --- a/scripts/kconfig/tests/rand_nested_choice/expected_stdout0 +++ /dev/null @@ -1,2 +0,0 @@ -CONFIG_A=y -# CONFIG_B is not set diff --git a/scripts/kconfig/tests/rand_nested_choice/expected_stdout1 b/scripts/kconfig/tests/rand_nested_choice/expected_stdout1 deleted file mode 100644 index 37ab29584157..000000000000 --- a/scripts/kconfig/tests/rand_nested_choice/expected_stdout1 +++ /dev/null @@ -1,4 +0,0 @@ -# CONFIG_A is not set -CONFIG_B=y -CONFIG_C=y -# CONFIG_D is not set diff --git a/scripts/kconfig/tests/rand_nested_choice/expected_stdout2 b/scripts/kconfig/tests/rand_nested_choice/expected_stdout2 deleted file mode 100644 index 849ff47e9848..000000000000 --- a/scripts/kconfig/tests/rand_nested_choice/expected_stdout2 +++ /dev/null @@ -1,5 +0,0 @@ 
-# CONFIG_A is not set -CONFIG_B=y -# CONFIG_C is not set -CONFIG_D=y -CONFIG_E=y From 09d5873e4d1f70202314b5fe40160f9b14b9d2d0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 24 Apr 2020 14:49:29 +0900 Subject: [PATCH 065/427] kconfig: allow only 'config', 'comment', and 'if' inside 'choice' The code block surrounded by 'if' ... 'endif' is reduced into if_stmt, which is accepted in the 'choice' context. Therefore, you can write any statements within a choice block by wrapping 'if y' ... 'endif'. For example, you can create a menu inside a choice, as follows: ---------------->8---------------- choice prompt "choice" config A bool "A" config B bool "B" if y menu "strange menu" config C bool "C" endmenu endif endchoice ---------------->8---------------- I want to change such a weird structure into a syntax error. In fact, the USB gadget Kconfig had used nested 'choice' for no good reason until commit df8df5e4bc37 ("usb: get rid of 'choice' for legacy gadget drivers") killed it. I think the 'source' inside 'choice' is on the fence. It is at least grammatically sensible as long as the included file contains only bool/tristate configs. However, it makes the code unreadable, and people tend to forget the fact that the file is included from the choice block. Commit 10e5e6c24963 ("usb: gadget: move choice ... endchoice to legacy/Kconfig") got rid of the only usecase. Going forward, you can only use 'config', 'comment', and 'if' inside 'choice'. This also recursively applies to 'if' blocks inside 'choice'.
Signed-off-by: Masahiro Yamada --- scripts/kconfig/parser.y | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/scripts/kconfig/parser.y b/scripts/kconfig/parser.y index 708b6c4b13ca..190f1117f35a 100644 --- a/scripts/kconfig/parser.y +++ b/scripts/kconfig/parser.y @@ -119,20 +119,24 @@ mainmenu_stmt: T_MAINMENU T_WORD_QUOTE T_EOL stmt_list: /* empty */ - | stmt_list common_stmt + | stmt_list assignment_stmt | stmt_list choice_stmt + | stmt_list comment_stmt + | stmt_list config_stmt + | stmt_list if_stmt | stmt_list menu_stmt + | stmt_list menuconfig_stmt + | stmt_list source_stmt | stmt_list T_WORD error T_EOL { zconf_error("unknown statement \"%s\"", $2); } | stmt_list error T_EOL { zconf_error("invalid statement"); } ; -common_stmt: - if_stmt - | comment_stmt - | config_stmt - | menuconfig_stmt - | source_stmt - | assignment_stmt +stmt_list_in_choice: + /* empty */ + | stmt_list_in_choice comment_stmt + | stmt_list_in_choice config_stmt + | stmt_list_in_choice if_stmt_in_choice + | stmt_list_in_choice error T_EOL { zconf_error("invalid statement"); } ; /* config/menuconfig entry */ @@ -254,7 +258,7 @@ choice_end: end } }; -choice_stmt: choice_entry choice_block choice_end +choice_stmt: choice_entry stmt_list_in_choice choice_end ; choice_option_list: @@ -305,11 +309,6 @@ default: | T_DEF_BOOL { $$ = S_BOOLEAN; } | T_DEF_TRISTATE { $$ = S_TRISTATE; } -choice_block: - /* empty */ - | choice_block common_stmt -; - /* if entry */ if_entry: T_IF expr T_EOL @@ -331,6 +330,9 @@ if_end: end if_stmt: if_entry stmt_list if_end ; +if_stmt_in_choice: if_entry stmt_list_in_choice if_end +; + /* menu entry */ menu: T_MENU T_WORD_QUOTE T_EOL From 7e49afc03212010d0ee27532a75cfeb0125bd868 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 18 Apr 2020 03:04:55 +0900 Subject: [PATCH 066/427] um: do not evaluate compiler's library path when cleaning Since commit a83e4ca26af8 ("kbuild: remove cc-option switch from 
-Wframe-larger-than="), 'make ARCH=um clean' emits an error message as follows: $ make ARCH=um clean gcc: error: missing argument to '-Wframe-larger-than=' We do not care about compiler flags when cleaning. Use the '=' operator for lazy expansion because we do not use LDFLAGS_pcap.o or LDFLAGS_vde.o when cleaning. While I was here, I removed the redundant -r option because it already exists in the recipe. Fixes: a83e4ca26af8 ("kbuild: remove cc-option switch from -Wframe-larger-than=") Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor [build] --- arch/um/drivers/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index a290821e355c..2a249f619467 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -18,9 +18,9 @@ ubd-objs := ubd_kern.o ubd_user.o port-objs := port_kern.o port_user.o harddog-objs := harddog_kern.o harddog_user.o -LDFLAGS_pcap.o := -r $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libpcap.a) +LDFLAGS_pcap.o = $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libpcap.a) -LDFLAGS_vde.o := -r $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a) +LDFLAGS_vde.o = $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a) targets := pcap_kern.o pcap_user.o vde_kern.o vde_user.o From 6632fa8fcabacd75329bd2ea78cba5c43754639a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 25 Apr 2020 01:15:02 +0900 Subject: [PATCH 067/427] hexagon: suppress error message for 'make clean' 'make ARCH=hexagon clean' emits an error message as follows: $ make ARCH=hexagon clean gcc: error: unrecognized command line option '-G0' You can suppress it by setting the correct CROSS_COMPILE=, but we should not require any compiler for cleaning.
Signed-off-by: Masahiro Yamada Acked-by: Brian Cain --- arch/hexagon/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/hexagon/Makefile b/arch/hexagon/Makefile index 4c5858b80f0e..c168c6980d05 100644 --- a/arch/hexagon/Makefile +++ b/arch/hexagon/Makefile @@ -30,7 +30,7 @@ TIR_NAME := r19 KBUILD_CFLAGS += -ffixed-$(TIR_NAME) -DTHREADINFO_REG=$(TIR_NAME) -D__linux__ KBUILD_AFLAGS += -DTHREADINFO_REG=$(TIR_NAME) -LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) +LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name 2>/dev/null) libs-y += $(LIBGCC) head-y := arch/hexagon/kernel/head.o From dc960bfeedb01cf832c5632ed1f3daed4416b142 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 25 Apr 2020 14:18:16 +0900 Subject: [PATCH 068/427] h8300: suppress error messages for 'make clean' 'make ARCH=h8300 clean' emits error messages as follows: $ make ARCH=h8300 clean gcc: error: missing argument to '-Wframe-larger-than=' gcc: error: unrecognized command line option '-mint32' You can suppress the second one by setting the correct CROSS_COMPILE=, but we should not require any compiler for cleaning. 
Signed-off-by: Masahiro Yamada --- arch/h8300/boot/compressed/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/h8300/boot/compressed/Makefile b/arch/h8300/boot/compressed/Makefile index 9e2701069bbe..5942793f77a0 100644 --- a/arch/h8300/boot/compressed/Makefile +++ b/arch/h8300/boot/compressed/Makefile @@ -18,7 +18,7 @@ CONFIG_MEMORY_START ?= 0x00400000 CONFIG_BOOT_LINK_OFFSET ?= 0x00280000 IMAGE_OFFSET := $(shell printf "0x%08x" $$(($(CONFIG_MEMORY_START)+$(CONFIG_BOOT_LINK_OFFSET)))) -LIBGCC := $(shell $(CROSS-COMPILE)$(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) +LIBGCC := $(shell $(CROSS-COMPILE)$(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name 2>/dev/null) LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -estartup -T $(obj)/vmlinux.lds \ --defsym output=$(CONFIG_MEMORY_START) From 081b4b54ff6c58be2ffcf09d42e5df8f031eacd0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 25 Apr 2020 15:06:40 +0900 Subject: [PATCH 069/427] unicore32: do not evaluate compiler's library path when cleaning Since commit a83e4ca26af8 ("kbuild: remove cc-option switch from -Wframe-larger-than="), 'make ARCH=unicore32 clean' emits error messages as follows: $ make ARCH=unicore32 clean gcc: error: missing argument to '-Wframe-larger-than=' gcc: error: missing argument to '-Wframe-larger-than=' We do not care compiler flags when cleaning. Use the '=' operator for lazy expansion because we do not use GNU_LIBC_A or GNU_LIBGCC_A when cleaning. 
Fixes: a83e4ca26af8 ("kbuild: remove cc-option switch from -Wframe-larger-than=") Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers --- arch/unicore32/lib/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/unicore32/lib/Makefile b/arch/unicore32/lib/Makefile index 098981a01841..5af06645b8f0 100644 --- a/arch/unicore32/lib/Makefile +++ b/arch/unicore32/lib/Makefile @@ -10,12 +10,12 @@ lib-y += strncpy_from_user.o strnlen_user.o lib-y += clear_user.o copy_page.o lib-y += copy_from_user.o copy_to_user.o -GNU_LIBC_A := $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libc.a) +GNU_LIBC_A = $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libc.a) GNU_LIBC_A_OBJS := memchr.o memcpy.o memmove.o memset.o GNU_LIBC_A_OBJS += strchr.o strrchr.o GNU_LIBC_A_OBJS += rawmemchr.o # needed by strrchr.o -GNU_LIBGCC_A := $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libgcc.a) +GNU_LIBGCC_A = $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libgcc.a) GNU_LIBGCC_A_OBJS := _ashldi3.o _ashrdi3.o _lshrdi3.o GNU_LIBGCC_A_OBJS += _divsi3.o _modsi3.o _ucmpdi2.o _umodsi3.o _udivsi3.o From e33ae3ed331a5cfa24b2abe483f7f4b27bb99c06 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 23 Apr 2020 23:23:51 +0900 Subject: [PATCH 070/427] kbuild: use $(CC_VERSION_TEXT) to evaluate CC_IS_GCC and CC_IS_CLANG The result of '$(CC) --version | head -n 1' has already been computed by the top Makefile, and stored in the environment variable, CC_VERSION_TEXT. 'echo' is cheaper than the two commands $(CC) and 'head' although this optimization is not noticeable level. 
Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor --- init/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 9278a603d399..2bc63a361033 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -9,7 +9,7 @@ config DEFCONFIG_LIST default "arch/$(SRCARCH)/configs/$(KBUILD_DEFCONFIG)" config CC_IS_GCC - def_bool $(success,$(CC) --version | head -n 1 | grep -q gcc) + def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q gcc) config GCC_VERSION int @@ -21,7 +21,7 @@ config LD_VERSION default $(shell,$(LD) --version | $(srctree)/scripts/ld-version.sh) config CC_IS_CLANG - def_bool $(success,$(CC) --version | head -n 1 | grep -q clang) + def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q clang) config CLANG_VERSION int From 8b59cd81dc5e724eaea283fa6006985891c7bff4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 23 Apr 2020 23:23:52 +0900 Subject: [PATCH 071/427] kbuild: ensure full rebuild when the compiler is updated Commit 21c54b774744 ("kconfig: show compiler version text in the top comment") added the environment variable, CC_VERSION_TEXT in the comment of the top Kconfig file. It can detect the compiler update, and invoke the syncconfig because all environment variables referenced in Kconfig files are recorded in include/config/auto.conf.cmd This commit makes it a CONFIG option in order to ensure the full rebuild when the compiler is updated. This works like follows: include/config/kconfig.h contains "CONFIG_CC_VERSION_TEXT" in the comment block. The top Makefile specifies "-include $(srctree)/include/linux/kconfig.h" to guarantee it is included from all kernel source files. fixdep parses every source file and all headers included from it, searching for words prefixed with "CONFIG_". Then, fixdep finds CONFIG_CC_VERSION_TEXT in include/config/kconfig.h and adds include/config/cc/version/text.h into every .*.cmd file. 
When the compiler is updated, syncconfig is invoked because init/Kconfig contains the reference to the environment variable CC_VERSION_TEXT. CONFIG_CC_VERSION_TEXT is updated to the new version string, and include/config/cc/version/text.h is touched. In the next rebuild, Make will rebuild every file since the timestamp of include/config/cc/version/text.h is newer than that of target. Signed-off-by: Masahiro Yamada --- Kconfig | 2 -- include/linux/kconfig.h | 2 ++ init/Kconfig | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Kconfig b/Kconfig index e10b3ee084d4..745bc773f567 100644 --- a/Kconfig +++ b/Kconfig @@ -5,8 +5,6 @@ # mainmenu "Linux/$(ARCH) $(KERNELVERSION) Kernel Configuration" -comment "Compiler: $(CC_VERSION_TEXT)" - source "scripts/Kconfig.include" source "init/Kconfig" diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index cc8fa109cfa3..9d12c970f18f 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -2,6 +2,8 @@ #ifndef __LINUX_KCONFIG_H #define __LINUX_KCONFIG_H +/* CONFIG_CC_VERSION_TEXT (Do not delete this comment. See help in Kconfig) */ + #include <generated/autoconf.h> #ifdef CONFIG_CPU_BIG_ENDIAN diff --git a/init/Kconfig b/init/Kconfig index 2bc63a361033..ed1d82c9f1df 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -8,6 +8,23 @@ config DEFCONFIG_LIST default "/boot/config-$(shell,uname -r)" default "arch/$(SRCARCH)/configs/$(KBUILD_DEFCONFIG)" +config CC_VERSION_TEXT + string + default "$(CC_VERSION_TEXT)" + help + This is used in unclear ways: + + - Re-run Kconfig when the compiler is updated + The 'default' property references the environment variable, + CC_VERSION_TEXT so it is recorded in include/config/auto.conf.cmd. + When the compiler is updated, Kconfig will be invoked. + + - Ensure full rebuild when the compier is updated + include/linux/kconfig.h contains this option in the comment line so + fixdep adds include/config/cc/version/text.h into the auto-generated + dependency.
When the compiler is updated, syncconfig will touch it + and then every file will be rebuilt. + config CC_IS_GCC def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q gcc) From 30a7729771731971839cc969d2a321e6ea7a144b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 23 Apr 2020 23:23:53 +0900 Subject: [PATCH 072/427] kbuild: use -MMD instead of -MD to exclude system headers from dependency This omits system headers from the generated header dependency. System headers are not updated unless you upgrade the compiler. Nor do they contain CONFIG options, so fixdep does not need to parse them. Having said that, the effect of this optimization will be quite small because the kernel code generally does not include system headers except <stdarg.h>. Host programs include a lot of system headers, but there are not so many in the kernel tree. At first, keeping system headers in .*.cmd files might be useful to detect the compiler update, but there is no guarantee that <stdarg.h> is included from every file. So, I implemented a more reliable way in the previous commit.
Signed-off-by: Masahiro Yamada --- scripts/Kbuild.include | 2 +- scripts/Makefile.host | 4 ++-- scripts/Makefile.lib | 8 ++++---- usr/include/Makefile | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 6cabf20ce66a..0c3dc983439b 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -16,7 +16,7 @@ pound := \# dot-target = $(dir $@).$(notdir $@) ### -# The temporary file to save gcc -MD generated dependencies must not +# The temporary file to save gcc -MMD generated dependencies must not # contain a comma depfile = $(subst $(comma),_,$(dot-target).d) diff --git a/scripts/Makefile.host b/scripts/Makefile.host index 2045855d0b75..c8a4a033dc3e 100644 --- a/scripts/Makefile.host +++ b/scripts/Makefile.host @@ -88,8 +88,8 @@ _hostcxx_flags += -I $(objtree)/$(obj) endif endif -hostc_flags = -Wp,-MD,$(depfile) $(_hostc_flags) -hostcxx_flags = -Wp,-MD,$(depfile) $(_hostcxx_flags) +hostc_flags = -Wp,-MMD,$(depfile) $(_hostc_flags) +hostcxx_flags = -Wp,-MMD,$(depfile) $(_hostcxx_flags) ##### # Compile programs on the host diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 4b799737722c..12f6a331a8f3 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -171,22 +171,22 @@ modkern_aflags = $(if $(part-of-module), \ $(KBUILD_AFLAGS_MODULE) $(AFLAGS_MODULE), \ $(KBUILD_AFLAGS_KERNEL) $(AFLAGS_KERNEL)) -c_flags = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ +c_flags = -Wp,-MMD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ -include $(srctree)/include/linux/compiler_types.h \ $(_c_flags) $(modkern_cflags) \ $(basename_flags) $(modname_flags) -a_flags = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ +a_flags = -Wp,-MMD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ $(_a_flags) $(modkern_aflags) -cpp_flags = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ +cpp_flags = -Wp,-MMD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ $(_cpp_flags) ld_flags = 
$(KBUILD_LDFLAGS) $(ldflags-y) $(LDFLAGS_$(@F)) DTC_INCLUDE := $(srctree)/scripts/dtc/include-prefixes -dtc_cpp_flags = -Wp,-MD,$(depfile).pre.tmp -nostdinc \ +dtc_cpp_flags = -Wp,-MMD,$(depfile).pre.tmp -nostdinc \ $(addprefix -I,$(DTC_INCLUDE)) \ -undef -D__DTS__ diff --git a/usr/include/Makefile b/usr/include/Makefile index b568a95d1f62..5a7ee3e5ed86 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -8,7 +8,7 @@ # We cannot go as far as adding -Wpedantic since it emits too many warnings. UAPI_CFLAGS := -std=c90 -Wall -Werror=implicit-function-declaration -override c_flags = $(UAPI_CFLAGS) -Wp,-MD,$(depfile) -I$(objtree)/usr/include +override c_flags = $(UAPI_CFLAGS) -Wp,-MMD,$(depfile) -I$(objtree)/usr/include # The following are excluded for now because they fail to build. # From 9a950154668729a472d17b8e307d92e7c60f45f7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 23 Apr 2020 23:23:54 +0900 Subject: [PATCH 073/427] kbuild: use CONFIG_CC_VERSION_TEXT to construct LINUX_COMPILER macro scripts/mkcompile_h runs $(CC) just for getting the version string. Reuse CONFIG_CC_VERSION_TEXT for optimization. For GCC, this slightly changes the version string. I do not think it is a big deal as we do not have the defined format for LINUX_COMPILER. In fact, the recent commit 4dcc9a88448a ("kbuild: mkcompile_h: Include $LD version in /proc/version") added the linker version. 
Signed-off-by: Masahiro Yamada --- init/Makefile | 2 +- scripts/mkcompile_h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/init/Makefile b/init/Makefile index d45e967483b2..57499b1ff471 100644 --- a/init/Makefile +++ b/init/Makefile @@ -35,4 +35,4 @@ include/generated/compile.h: FORCE @$($(quiet)chk_compile.h) $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" \ - "$(CONFIG_PREEMPT_RT)" "$(CC)" "$(LD)" + "$(CONFIG_PREEMPT_RT)" $(CONFIG_CC_VERSION_TEXT) "$(LD)" diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h index 5b80a4699740..baf3ab8d9d49 100755 --- a/scripts/mkcompile_h +++ b/scripts/mkcompile_h @@ -6,7 +6,7 @@ ARCH=$2 SMP=$3 PREEMPT=$4 PREEMPT_RT=$5 -CC=$6 +CC_VERSION="$6" LD=$7 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; } @@ -62,7 +62,6 @@ UTS_VERSION="$(echo $UTS_VERSION $CONFIG_FLAGS $TIMESTAMP | cut -b -$UTS_LEN)" printf '#define LINUX_COMPILE_BY "%s"\n' "$LINUX_COMPILE_BY" echo \#define LINUX_COMPILE_HOST \"$LINUX_COMPILE_HOST\" - CC_VERSION=$($CC -v 2>&1 | grep ' version ' | sed 's/[[:space:]]*$//') LD_VERSION=$($LD -v | head -n1 | sed 's/(compatible with [^)]*)//' \ | sed 's/[[:space:]]*$//') printf '#define LINUX_COMPILER "%s"\n' "$CC_VERSION, $LD_VERSION" From 85e4a889d3e08a39a8ebde594bab06e6fa732903 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 23 Apr 2020 16:39:15 +0900 Subject: [PATCH 074/427] Revert "objtool: Skip samples subdirectory" This reverts commit 8728497895794d1f207a836e02dae762ad175d56. samples/ contains only sub-directories. Because OBJECT_FILES_NON_STANDARD does not work recursively, this line has no effect. 
Signed-off-by: Masahiro Yamada Acked-by: Josh Poimboeuf Acked-by: Sam Ravnborg --- samples/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/samples/Makefile b/samples/Makefile index f8f847b4f61f..5ce50ef0f2b2 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for Linux samples code -OBJECT_FILES_NON_STANDARD := y obj-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs/ obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/ From ea21e9041404c4cafd63de4762287fb1b7ba357f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 25 Apr 2020 22:18:10 +0900 Subject: [PATCH 075/427] kbuild: remove '/' target This notice has been here for a while. Remove it entirely now. Signed-off-by: Masahiro Yamada --- Makefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Makefile b/Makefile index 11fe9b1535de..9671fa09c83a 100644 --- a/Makefile +++ b/Makefile @@ -1657,10 +1657,6 @@ _emodinst_post: _emodinst_ clean-dirs := $(KBUILD_EXTMOD) clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers $(KBUILD_EXTMOD)/modules.nsdeps -PHONY += / -/: - @echo >&2 '"$(MAKE) /" is no longer supported. Please use "$(MAKE) ./" instead.' - PHONY += help help: @echo ' Building external modules.' From 1ca0c2f612116a7159ab11c36b555910f90db338 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 27 Apr 2020 09:30:19 +0900 Subject: [PATCH 076/427] kbuild: remove unused AS assignment $(AS) is not used anywhere in the kernel build, hence commit aa824e0c962b ("kbuild: remove AS variable") killed it. Remove the left-over code in arch/{arm,arm64}/Makefile. 
Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor Acked-by: Will Deacon --- arch/arm/Makefile | 2 -- arch/arm64/Makefile | 2 -- 2 files changed, 4 deletions(-) diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 7d5cd0f85461..cd28211f1418 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -45,12 +45,10 @@ endif ifeq ($(CONFIG_CPU_BIG_ENDIAN),y) KBUILD_CPPFLAGS += -mbig-endian CHECKFLAGS += -D__ARMEB__ -AS += -EB KBUILD_LDFLAGS += -EB else KBUILD_CPPFLAGS += -mlittle-endian CHECKFLAGS += -D__ARMEL__ -AS += -EL KBUILD_LDFLAGS += -EL endif diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 85e4149cc5d5..d86cc9137539 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -84,7 +84,6 @@ KBUILD_CFLAGS += $(branch-prot-flags-y) ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) KBUILD_CPPFLAGS += -mbig-endian CHECKFLAGS += -D__AARCH64EB__ -AS += -EB # Prefer the baremetal ELF build target, but not all toolchains include # it so fall back to the standard linux version if needed. KBUILD_LDFLAGS += -EB $(call ld-option, -maarch64elfb, -maarch64linuxb) @@ -92,7 +91,6 @@ UTS_MACHINE := aarch64_be else KBUILD_CPPFLAGS += -mlittle-endian CHECKFLAGS += -D__AARCH64EL__ -AS += -EL # Same as above, prefer ELF but fall back to linux target if needed. KBUILD_LDFLAGS += -EL $(call ld-option, -maarch64elf, -maarch64linux) UTS_MACHINE := aarch64 From 78046fabe6e7807a271aad09cde0522d80bd2985 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 27 Apr 2020 22:49:30 +0900 Subject: [PATCH 077/427] kbuild: determine the output format of DTC by the target suffix cmd_dtc takes the additional parameter $(2) to select the target format, dtb or yaml. This makes things complicated when it is used with cmd_and_fixdep and if_changed_rule. I actually stumbled on this. See commit 3d4b2238684a ("kbuild: fix DT binding schema rule again to avoid needless rebuilds"). Extract the suffix part of the target instead of passing the parameter. 
Fortunately, this works for both $(obj)/%.dtb and $(obj)/%.dt.yaml . Signed-off-by: Masahiro Yamada --- scripts/Makefile.lib | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 12f6a331a8f3..cd52a8c6428f 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -287,13 +287,13 @@ $(obj)/%.dtb.S: $(obj)/%.dtb FORCE quiet_cmd_dtc = DTC $@ cmd_dtc = mkdir -p $(dir ${dtc-tmp}) ; \ $(HOSTCC) -E $(dtc_cpp_flags) -x assembler-with-cpp -o $(dtc-tmp) $< ; \ - $(DTC) -O $(2) -o $@ -b 0 \ + $(DTC) -O $(patsubst .%,%,$(suffix $@)) -o $@ -b 0 \ $(addprefix -i,$(dir $<) $(DTC_INCLUDE)) $(DTC_FLAGS) \ -d $(depfile).dtc.tmp $(dtc-tmp) ; \ cat $(depfile).pre.tmp $(depfile).dtc.tmp > $(depfile) $(obj)/%.dtb: $(src)/%.dts $(DTC) FORCE - $(call if_changed_dep,dtc,dtb) + $(call if_changed_dep,dtc) DT_CHECKER ?= dt-validate DT_BINDING_DIR := Documentation/devicetree/bindings @@ -304,7 +304,7 @@ quiet_cmd_dtb_check = CHECK $@ cmd_dtb_check = $(DT_CHECKER) -u $(srctree)/$(DT_BINDING_DIR) -p $(DT_TMP_SCHEMA) $@ define rule_dtc - $(call cmd_and_fixdep,dtc,yaml) + $(call cmd_and_fixdep,dtc) $(call cmd,dtb_check) endef From a85a6c86c25be2d2a5f9c31491f612ce0edc7869 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 16 Mar 2020 16:43:38 -0500 Subject: [PATCH 078/427] driver core: platform: Clarify that IRQ 0 is invalid These interfaces return a negative error number or an IRQ: platform_get_irq() platform_get_irq_optional() platform_get_irq_byname() platform_get_irq_byname_optional() The function comments suggest checking for error like this: irq = platform_get_irq(...); if (irq < 0) return irq; which is what most callers (~900 of 1400) do, so it's implicit that IRQ 0 is invalid. But some callers check for "irq <= 0", and it's not obvious from the source that we never return an IRQ 0. 
Make this more explicit by updating the comments to say that an IRQ number is always non-zero and adding a WARN() if we ever do return zero. If we do return IRQ 0, it likely indicates a bug in the arch-specific parts of platform_get_irq(). Relevant prior discussion at [1, 2]. [1] https://lore.kernel.org/r/Pine.LNX.4.64.0701250940220.25027@woody.linux-foundation.org/ [2] https://lore.kernel.org/r/Pine.LNX.4.64.0701252029570.25027@woody.linux-foundation.org/ Signed-off-by: Bjorn Helgaas Acked-by: Greg Kroah-Hartman Acked-by: Linus Walleij --- drivers/base/platform.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 5255550b7c34..084cf1d23d3f 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -152,23 +152,24 @@ EXPORT_SYMBOL_GPL(devm_platform_ioremap_resource_byname); * if (irq < 0) * return irq; * - * Return: IRQ number on success, negative error number on failure. + * Return: non-zero IRQ number on success, negative error number on failure. 
*/ int platform_get_irq_optional(struct platform_device *dev, unsigned int num) { + int ret; #ifdef CONFIG_SPARC /* sparc does not have irqs represented as IORESOURCE_IRQ resources */ if (!dev || num >= dev->archdata.num_irqs) return -ENXIO; - return dev->archdata.irqs[num]; + ret = dev->archdata.irqs[num]; + goto out; #else struct resource *r; - int ret; if (IS_ENABLED(CONFIG_OF_IRQ) && dev->dev.of_node) { ret = of_irq_get(dev->dev.of_node, num); if (ret > 0 || ret == -EPROBE_DEFER) - return ret; + goto out; } r = platform_get_resource(dev, IORESOURCE_IRQ, num); @@ -176,7 +177,7 @@ int platform_get_irq_optional(struct platform_device *dev, unsigned int num) if (r && r->flags & IORESOURCE_DISABLED) { ret = acpi_irq_get(ACPI_HANDLE(&dev->dev), num, r); if (ret) - return ret; + goto out; } } @@ -190,13 +191,17 @@ int platform_get_irq_optional(struct platform_device *dev, unsigned int num) struct irq_data *irqd; irqd = irq_get_irq_data(r->start); - if (!irqd) - return -ENXIO; + if (!irqd) { + ret = -ENXIO; + goto out; + } irqd_set_trigger_type(irqd, r->flags & IORESOURCE_BITS); } - if (r) - return r->start; + if (r) { + ret = r->start; + goto out; + } /* * For the index 0 interrupt, allow falling back to GpioInt @@ -209,11 +214,14 @@ int platform_get_irq_optional(struct platform_device *dev, unsigned int num) ret = acpi_dev_gpio_irq_get(ACPI_COMPANION(&dev->dev), num); /* Our callers expect -ENXIO for missing IRQs. */ if (ret >= 0 || ret == -EPROBE_DEFER) - return ret; + goto out; } - return -ENXIO; + ret = -ENXIO; #endif +out: + WARN(ret == 0, "0 is an invalid IRQ number\n"); + return ret; } EXPORT_SYMBOL_GPL(platform_get_irq_optional); @@ -231,7 +239,7 @@ EXPORT_SYMBOL_GPL(platform_get_irq_optional); * if (irq < 0) * return irq; * - * Return: IRQ number on success, negative error number on failure. + * Return: non-zero IRQ number on success, negative error number on failure. 
*/ int platform_get_irq(struct platform_device *dev, unsigned int num) { @@ -303,8 +311,10 @@ static int __platform_get_irq_byname(struct platform_device *dev, } r = platform_get_resource_byname(dev, IORESOURCE_IRQ, name); - if (r) + if (r) { + WARN(r->start == 0, "0 is an invalid IRQ number\n"); return r->start; + } return -ENXIO; } @@ -316,7 +326,7 @@ static int __platform_get_irq_byname(struct platform_device *dev, * * Get an IRQ like platform_get_irq(), but then by name rather then by index. * - * Return: IRQ number on success, negative error number on failure. + * Return: non-zero IRQ number on success, negative error number on failure. */ int platform_get_irq_byname(struct platform_device *dev, const char *name) { @@ -338,7 +348,7 @@ EXPORT_SYMBOL_GPL(platform_get_irq_byname); * Get an optional IRQ by name like platform_get_irq_byname(). Except that it * does not print an error message if an IRQ can not be obtained. * - * Return: IRQ number on success, negative error number on failure. + * Return: non-zero IRQ number on success, negative error number on failure. */ int platform_get_irq_byname_optional(struct platform_device *dev, const char *name) From 0584bff09629666eea97c7ac428e55b00df211f5 Mon Sep 17 00:00:00 2001 From: Aman Sharma Date: Thu, 12 Mar 2020 00:49:02 +0530 Subject: [PATCH 079/427] PCI: Check for platform_get_irq() failure consistently The platform_get_irq*() interfaces return either a negative error number or a valid IRQ. 0 is not a valid return value, so check for "< 0" to detect failure as recommended by the function documentation. On failure, return the error number from platform_get_irq*() instead of making up a new one. 
Link: https://lore.kernel.org/r/cover.1583952275.git.amanharitsh123@gmail.com [bhelgaas: commit log, squash into one patch] Signed-off-by: Aman Sharma Signed-off-by: Bjorn Helgaas Acked-by: Linus Walleij Cc: Richard Zhu Cc: Lucas Stach Cc: Thierry Reding Cc: Karthikeyan Mitran Cc: Hou Zhiqiang Cc: Thomas Petazzoni Cc: Ryder Lee Cc: Marc Gonzalez --- drivers/pci/controller/dwc/pci-imx6.c | 4 ++-- drivers/pci/controller/dwc/pcie-tegra194.c | 4 ++-- drivers/pci/controller/mobiveil/pcie-mobiveil-host.c | 4 ++-- drivers/pci/controller/pci-aardvark.c | 3 +++ drivers/pci/controller/pci-v3-semi.c | 4 ++-- drivers/pci/controller/pcie-mediatek.c | 3 +++ drivers/pci/controller/pcie-tango.c | 4 ++-- 7 files changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index acfbd34032a8..8f08ae53f53e 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -868,9 +868,9 @@ static int imx6_add_pcie_port(struct imx6_pcie *imx6_pcie, if (IS_ENABLED(CONFIG_PCI_MSI)) { pp->msi_irq = platform_get_irq_byname(pdev, "msi"); - if (pp->msi_irq <= 0) { + if (pp->msi_irq < 0) { dev_err(dev, "failed to get MSI irq\n"); - return -ENODEV; + return pp->msi_irq; } } diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index ae30a2fd3716..f1f945cc7bcb 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -2190,9 +2190,9 @@ static int tegra_pcie_dw_probe(struct platform_device *pdev) } pp->irq = platform_get_irq_byname(pdev, "intr"); - if (!pp->irq) { + if (pp->irq < 0) { dev_err(dev, "Failed to get \"intr\" interrupt\n"); - return -ENODEV; + return pp->irq; } pcie->bpmp = tegra_bpmp_get(dev); diff --git a/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c b/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c index a94be264240f..5907baa9b1f2 100644 --- 
a/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c +++ b/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c @@ -522,9 +522,9 @@ static int mobiveil_pcie_integrated_interrupt_init(struct mobiveil_pcie *pcie) mobiveil_pcie_enable_msi(pcie); rp->irq = platform_get_irq(pdev, 0); - if (rp->irq <= 0) { + if (rp->irq < 0) { dev_err(dev, "failed to map IRQ: %d\n", rp->irq); - return -ENODEV; + return rp->irq; } /* initialize the IRQ domains */ diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 2a20b649f40c..40a4257f0df1 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -973,6 +973,9 @@ static int advk_pcie_probe(struct platform_device *pdev) return PTR_ERR(pcie->base); irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + ret = devm_request_irq(dev, irq, advk_pcie_irq_handler, IRQF_SHARED | IRQF_NO_THREAD, "advk-pcie", pcie); diff --git a/drivers/pci/controller/pci-v3-semi.c b/drivers/pci/controller/pci-v3-semi.c index bd05221f5a22..a5bf945d2eda 100644 --- a/drivers/pci/controller/pci-v3-semi.c +++ b/drivers/pci/controller/pci-v3-semi.c @@ -777,9 +777,9 @@ static int v3_pci_probe(struct platform_device *pdev) /* Get and request error IRQ resource */ irq = platform_get_irq(pdev, 0); - if (irq <= 0) { + if (irq < 0) { dev_err(dev, "unable to obtain PCIv3 error IRQ\n"); - return -ENODEV; + return irq; } ret = devm_request_irq(dev, irq, v3_irq, 0, "PCIv3 error", v3); diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c index cb982891b22b..ebfa7d5a4e2d 100644 --- a/drivers/pci/controller/pcie-mediatek.c +++ b/drivers/pci/controller/pcie-mediatek.c @@ -651,6 +651,9 @@ static int mtk_pcie_setup_irq(struct mtk_pcie_port *port, } port->irq = platform_get_irq(pdev, port->slot); + if (port->irq < 0) + return port->irq; + irq_set_chained_handler_and_data(port->irq, mtk_pcie_intr_handler, port); diff --git 
a/drivers/pci/controller/pcie-tango.c b/drivers/pci/controller/pcie-tango.c index 21a208da3f59..18c2c4313eb5 100644 --- a/drivers/pci/controller/pcie-tango.c +++ b/drivers/pci/controller/pcie-tango.c @@ -273,9 +273,9 @@ static int tango_pcie_probe(struct platform_device *pdev) writel_relaxed(0, pcie->base + SMP8759_ENABLE + offset); virq = platform_get_irq(pdev, 1); - if (virq <= 0) { + if (virq < 0) { dev_err(dev, "Failed to map IRQ\n"); - return -ENXIO; + return virq; } irq_dom = irq_domain_create_linear(fwnode, MSI_MAX, &dom_ops, pcie); From 914a1951d88968371c7d43400c9d936382cd7d69 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:05:44 -0500 Subject: [PATCH 080/427] PCI: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these is a flexible array member [1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that dynamic memory allocations won't be affected by this change: Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero. [1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type [1]. There are some instances of code in which the sizeof() operator is being incorrectly/erroneously applied to zero-length arrays, and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues.
This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Link: https://lore.kernel.org/r/20200507190544.GA15633@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 2 +- include/linux/pci.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 595fcf59843f..bb78f580814e 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1578,7 +1578,7 @@ EXPORT_SYMBOL(pci_restore_state); struct pci_saved_state { u32 config_space[16]; - struct pci_cap_saved_data cap[0]; + struct pci_cap_saved_data cap[]; }; /** diff --git a/include/linux/pci.h b/include/linux/pci.h index 83ce1cdf5676..0453ee458ab1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -279,7 +279,7 @@ struct pci_cap_saved_data { u16 cap_nr; bool cap_extended; unsigned int size; - u32 data[0]; + u32 data[]; }; struct pci_cap_saved_state { @@ -532,7 +532,7 @@ struct pci_host_bridge { resource_size_t start, resource_size_t size, resource_size_t align); - unsigned long private[0] ____cacheline_aligned; + unsigned long private[] ____cacheline_aligned; }; #define to_pci_host_bridge(n) container_of(n, struct pci_host_bridge, dev) From a4e439a6f628a52f7074c9d73ec7eb4f6c1a4dfc Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Mon, 9 Mar 2020 10:56:08 -0500 Subject: [PATCH 081/427] dlm: dlm_internal: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 416d9de35679..d231ae5d2c65 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -421,7 +421,7 @@ struct dlm_message { int m_bastmode; int m_asts; int m_result; /* 0 or -EXXX */ - char m_extra[0]; /* name or lvb */ + char m_extra[]; /* name or lvb */ }; @@ -450,7 +450,7 @@ struct dlm_rcom { uint64_t rc_id; /* match reply with request */ uint64_t rc_seq; /* sender's ls_recover_seq */ uint64_t rc_seq_reply; /* remote ls_recover_seq */ - char rc_buf[0]; + char rc_buf[]; }; union dlm_packet { @@ -506,7 +506,7 @@ struct rcom_lock { __le16 rl_wait_type; __le16 rl_namelen; char rl_name[DLM_RESNAME_MAXLEN]; - char rl_lvb[0]; + char rl_lvb[]; }; /* From 3c80d3794dac5b0f50132846113a120d881462ec Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 9 Mar 2020 10:57:22 -0500 Subject: [PATCH 082/427] dlm: user: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. 
[1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David Teigland --- fs/dlm/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 5264bac75115..e5cefa90b1ce 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -46,7 +46,7 @@ struct dlm_lock_params32 { __u32 bastaddr; __u32 lksb; char lvb[DLM_USER_LVB_LEN]; - char name[0]; + char name[]; }; struct dlm_write_request32 { From 90db4f8be38629bd09183b78079d582221523e25 Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Wed, 22 Apr 2020 14:59:27 +0800 Subject: [PATCH 083/427] fs:dlm:remove unneeded semicolon in rcom.c Fix the following coccicheck warning: fs/dlm/rcom.c:566:2-3: Unneeded semicolon Signed-off-by: Wu Bo Signed-off-by: David Teigland --- fs/dlm/rcom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index e3d9f72c640d..4daf5dc2b51c 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -563,7 +563,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) lock = 1; reply = 1; break; - }; + } spin_lock(&ls->ls_recover_lock); status = ls->ls_recover_status; From f084a4f4a14b97d2ad6e4bd6406933b2d39e6eca Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Wed, 29 Apr 2020 13:15:41 +0100 Subject: [PATCH 084/427] dlm: Switch to using wait_event() We saw an issue in a production server on a customer deployment where DLM 4.0.7 gets "stuck" and is unable to join new lockspaces. There is no useful response for the dlm in do_uevent() if wait_event_interruptible() is interrupted, so switch to wait_event().
Signed-off-by: Ross Lagerwall Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index afb8340918b8..e93670ecfae5 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -197,8 +197,6 @@ static struct kset *dlm_kset; static int do_uevent(struct dlm_ls *ls, int in) { - int error; - if (in) kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE); else @@ -209,20 +207,12 @@ static int do_uevent(struct dlm_ls *ls, int in) /* dlm_controld will see the uevent, do the necessary group management and then write to sysfs to wake us */ - error = wait_event_interruptible(ls->ls_uevent_wait, - test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); + wait_event(ls->ls_uevent_wait, + test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); - log_rinfo(ls, "group event done %d %d", error, ls->ls_uevent_result); + log_rinfo(ls, "group event done %d", ls->ls_uevent_result); - if (error) - goto out; - - error = ls->ls_uevent_result; - out: - if (error) - log_error(ls, "group %s failed %d %d", in ? "join" : "leave", - error, ls->ls_uevent_result); - return error; + return ls->ls_uevent_result; } static int dlm_uevent(struct kset *kset, struct kobject *kobj, From fe204591cc9480347af7d2d6029b24a62e449486 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 May 2020 23:34:28 +0200 Subject: [PATCH 085/427] dlm: remove BUG() before panic() Building a kernel with clang sometimes fails with an objtool error in dlm: fs/dlm/lock.o: warning: objtool: revert_lock_pc()+0xbd: can't find jump dest instruction at .text+0xd7fc The problem is that BUG() never returns and the compiler knows that anything after it is unreachable, however the panic still emits some code that does not get fully eliminated. Having both BUG() and panic() is really pointless as the BUG() kills the current process and the subsequent panic() never hits. 
In most cases, we probably don't really want either and should replace the DLM_ASSERT() statements with WARN_ON(), as has been done for some of them. Remove the BUG() here so the user at least sees the panic message and we can reliably build randconfig kernels. Fixes: e7fd41792fc0 ("[DLM] The core of the DLM for GFS2/CLVM") Cc: Josh Poimboeuf Cc: clang-built-linux@googlegroups.com Signed-off-by: Arnd Bergmann Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index d231ae5d2c65..04fe9f525ac7 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -97,7 +97,6 @@ do { \ __LINE__, __FILE__, #x, jiffies); \ {do} \ printk("\n"); \ - BUG(); \ panic("DLM: Record message above and reboot.\n"); \ } \ } From c027b02d89fd42ecee911c39e9098b9609a5ca0b Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 12 May 2020 23:36:07 +0800 Subject: [PATCH 086/427] streamline_config.pl: add LMC_KEEP to preserve some kconfigs Sometimes it is useful to preserve batches of configs when making localmodconfig. For example, I usually don't want any usb and fs modules to be disabled. Now we can do it by: $ make LMC_KEEP="drivers/usb:fs" localmodconfig Signed-off-by: Changbin Du Acked-by: Steven Rostedt (VMware) Signed-off-by: Masahiro Yamada --- Documentation/admin-guide/README.rst | 11 +++++++++-- scripts/kconfig/Makefile | 2 ++ scripts/kconfig/streamline_config.pl | 21 +++++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index cc6151fc0845..5fb526900023 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -209,15 +209,22 @@ Configuring the kernel store the lsmod of that machine into a file and pass it in as a LSMOD parameter. 
+ Also, you can preserve modules in certain folders + or kconfig files by specifying their paths in + parameter LMC_KEEP. + target$ lsmod > /tmp/mylsmod target$ scp /tmp/mylsmod host:/tmp - host$ make LSMOD=/tmp/mylsmod localmodconfig + host$ make LSMOD=/tmp/mylsmod \ + LMC_KEEP="drivers/usb:drivers/gpu:fs" \ + localmodconfig The above also works when cross compiling. "make localyesconfig" Similar to localmodconfig, except it will convert - all module options to built in (=y) options. + all module options to built in (=y) options. You can + also preserve modules by LMC_KEEP. "make kvmconfig" Enable additional options for kvm guest kernel support. diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index c9d0a4a8efb3..f3355bd86aa5 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -123,7 +123,9 @@ help: @echo ' gconfig - Update current config utilising a GTK+ based front-end' @echo ' oldconfig - Update current config utilising a provided .config as base' @echo ' localmodconfig - Update current config disabling modules not loaded' + @echo ' except those preserved by LMC_KEEP environment variable' @echo ' localyesconfig - Update current config converting local mods to core' + @echo ' except those preserved by LMC_KEEP environment variable' @echo ' defconfig - New config with default from ARCH supplied defconfig' @echo ' savedefconfig - Save current config as ./defconfig (minimal config)' @echo ' allnoconfig - New config where all options are answered with no' diff --git a/scripts/kconfig/streamline_config.pl b/scripts/kconfig/streamline_config.pl index e2f8504f5a2d..19857d18d814 100755 --- a/scripts/kconfig/streamline_config.pl +++ b/scripts/kconfig/streamline_config.pl @@ -143,6 +143,7 @@ my %depends; my %selects; my %prompts; my %objects; +my %config2kfile; my $var; my $iflevel = 0; my @ifdeps; @@ -201,6 +202,7 @@ sub read_kconfig { if (/^\s*(menu)?config\s+(\S+)\s*$/) { $state = "NEW"; $config = $2; + 
$config2kfile{"CONFIG_$config"} = $kconfig; # Add depends for 'if' nesting for (my $i = 0; $i < $iflevel; $i++) { @@ -591,6 +593,20 @@ while ($repeat) { } my %setconfigs; +my @preserved_kconfigs = split(/:/,$ENV{LMC_KEEP}); + +sub in_preserved_kconfigs { + my $kconfig = $config2kfile{$_[0]}; + if (!defined($kconfig)) { + return 0; + } + foreach my $excl (@preserved_kconfigs) { + if($kconfig =~ /^$excl/) { + return 1; + } + } + return 0; +} # Finally, read the .config file and turn off any module enabled that # we could not find a reason to keep enabled. @@ -644,6 +660,11 @@ foreach my $line (@config_file) { } if (/^(CONFIG.*)=(m|y)/) { + if (in_preserved_kconfigs($1)) { + dprint "Preserve config $1"; + print; + next; + } if (defined($configs{$1})) { if ($localyesconfig) { $setconfigs{$1} = 'y'; From ca91ddef2e438c1eaedf92722f66e8c235d373a7 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 5 May 2020 18:13:14 +0200 Subject: [PATCH 087/427] soc: bcm2835: Add notify xHCI reset property The property is needed in order to trigger VL805's firmware load. Note that gap between the property introduced and the previous one is due to the properties not being defined. 
Link: https://lore.kernel.org/r/20200505161318.26200-2-nsaenzjulienne@suse.de Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Lorenzo Pieralisi Reviewed-by: Florian Fainelli Reviewed-by: Rob Herring --- include/soc/bcm2835/raspberrypi-firmware.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/soc/bcm2835/raspberrypi-firmware.h b/include/soc/bcm2835/raspberrypi-firmware.h index 7800e12ee042..cc9cdbc66403 100644 --- a/include/soc/bcm2835/raspberrypi-firmware.h +++ b/include/soc/bcm2835/raspberrypi-firmware.h @@ -90,7 +90,7 @@ enum rpi_firmware_property_tag { RPI_FIRMWARE_SET_PERIPH_REG = 0x00038045, RPI_FIRMWARE_GET_POE_HAT_VAL = 0x00030049, RPI_FIRMWARE_SET_POE_HAT_VAL = 0x00030050, - + RPI_FIRMWARE_NOTIFY_XHCI_RESET = 0x00030058, /* Dispmanx TAGS */ RPI_FIRMWARE_FRAMEBUFFER_ALLOCATE = 0x00040001, From fbbc5ff3f7f9f4cad562e530ae2cf5d8964fe6d3 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 5 May 2020 18:13:15 +0200 Subject: [PATCH 088/427] firmware: raspberrypi: Introduce vl805 init routine The Raspberry Pi 4 gets its USB functionality from VL805, a PCIe chip that implements xHCI. After a PCI reset, VL805's firmware may either be loaded directly from an EEPROM or, if not present, by the SoC's co-processor, VideoCore. RPi4's VideoCore OS contains both the non public firmware load logic and the VL805 firmware blob. The function this patch introduces triggers the aforementioned process. 
Link: https://lore.kernel.org/r/20200505161318.26200-3-nsaenzjulienne@suse.de Tested-by: Stefan Wahren Tested-by: Stefan Wahren Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- drivers/firmware/raspberrypi.c | 61 ++++++++++++++++++++++ include/soc/bcm2835/raspberrypi-firmware.h | 7 +++ 2 files changed, 68 insertions(+) diff --git a/drivers/firmware/raspberrypi.c b/drivers/firmware/raspberrypi.c index da26a584dca0..a166ad0cec2c 100644 --- a/drivers/firmware/raspberrypi.c +++ b/drivers/firmware/raspberrypi.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #define MBOX_MSG(chan, data28) (((data28) & ~0xf) | ((chan) & 0xf)) @@ -19,6 +21,8 @@ #define MBOX_DATA28(msg) ((msg) & ~0xf) #define MBOX_CHAN_PROPERTY 8 +#define VL805_PCI_CONFIG_VERSION_OFFSET 0x50 + static struct platform_device *rpi_hwmon; static struct platform_device *rpi_clk; @@ -286,6 +290,63 @@ struct rpi_firmware *rpi_firmware_get(struct device_node *firmware_node) } EXPORT_SYMBOL_GPL(rpi_firmware_get); +/* + * The Raspberry Pi 4 gets its USB functionality from VL805, a PCIe chip that + * implements xHCI. After a PCI reset, VL805's firmware may either be loaded + * directly from an EEPROM or, if not present, by the SoC's co-processor, + * VideoCore. RPi4's VideoCore OS contains both the non public firmware load + * logic and the VL805 firmware blob. This function triggers the aforementioned + * process. + */ +int rpi_firmware_init_vl805(struct pci_dev *pdev) +{ + struct device_node *fw_np; + struct rpi_firmware *fw; + u32 dev_addr, version; + int ret; + + fw_np = of_find_compatible_node(NULL, NULL, + "raspberrypi,bcm2835-firmware"); + if (!fw_np) + return 0; + + fw = rpi_firmware_get(fw_np); + of_node_put(fw_np); + if (!fw) + return -ENODEV; + + /* + * Make sure we don't trigger a firmware load unnecessarily. 
+ * + * If something went wrong with PCI, this whole exercise would be + * futile as VideoCore expects from us a configured PCI bus. Just take + * the faulty version (likely ~0) and let xHCI's registration fail + * further down the line. + */ + pci_read_config_dword(pdev, VL805_PCI_CONFIG_VERSION_OFFSET, &version); + if (version) + goto exit; + + dev_addr = pdev->bus->number << 20 | PCI_SLOT(pdev->devfn) << 15 | + PCI_FUNC(pdev->devfn) << 12; + + ret = rpi_firmware_property(fw, RPI_FIRMWARE_NOTIFY_XHCI_RESET, + &dev_addr, sizeof(dev_addr)); + if (ret) + return ret; + + /* Wait for vl805 to startup */ + usleep_range(200, 1000); + + pci_read_config_dword(pdev, VL805_PCI_CONFIG_VERSION_OFFSET, + &version); +exit: + pci_info(pdev, "VL805 firmware version %08x\n", version); + + return 0; +} +EXPORT_SYMBOL_GPL(rpi_firmware_init_vl805); + static const struct of_device_id rpi_firmware_of_match[] = { { .compatible = "raspberrypi,bcm2835-firmware", }, {}, diff --git a/include/soc/bcm2835/raspberrypi-firmware.h b/include/soc/bcm2835/raspberrypi-firmware.h index cc9cdbc66403..3025aca3c358 100644 --- a/include/soc/bcm2835/raspberrypi-firmware.h +++ b/include/soc/bcm2835/raspberrypi-firmware.h @@ -10,6 +10,7 @@ #include struct rpi_firmware; +struct pci_dev; enum rpi_firmware_property_status { RPI_FIRMWARE_STATUS_REQUEST = 0, @@ -141,6 +142,7 @@ int rpi_firmware_property(struct rpi_firmware *fw, int rpi_firmware_property_list(struct rpi_firmware *fw, void *data, size_t tag_size); struct rpi_firmware *rpi_firmware_get(struct device_node *firmware_node); +int rpi_firmware_init_vl805(struct pci_dev *pdev); #else static inline int rpi_firmware_property(struct rpi_firmware *fw, u32 tag, void *data, size_t len) @@ -158,6 +160,11 @@ static inline struct rpi_firmware *rpi_firmware_get(struct device_node *firmware { return NULL; } + +static inline int rpi_firmware_init_vl805(struct pci_dev *pdev) +{ + return 0; +} #endif #endif /* __SOC_RASPBERRY_FIRMWARE_H__ */ From 
44331189f9082c7e659697bbac1747db3def73e7 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 5 May 2020 18:13:16 +0200 Subject: [PATCH 089/427] PCI: brcmstb: Wait for Raspberry Pi's firmware when present xHCI's PCI fixup, run at the end of pcie-brcmstb's probe, depends on RPi4's VideoCore firmware interface to be up and running. It's possible for both initializations to race, so make sure it's available prior to starting. Link: https://lore.kernel.org/r/20200505161318.26200-4-nsaenzjulienne@suse.de Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Lorenzo Pieralisi Reviewed-by: Florian Fainelli Reviewed-by: Rob Herring --- drivers/pci/controller/pcie-brcmstb.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index 752f5b331579..7730ea845ff2 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -28,6 +28,8 @@ #include #include +#include + #include "../pci.h" /* BRCM_PCIE_CAP_REGS - Offset for the mandatory capability config regs */ @@ -929,11 +931,26 @@ static int brcm_pcie_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node, *msi_np; struct pci_host_bridge *bridge; + struct device_node *fw_np; struct brcm_pcie *pcie; struct pci_bus *child; struct resource *res; int ret; + /* + * We have to wait for Raspberry Pi's firmware interface to be up as a + * PCI fixup, rpi_firmware_init_vl805(), depends on it. This driver's + * probe can race with the firmware interface's (see + * drivers/firmware/raspberrypi.c) and potentially break the PCI fixup. 
+ */ + fw_np = of_find_compatible_node(NULL, NULL, + "raspberrypi,bcm2835-firmware"); + if (fw_np && !rpi_firmware_get(fw_np)) { + of_node_put(fw_np); + return -EPROBE_DEFER; + } + of_node_put(fw_np); + bridge = devm_pci_alloc_host_bridge(&pdev->dev, sizeof(*pcie)); if (!bridge) return -ENOMEM; From c65822fef4adc0ba40c37a47337376ce75f7a7bc Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 5 May 2020 18:13:17 +0200 Subject: [PATCH 090/427] USB: pci-quirks: Add Raspberry Pi 4 quirk On the Raspberry Pi 4, after a PCI reset, VL805's firmware may either be loaded directly from an EEPROM or, if not present, by the SoC's VideoCore. Inform VideoCore that VL805 was just reset. Also, as this creates a dependency between USB_PCI and VideoCore's firmware interface, and since USB_PCI can't be set as a module, neither can this. Reflect that on the firmware interface Kconfig. Link: https://lore.kernel.org/r/20200505161318.26200-5-nsaenzjulienne@suse.de Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Mathias Nyman --- drivers/firmware/Kconfig | 3 ++- drivers/usb/host/pci-quirks.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index 8007d4aa76dc..b42140cff8ac 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -178,8 +178,9 @@ config ISCSI_IBFT Otherwise, say N. config RASPBERRYPI_FIRMWARE - tristate "Raspberry Pi Firmware Driver" + bool "Raspberry Pi Firmware Driver" depends on BCM2835_MBOX + default USB_PCI help This option enables support for communicating with the firmware on the Raspberry Pi.
diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index beb2efa71341..0dc34668bb2a 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -16,6 +16,9 @@ #include #include #include + +#include + #include "pci-quirks.h" #include "xhci-ext-caps.h" @@ -1243,11 +1246,24 @@ iounmap: static void quirk_usb_early_handoff(struct pci_dev *pdev) { + int ret; + /* Skip Netlogic mips SoC's internal PCI USB controller. * This device does not need/support EHCI/OHCI handoff */ if (pdev->vendor == 0x184e) /* vendor Netlogic */ return; + + if (pdev->vendor == PCI_VENDOR_ID_VIA && pdev->device == 0x3483) { + ret = rpi_firmware_init_vl805(pdev); + if (ret) { + /* Firmware might be outdated, or something failed */ + dev_warn(&pdev->dev, + "Failed to load VL805's firmware: %d. Will continue to attempt to work, but bad things might happen. You should fix this...\n", + ret); + } + } + if (pdev->class != PCI_CLASS_SERIAL_USB_UHCI && pdev->class != PCI_CLASS_SERIAL_USB_OHCI && pdev->class != PCI_CLASS_SERIAL_USB_EHCI && From d9d200bcebc1f6e56f0178cbb8db9953e8cc9a11 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 13 May 2020 15:32:08 +0200 Subject: [PATCH 091/427] dma-mapping: add generic helpers for mapping sgtable objects struct sg_table is a common structure used for describing a memory buffer. It consists of a scatterlist with memory pages and DMA addresses (sgl entry), as well as the number of scatterlist entries: CPU pages (orig_nents entry) and DMA mapped pages (nents entry). It turned out that it was a common mistake to misuse nents and orig_nents entries, calling DMA-mapping functions with a wrong number of entries or ignoring the number of mapped entries returned by the dma_map_sg function. To avoid such issues, let's introduce a common wrappers operating directly on the struct sg_table objects, which take care of the proper use of the nents and orig_nents entries. 
Signed-off-by: Marek Szyprowski Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 80 +++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 330ad58fbf4d..936e30b86cd3 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -609,6 +609,86 @@ static inline void dma_sync_single_range_for_device(struct device *dev, return dma_sync_single_for_device(dev, addr + offset, size, dir); } +/** + * dma_map_sgtable - Map the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Maps a buffer described by a scatterlist stored in the given sg_table + * object for the @dir DMA operation by the @dev device. After success the + * ownership for the buffer is transferred to the DMA domain. One has to + * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the + * ownership of the buffer back to the CPU domain before touching the + * buffer by the CPU. + * + * Returns 0 on success or -EINVAL on error during mapping the buffer. + */ +static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + int nents; + + nents = dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); + if (nents <= 0) + return -EINVAL; + sgt->nents = nents; + return 0; +} + +/** + * dma_unmap_sgtable - Unmap the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the unmap operation + * + * Unmaps a buffer described by a scatterlist stored in the given sg_table + * object for the @dir DMA operation by the @dev device. 
After this function + * the ownership of the buffer is transferred back to the CPU domain. + */ +static inline void dma_unmap_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + dma_unmap_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); +} + +/** + * dma_sync_sgtable_for_cpu - Synchronize the given buffer for CPU access + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * + * Performs the needed cache synchronization and moves the ownership of the + * buffer back to the CPU domain, so it is safe to perform any access to it + * by the CPU. Before doing any further DMA operations, one has to transfer + * the ownership of the buffer back to the DMA domain by calling the + * dma_sync_sgtable_for_device(). + */ +static inline void dma_sync_sgtable_for_cpu(struct device *dev, + struct sg_table *sgt, enum dma_data_direction dir) +{ + dma_sync_sg_for_cpu(dev, sgt->sgl, sgt->orig_nents, dir); +} + +/** + * dma_sync_sgtable_for_device - Synchronize the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * + * Performs the needed cache synchronization and moves the ownership of the + * buffer back to the DMA domain, so it is safe to perform the DMA operation. + * Once finished, one has to call dma_sync_sgtable_for_cpu() or + * dma_unmap_sgtable(). 
+ */ +static inline void dma_sync_sgtable_for_device(struct device *dev, + struct sg_table *sgt, enum dma_data_direction dir) +{ + dma_sync_sg_for_device(dev, sgt->sgl, sgt->orig_nents, dir); +} + #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) From 709d6d73c756107fb8a292a9f957d630097425fa Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 13 May 2020 15:32:09 +0200 Subject: [PATCH 092/427] scatterlist: add generic wrappers for iterating over sgtable objects struct sg_table is a common structure used for describing a memory buffer. It consists of a scatterlist with memory pages and DMA addresses (sgl entry), as well as the number of scatterlist entries: CPU pages (orig_nents entry) and DMA mapped pages (nents entry). It turned out that it was a common mistake to misuse nents and orig_nents entries, calling the scatterlist iterating functions with a wrong number of the entries. To avoid such issues, let's introduce common wrappers operating directly on the struct sg_table objects, which take care of the proper use of the nents and orig_nents entries. While touching this, let's clarify some ambiguities in the comments for the existing for_each helpers. Signed-off-by: Marek Szyprowski Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- include/linux/scatterlist.h | 50 ++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 6eec50fb36c8..4f922afb607a 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -151,6 +151,20 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, #define for_each_sg(sglist, sg, nr, __i) \ for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) +/* + * Loop over each sg element in the given sg_table object.
+ */ +#define for_each_sgtable_sg(sgt, sg, i) \ + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) + +/* + * Loop over each sg element in the given *DMA mapped* sg_table object. + * Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses + * of the each element. + */ +#define for_each_sgtable_dma_sg(sgt, sg, i) \ + for_each_sg(sgt->sgl, sg, sgt->nents, i) + /** * sg_chain - Chain two sglists together * @prv: First scatterlist @@ -401,9 +415,10 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) * @sglist: sglist to iterate over * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over - * @pgoffset: starting page offset + * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_page() to get each page pointer. + * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ @@ -412,18 +427,47 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) /** * for_each_sg_dma_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over - * @dma_iter: page iterator to hold current page + * @dma_iter: DMA page iterator to hold current page * @dma_nents: maximum number of sg entries to iterate over, this is the value * returned from dma_map_sg - * @pgoffset: starting page offset + * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_dma_address() to get each page's DMA address. + * In each loop it operates on PAGE_SIZE unit. 
*/ #define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ pgoffset); \ __sg_page_iter_dma_next(dma_iter);) +/** + * for_each_sgtable_page - iterate over all pages in the sg_table object + * @sgt: sg_table object to iterate over + * @piter: page iterator to hold current page + * @pgoffset: starting page offset (in pages) + * + * Iterates over the all memory pages in the buffer described by + * a scatterlist stored in the given sg_table object. + * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit. + */ +#define for_each_sgtable_page(sgt, piter, pgoffset) \ + for_each_sg_page(sgt->sgl, piter, sgt->orig_nents, pgoffset) + +/** + * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object + * @sgt: sg_table object to iterate over + * @dma_iter: DMA page iterator to hold current page + * @pgoffset: starting page offset (in pages) + * + * Iterates over the all DMA mapped pages in the buffer described by + * a scatterlist stored in the given sg_table object. + * See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE + * unit. + */ +#define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \ + for_each_sg_dma_page(sgt->sgl, dma_iter, sgt->nents, pgoffset) + + /* * Mapping sg iterator * From 48530d9fab0d3bf08827f9167be54acf66d4d457 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 13 May 2020 15:32:10 +0200 Subject: [PATCH 093/427] iommu: add generic helper for mapping sgtable objects struct sg_table is a common structure used for describing a memory buffer. It consists of a scatterlist with memory pages and DMA addresses (sgl entry), as well as the number of scatterlist entries: CPU pages (orig_nents entry) and DMA mapped pages (nents entry). It turned out that it was a common mistake to misuse nents and orig_nents entries, calling mapping functions with a wrong number of entries. 
To avoid such issues, let's introduce a common wrapper operating directly on the struct sg_table objects, which takes care of the proper use of the nents and orig_nents entries. Signed-off-by: Marek Szyprowski Acked-by: Joerg Roedel Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- include/linux/iommu.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ef8b0bda695..b1bfbe6dff42 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -466,6 +466,22 @@ extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t io extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token); +/** + * iommu_map_sgtable - Map the given buffer to the IOMMU domain + * @domain: The IOMMU domain to perform the mapping + * @iova: The start address to map the buffer + * @sgt: The sg_table object describing the buffer + * @prot: IOMMU protection bits + * + * Creates a mapping at @iova for the buffer described by a scatterlist + * stored in the given sg_table object in the provided IOMMU domain. + */ +static inline size_t iommu_map_sgtable(struct iommu_domain *domain, + unsigned long iova, struct sg_table *sgt, int prot) +{ + return iommu_map_sg(domain, iova, sgt->sgl, sgt->orig_nents, prot); +} + extern void iommu_get_resv_regions(struct device *dev, struct list_head *list); extern void iommu_put_resv_regions(struct device *dev, struct list_head *list); extern void generic_iommu_put_resv_regions(struct device *dev, From ef5fd681d5159d64c464715d657660f0151c7419 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Wed, 15 Apr 2020 15:25:42 +0800 Subject: [PATCH 094/427] ext4: remove redundant variable has_bigalloc in ext4_fill_super We can use the ext4_has_feature_bigalloc() function directly to check the bigalloc feature and the variable has_bigalloc is redundant, so remove it.
Signed-off-by: Kaixu Xia Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/1586935542-29588-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 79e07e69cef9..49821d8a0910 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3679,7 +3679,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int blocksize, clustersize; unsigned int db_count; unsigned int i; - int needs_recovery, has_huge_files, has_bigalloc; + int needs_recovery, has_huge_files; __u64 blocks_count; int err = 0; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -4194,8 +4194,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); - has_bigalloc = ext4_has_feature_bigalloc(sb); - if (has_bigalloc) { + if (ext4_has_feature_bigalloc(sb)) { if (clustersize < blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " From 6ae72bfa656ea04806f98ef85cb44b0789064362 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 9 May 2020 18:19:28 +0800 Subject: [PATCH 095/427] PCI: Unify pcie_find_root_port() and pci_find_pcie_root_port() Previously we used pcie_find_root_port() to find a Root Port from a PCIe device and pci_find_pcie_root_port() to find a Root Port from a Conventional PCI device. Unify the two functions and use pcie_find_root_port() to find a Root Port from either a Conventional PCI device or a PCIe device. Then there is no need to distinguish the type of the device. 
Link: https://lore.kernel.org/r/1589019568-5216-1-git-send-email-yangyicong@hisilicon.com Signed-off-by: Yicong Yang Signed-off-by: Bjorn Helgaas Acked-by: Kalle Valo # wireless Acked-by: Mika Westerberg # thunderbolt --- drivers/pci/pci-acpi.c | 2 +- drivers/pci/pci.c | 24 ------------------------ drivers/pci/probe.c | 2 +- drivers/pci/quirks.c | 2 +- drivers/thunderbolt/switch.c | 4 ++-- include/linux/pci.h | 23 ++++++++++++++--------- 6 files changed, 19 insertions(+), 38 deletions(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index d21969fba6ab..d820a55ae71c 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -948,7 +948,7 @@ static bool acpi_pci_bridge_d3(struct pci_dev *dev) * Look for a special _DSD property for the root port and if it * is set we know the hierarchy behind it supports D3 just fine. */ - root = pci_find_pcie_root_port(dev); + root = pcie_find_root_port(dev); if (!root) return false; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index bb78f580814e..227a3a979ec4 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -751,30 +751,6 @@ struct resource *pci_find_resource(struct pci_dev *dev, struct resource *res) } EXPORT_SYMBOL(pci_find_resource); -/** - * pci_find_pcie_root_port - return PCIe Root Port - * @dev: PCI device to query - * - * Traverse up the parent chain and return the PCIe Root Port PCI Device - * for a given PCI Device. 
- */ -struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev) -{ - struct pci_dev *bridge, *highest_pcie_bridge = dev; - - bridge = pci_upstream_bridge(dev); - while (bridge && pci_is_pcie(bridge)) { - highest_pcie_bridge = bridge; - bridge = pci_upstream_bridge(bridge); - } - - if (pci_pcie_type(highest_pcie_bridge) != PCI_EXP_TYPE_ROOT_PORT) - return NULL; - - return highest_pcie_bridge; -} -EXPORT_SYMBOL(pci_find_pcie_root_port); - /** * pci_wait_for_pending - wait for @mask bit(s) to clear in status word @pos * @dev: the PCI device to operate on diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 77b8a145c39b..cdff469ba070 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2056,7 +2056,7 @@ static void pci_configure_relaxed_ordering(struct pci_dev *dev) * For now, we only deal with Relaxed Ordering issues with Root * Ports. Peer-to-Peer DMA is another can of worms. */ - root = pci_find_pcie_root_port(dev); + root = pcie_find_root_port(dev); if (!root) return; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 28c9a2409c50..885044d050a6 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4319,7 +4319,7 @@ DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_AMD, 0x1a02, PCI_CLASS_NOT_DEFINED, */ static void quirk_disable_root_port_attributes(struct pci_dev *pdev) { - struct pci_dev *root_port = pci_find_pcie_root_port(pdev); + struct pci_dev *root_port = pcie_find_root_port(pdev); if (!root_port) { pci_warn(pdev, "PCIe Completion erratum may cause device errors\n"); diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index a2ce99051c51..d92c7554520b 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -263,7 +263,7 @@ static void nvm_authenticate_start_dma_port(struct tb_switch *sw) * itself. To be on the safe side keep the root port in D0 during * the whole upgrade process. 
*/ - root_port = pci_find_pcie_root_port(sw->tb->nhi->pdev); + root_port = pcie_find_root_port(sw->tb->nhi->pdev); if (root_port) pm_runtime_get_noresume(&root_port->dev); } @@ -272,7 +272,7 @@ static void nvm_authenticate_complete_dma_port(struct tb_switch *sw) { struct pci_dev *root_port; - root_port = pci_find_pcie_root_port(sw->tb->nhi->pdev); + root_port = pcie_find_root_port(sw->tb->nhi->pdev); if (root_port) pm_runtime_put(&root_port->dev); } diff --git a/include/linux/pci.h b/include/linux/pci.h index 0453ee458ab1..bbd6510065a7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1025,7 +1025,6 @@ void pci_bus_add_device(struct pci_dev *dev); void pci_read_bridge_bases(struct pci_bus *child); struct resource *pci_find_parent_resource(const struct pci_dev *dev, struct resource *res); -struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev); u8 pci_swizzle_interrupt_pin(const struct pci_dev *dev, u8 pin); int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge); u8 pci_common_swizzle(struct pci_dev *dev, u8 *pinp); @@ -2143,17 +2142,23 @@ static inline int pci_pcie_type(const struct pci_dev *dev) return (pcie_caps_reg(dev) & PCI_EXP_FLAGS_TYPE) >> 4; } +/** + * pcie_find_root_port - Get the PCIe root port device + * @dev: PCI device + * + * Traverse up the parent chain and return the PCIe Root Port PCI Device + * for a given PCI/PCIe Device. 
+ */ static inline struct pci_dev *pcie_find_root_port(struct pci_dev *dev) { - while (1) { - if (!pci_is_pcie(dev)) - break; - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) - return dev; - if (!dev->bus->self) - break; - dev = dev->bus->self; + struct pci_dev *bridge = pci_upstream_bridge(dev); + + while (bridge) { + if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) + return bridge; + bridge = pci_upstream_bridge(bridge); } + return NULL; } From 1b54ae8327a4d630111c8d88ba7906483ec6010b Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 13 May 2020 17:38:58 -0500 Subject: [PATCH 096/427] PCI: Fix pci_register_host_bridge() device_register() error handling If device_register() has an error, we should bail out of pci_register_host_bridge() rather than continuing on. Fixes: 37d6a0a6f470 ("PCI: Add pci_register_host_bridge() interface") Link: https://lore.kernel.org/r/20200513223859.11295-1-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Bjorn Helgaas Reviewed-by: Lorenzo Pieralisi Reviewed-by: Arnd Bergmann --- drivers/pci/probe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 77b8a145c39b..e21dc71b1907 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -909,9 +909,10 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) goto free; err = device_register(&bridge->dev); - if (err) + if (err) { put_device(&bridge->dev); - + goto free; + } bus->bridge = get_device(&bridge->dev); device_enable_async_suspend(bus->bridge); pci_set_bus_of_node(bus); From 9885440b16b8fc1dd7275800fd28f56a92f60896 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 13 May 2020 17:38:59 -0500 Subject: [PATCH 097/427] PCI: Fix pci_host_bridge struct device release/free handling The PCI code has several paths where the struct pci_host_bridge is freed directly. This is wrong because it contains a struct device which is refcounted and should be freed using put_device(). 
This can result in use-after-free errors. I think this problem has existed since 2012 with commit 7b5436635800 ("PCI: add generic device into pci_host_bridge struct"). It generally hasn't mattered as most host bridge drivers are still built-in and can't unbind. The problem is a struct device should never be freed directly once device_initialize() is called and a ref is held, but that doesn't happen until pci_register_host_bridge(). There's then a window between allocating the host bridge and pci_register_host_bridge() where kfree should be used. This is fragile and requires callers to do the right thing. To fix this, we need to split device_register() into device_initialize() and device_add() calls, so that the host bridge struct is always freed by using a put_device(). devm_pci_alloc_host_bridge() is using devm_kzalloc() to allocate struct pci_host_bridge which will be freed directly. Instead, we can use a custom devres action to call put_device(). Link: https://lore.kernel.org/r/20200513223859.11295-2-robh@kernel.org Reported-by: Anders Roxell Tested-by: Anders Roxell Signed-off-by: Rob Herring Signed-off-by: Bjorn Helgaas Reviewed-by: Lorenzo Pieralisi Acked-by: Arnd Bergmann --- drivers/pci/probe.c | 36 +++++++++++++++++++----------------- drivers/pci/remove.c | 2 +- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index e21dc71b1907..e064ded6fbec 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -565,7 +565,7 @@ static struct pci_bus *pci_alloc_bus(struct pci_bus *parent) return b; } -static void devm_pci_release_host_bridge_dev(struct device *dev) +static void pci_release_host_bridge_dev(struct device *dev) { struct pci_host_bridge *bridge = to_pci_host_bridge(dev); @@ -574,12 +574,7 @@ static void devm_pci_release_host_bridge_dev(struct device *dev) pci_free_resource_list(&bridge->windows); pci_free_resource_list(&bridge->dma_ranges); -} - -static void pci_release_host_bridge_dev(struct 
device *dev) -{ - devm_pci_release_host_bridge_dev(dev); - kfree(to_pci_host_bridge(dev)); + kfree(bridge); } static void pci_init_host_bridge(struct pci_host_bridge *bridge) @@ -599,6 +594,8 @@ static void pci_init_host_bridge(struct pci_host_bridge *bridge) bridge->native_pme = 1; bridge->native_ltr = 1; bridge->native_dpc = 1; + + device_initialize(&bridge->dev); } struct pci_host_bridge *pci_alloc_host_bridge(size_t priv) @@ -616,17 +613,25 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv) } EXPORT_SYMBOL(pci_alloc_host_bridge); +static void devm_pci_alloc_host_bridge_release(void *data) +{ + pci_free_host_bridge(data); +} + struct pci_host_bridge *devm_pci_alloc_host_bridge(struct device *dev, size_t priv) { + int ret; struct pci_host_bridge *bridge; - bridge = devm_kzalloc(dev, sizeof(*bridge) + priv, GFP_KERNEL); + bridge = pci_alloc_host_bridge(priv); if (!bridge) return NULL; - pci_init_host_bridge(bridge); - bridge->dev.release = devm_pci_release_host_bridge_dev; + ret = devm_add_action_or_reset(dev, devm_pci_alloc_host_bridge_release, + bridge); + if (ret) + return NULL; return bridge; } @@ -634,10 +639,7 @@ EXPORT_SYMBOL(devm_pci_alloc_host_bridge); void pci_free_host_bridge(struct pci_host_bridge *bridge) { - pci_free_resource_list(&bridge->windows); - pci_free_resource_list(&bridge->dma_ranges); - - kfree(bridge); + put_device(&bridge->dev); } EXPORT_SYMBOL(pci_free_host_bridge); @@ -908,7 +910,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) if (err) goto free; - err = device_register(&bridge->dev); + err = device_add(&bridge->dev); if (err) { put_device(&bridge->dev); goto free; @@ -978,7 +980,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) unregister: put_device(&bridge->dev); - device_unregister(&bridge->dev); + device_del(&bridge->dev); free: kfree(bus); @@ -2953,7 +2955,7 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, return bridge->bus; err_out: - 
kfree(bridge); + put_device(&bridge->dev); return NULL; } EXPORT_SYMBOL_GPL(pci_create_root_bus); diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index e9c6b120cf45..95dec03d9f2a 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -160,6 +160,6 @@ void pci_remove_root_bus(struct pci_bus *bus) host_bridge->bus = NULL; /* remove the host bridge */ - device_unregister(&host_bridge->dev); + device_del(&host_bridge->dev); } EXPORT_SYMBOL_GPL(pci_remove_root_bus); From aa0ce96d72dd2e1b0dfd0fb868f82876e7790878 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Fri, 27 Mar 2020 14:16:15 -0700 Subject: [PATCH 098/427] PCI: Program MPS for RCiEP devices Root Complex Integrated Endpoints (RCiEPs) do not have an upstream bridge, so pci_configure_mps() previously ignored them, which may result in reduced performance. Instead, program the Max_Payload_Size of RCiEPs to the maximum supported value (unless it is limited for the PCIE_BUS_PEER2PEER case). This also affects the subsequent programming of Max_Read_Request_Size because Linux programs MRRS based on the MPS value. 
Fixes: 9dae3a97297f ("PCI: Move MPS configuration check to pci_configure_device()") Link: https://lore.kernel.org/r/1585343775-4019-1-git-send-email-ashok.raj@intel.com Tested-by: Dave Jiang Signed-off-by: Ashok Raj Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org --- drivers/pci/probe.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index e064ded6fbec..27c43ff63cd5 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1937,13 +1937,33 @@ static void pci_configure_mps(struct pci_dev *dev) struct pci_dev *bridge = pci_upstream_bridge(dev); int mps, mpss, p_mps, rc; - if (!pci_is_pcie(dev) || !bridge || !pci_is_pcie(bridge)) + if (!pci_is_pcie(dev)) return; /* MPS and MRRS fields are of type 'RsvdP' for VFs, short-circuit out */ if (dev->is_virtfn) return; + /* + * For Root Complex Integrated Endpoints, program the maximum + * supported value unless limited by the PCIE_BUS_PEER2PEER case. + */ + if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) { + if (pcie_bus_config == PCIE_BUS_PEER2PEER) + mps = 128; + else + mps = 128 << dev->pcie_mpss; + rc = pcie_set_mps(dev, mps); + if (rc) { + pci_warn(dev, "can't set Max Payload Size to %d; if necessary, use \"pci=pcie_bus_safe\" and report a bug\n", + mps); + } + return; + } + + if (!bridge || !pci_is_pcie(bridge)) + return; + mps = pcie_get_mps(dev); p_mps = pcie_get_mps(bridge); From 27f5411a718c431c20007e3a2fbba6589942d04f Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Mon, 20 Apr 2020 16:46:59 +0300 Subject: [PATCH 099/427] dm crypt: support using encrypted keys Allow one to use "encrypted" in addition to "user" and "logon" key types for device encryption. 
Signed-off-by: Dmitry Baryshkov Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 1 + drivers/md/dm-crypt.c | 76 ++++++++++++++++++++++++++++++++----------- 2 files changed, 58 insertions(+), 19 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index d6d5ab23c088..8b0c646d2f59 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -269,6 +269,7 @@ config DM_UNSTRIPED config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM + depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3df90daba89e..91787cde369b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -34,7 +34,9 @@ #include #include #include /* for struct rtattr and RTA macros only */ +#include #include +#include #include @@ -2215,12 +2217,47 @@ static bool contains_whitespace(const char *str) return false; } +static int set_key_user(struct crypt_config *cc, struct key *key) +{ + const struct user_key_payload *ukp; + + ukp = user_key_payload_locked(key); + if (!ukp) + return -EKEYREVOKED; + + if (cc->key_size != ukp->datalen) + return -EINVAL; + + memcpy(cc->key, ukp->data, cc->key_size); + + return 0; +} + +#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) +static int set_key_encrypted(struct crypt_config *cc, struct key *key) +{ + const struct encrypted_key_payload *ekp; + + ekp = key->payload.data[0]; + if (!ekp) + return -EKEYREVOKED; + + if (cc->key_size != ekp->decrypted_datalen) + return -EINVAL; + + memcpy(cc->key, ekp->decrypted_data, cc->key_size); + + return 0; +} +#endif /* CONFIG_ENCRYPTED_KEYS */ + static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) { char *new_key_string, *key_desc; int ret; + struct key_type *type; struct key *key; - const struct user_key_payload *ukp; + int (*set_key)(struct crypt_config *cc, struct key *key); /* * Reject key_string with whitespace. 
dm core currently lacks code for @@ -2236,16 +2273,26 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string if (!key_desc || key_desc == key_string || !strlen(key_desc + 1)) return -EINVAL; - if (strncmp(key_string, "logon:", key_desc - key_string + 1) && - strncmp(key_string, "user:", key_desc - key_string + 1)) + if (!strncmp(key_string, "logon:", key_desc - key_string + 1)) { + type = &key_type_logon; + set_key = set_key_user; + } else if (!strncmp(key_string, "user:", key_desc - key_string + 1)) { + type = &key_type_user; + set_key = set_key_user; +#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) + } else if (!strncmp(key_string, "encrypted:", key_desc - key_string + 1)) { + type = &key_type_encrypted; + set_key = set_key_encrypted; +#endif + } else { return -EINVAL; + } new_key_string = kstrdup(key_string, GFP_KERNEL); if (!new_key_string) return -ENOMEM; - key = request_key(key_string[0] == 'l' ? &key_type_logon : &key_type_user, - key_desc + 1, NULL); + key = request_key(type, key_desc + 1, NULL); if (IS_ERR(key)) { kzfree(new_key_string); return PTR_ERR(key); @@ -2253,23 +2300,14 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string down_read(&key->sem); - ukp = user_key_payload_locked(key); - if (!ukp) { + ret = set_key(cc, key); + if (ret < 0) { up_read(&key->sem); key_put(key); kzfree(new_key_string); - return -EKEYREVOKED; + return ret; } - if (cc->key_size != ukp->datalen) { - up_read(&key->sem); - key_put(key); - kzfree(new_key_string); - return -EINVAL; - } - - memcpy(cc->key, ukp->data, cc->key_size); - up_read(&key->sem); key_put(key); @@ -2323,7 +2361,7 @@ static int get_key_size(char **key_string) return (*key_string[0] == ':') ? 
-EINVAL : strlen(*key_string) >> 1; } -#endif +#endif /* CONFIG_KEYS */ static int crypt_set_key(struct crypt_config *cc, char *key) { @@ -3282,7 +3320,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 20, 0}, + .version = {1, 21, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, From 2361ae595352dec015d14292f1b539242d8446d6 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 20 Apr 2020 22:29:09 +0200 Subject: [PATCH 100/427] dm mpath: switch paths in dm_blk_ioctl() code path SCSI LUN passthrough code such as qemu's "scsi-block" device model pass every IO to the host via SG_IO ioctls. Currently, dm-multipath calls choose_pgpath() only in the block IO code path, not in the ioctl code path (unless current_pgpath is NULL). This has the effect that no path switching and thus no load balancing is done for SCSI-passthrough IO, unless the active path fails. Fix this by using the same logic in multipath_prepare_ioctl() as in multipath_clone_and_map(). Note: The allegedly best path selection algorithm, service-time, still wouldn't work perfectly, because the io size of the current request is always set to 0. Changing that for the IO passthrough case would require the ioctl cmd and arg to be passed to dm's prepare_ioctl() method. 
Signed-off-by: Martin Wilck Reviewed-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 3e500098132f..e0c800cf87a9 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1918,7 +1918,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, int r; current_pgpath = READ_ONCE(m->current_pgpath); - if (!current_pgpath) + if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) current_pgpath = choose_pgpath(m, 0); if (current_pgpath) { From d3c7b35c20d60650bac8b55c17b194adda03a979 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Mon, 9 Mar 2020 23:26:38 +0100 Subject: [PATCH 101/427] dm: add emulated block size target This new target is similar to the linear target except that it emulates a smaller logical block size on a device with a larger logical block size. Its main purpose is to emulate 512 byte sectors on 4K native disks (i.e. 512e). See Documentation/admin-guide/device-mapper/dm-ebs.rst for details. Reviewed-by: Damien Le Moal Signed-off-by: Heinz Mauelshagen Signed-off-by: Randy Dunlap [Kconfig fixes] Signed-off-by: Zheng Bin [static fixes] Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-ebs.rst | 51 ++ drivers/md/Kconfig | 8 + drivers/md/Makefile | 2 + drivers/md/dm-ebs-target.c | 444 ++++++++++++++++++ 4 files changed, 505 insertions(+) create mode 100644 Documentation/admin-guide/device-mapper/dm-ebs.rst create mode 100644 drivers/md/dm-ebs-target.c diff --git a/Documentation/admin-guide/device-mapper/dm-ebs.rst b/Documentation/admin-guide/device-mapper/dm-ebs.rst new file mode 100644 index 000000000000..534fa38e8862 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-ebs.rst @@ -0,0 +1,51 @@ +====== +dm-ebs +====== + + +This target is similar to the linear target except that it emulates +a smaller logical block size on a device with a larger logical block +size. 
Its main purpose is to provide emulation of 512 byte sectors on +devices that do not provide this emulation (i.e. 4K native disks). + +Supported emulated logical block sizes 512, 1024, 2048 and 4096. + +Underlying block size can be set to > 4K to test buffering larger units. + + +Table parameters +---------------- + [] + +Mandatory parameters: + + : + Full pathname to the underlying block-device, + or a "major:minor" device-number. + : + Starting sector within the device; + has to be a multiple of . + : + Number of sectors defining the logical block size to be emulated; + 1, 2, 4, 8 sectors of 512 bytes supported. + +Optional parameter: + + : + Number of sectors defining the logical block size of . + 2^N supported, e.g. 8 = emulate 8 sectors of 512 bytes = 4KiB. + If not provided, the logical block size of will be used. + + +Examples: + +Emulate 1 sector = 512 bytes logical block size on /dev/sda starting at +offset 1024 sectors with underlying devices block size automatically set: + +ebs /dev/sda 1024 1 + +Emulate 2 sector = 1KiB logical block size on /dev/sda starting at +offset 128 sectors, enforce 2KiB underlying device block size. +This presumes 2KiB logical blocksize on /dev/sda or less to work: + +ebs /dev/sda 128 2 4 diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 8b0c646d2f59..6cb6188a61df 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -337,6 +337,14 @@ config DM_WRITECACHE The writecache target doesn't cache reads because reads are supposed to be cached in standard RAM. +config DM_EBS + tristate "Emulated block size target (EXPERIMENTAL)" + depends on BLK_DEV_DM + select DM_BUFIO + help + dm-ebs emulates smaller logical block size on backing devices + with larger ones (e.g. 512 byte sectors on 4K native disks). 
+ config DM_ERA tristate "Era target (EXPERIMENTAL)" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index d91a7edcd2ab..9a2d673f94bc 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -17,6 +17,7 @@ dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ dm-cache-background-tracker.o dm-cache-smq-y += dm-cache-policy-smq.o +dm-ebs-y += dm-ebs-target.o dm-era-y += dm-era-target.o dm-clone-y += dm-clone-target.o dm-clone-metadata.o dm-verity-y += dm-verity-target.o @@ -65,6 +66,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o +obj-$(CONFIG_DM_EBS) += dm-ebs.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_CLONE) += dm-clone.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c new file mode 100644 index 000000000000..c2bd21fa7002 --- /dev/null +++ b/drivers/md/dm-ebs-target.c @@ -0,0 +1,444 @@ +/* + * Copyright (C) 2020 Red Hat GmbH + * + * This file is released under the GPL. + * + * Device-mapper target to emulate smaller logical block + * size on backing devices exposing (natively) larger ones. + * + * E.g. 512 byte sector emulation on 4K native disks. + */ + +#include "dm.h" +#include +#include +#include + +#define DM_MSG_PREFIX "ebs" + +static void ebs_dtr(struct dm_target *ti); + +/* Emulated block size context. */ +struct ebs_c { + struct dm_dev *dev; /* Underlying device to emulate block size on. */ + struct dm_bufio_client *bufio; /* Use dm-bufio for read and read-modify-write processing. */ + struct workqueue_struct *wq; /* Workqueue for ^ processing of bios. */ + struct work_struct ws; /* Work item used for ^. */ + struct bio_list bios_in; /* Worker bios input list. */ + spinlock_t lock; /* Guard bios input list above. 
*/ + sector_t start; /* table line argument, see ebs_ctr below. */ + unsigned int e_bs; /* Emulated block size in sectors exposed to upper layer. */ + unsigned int u_bs; /* Underlying block size in sectors retrievd from/set on lower layer device. */ + unsigned char block_shift; /* bitshift sectors -> blocks used in dm-bufio API. */ + bool u_bs_set:1; /* Flag to indicate underlying block size is set on table line. */ +}; + +static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector) +{ + return sector >> ec->block_shift; +} + +static inline sector_t __block_mod(sector_t sector, unsigned int bs) +{ + return sector & (bs - 1); +} + +/* Return number of blocks for a bio, accounting for misalignement of start and end sectors. */ +static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio) +{ + sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio); + + return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0); +} + +static inline bool __ebs_check_bs(unsigned int bs) +{ + return bs && is_power_of_2(bs); +} + +/* + * READ/WRITE: + * + * copy blocks between bufio blocks and bio vector's (partial/overlapping) pages. + */ +static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bvec_iter *iter) +{ + int r = 0; + unsigned char *ba, *pa; + unsigned int cur_len; + unsigned int bv_len = bv->bv_len; + unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs)); + sector_t block = __sector_to_block(ec, iter->bi_sector); + struct dm_buffer *b; + + if (unlikely(!bv->bv_page || !bv_len)) + return -EIO; + + pa = page_address(bv->bv_page) + bv->bv_offset; + + /* Handle overlapping page <-> blocks */ + while (bv_len) { + cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len); + + /* Avoid reading for writes in case bio vector's page overwrites block completely. 
*/ + if (rw == READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio)) + ba = dm_bufio_read(ec->bufio, block, &b); + else + ba = dm_bufio_new(ec->bufio, block, &b); + + if (unlikely(IS_ERR(ba))) { + /* + * Carry on with next buffer, if any, to issue all possible + * data but return error. + */ + r = PTR_ERR(ba); + } else { + /* Copy data to/from bio to buffer if read/new was successful above. */ + ba += buf_off; + if (rw == READ) { + memcpy(pa, ba, cur_len); + flush_dcache_page(bv->bv_page); + } else { + flush_dcache_page(bv->bv_page); + memcpy(ba, pa, cur_len); + dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len); + } + + dm_bufio_release(b); + } + + pa += cur_len; + bv_len -= cur_len; + buf_off = 0; + block++; + } + + return r; +} + +/* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */ +static int __ebs_rw_bio(struct ebs_c *ec, int rw, struct bio *bio) +{ + int r = 0, rr; + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_bvec(bv, bio, iter) { + rr = __ebs_rw_bvec(ec, rw, &bv, &iter); + if (rr) + r = rr; + } + + return r; +} + +/* 'Discard' blocks, i.e. release them from the bufio cache. */ +static int __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) +{ + sector_t blocks, sector = bio->bi_iter.bi_sector; + + blocks = __nr_blocks(ec, bio); + for (; blocks--; sector += ec->u_bs) + dm_bufio_forget(ec->bufio, __sector_to_block(ec, sector)); + + return 0; +} + +/* Worker funtion to process incoming bios. 
*/ +static void __ebs_process_bios(struct work_struct *ws) +{ + int r; + bool write = false; + sector_t block1, block2; + struct ebs_c *ec = container_of(ws, struct ebs_c, ws); + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + + spin_lock_irq(&ec->lock); + bios = ec->bios_in; + bio_list_init(&ec->bios_in); + spin_unlock_irq(&ec->lock); + + /* Prefetch all read and any mis-aligned write buffers */ + bio_list_for_each(bio, &bios) { + block1 = __sector_to_block(ec, bio->bi_iter.bi_sector); + if (bio_op(bio) == REQ_OP_READ) + dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio)); + else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) { + block2 = __sector_to_block(ec, bio_end_sector(bio)); + if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs)) + dm_bufio_prefetch(ec->bufio, block1, 1); + if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1) + dm_bufio_prefetch(ec->bufio, block2, 1); + } + } + + bio_list_for_each(bio, &bios) { + r = -EIO; + if (bio_op(bio) == REQ_OP_READ) + r = __ebs_rw_bio(ec, READ, bio); + else if (bio_op(bio) == REQ_OP_WRITE) { + write = true; + r = __ebs_rw_bio(ec, WRITE, bio); + } else if (bio_op(bio) == REQ_OP_DISCARD) { + /* FIXME: (optionally) call dm_bufio_discard_buffers() once upstream. */ + r = __ebs_forget_bio(ec, bio); + } + + if (r < 0) + bio->bi_status = errno_to_blk_status(r); + } + + /* + * We write dirty buffers after processing I/O on them + * but before we endio thus addressing REQ_FUA/REQ_SYNC. + */ + r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0; + + while ((bio = bio_list_pop(&bios))) { + /* Any other request is endioed. 
*/ + if (unlikely(r && bio_op(bio) == REQ_OP_WRITE)) + bio_io_error(bio); + else + bio_endio(bio); + } +} + +/* + * Construct an emulated block size mapping: [] + * + * : path of the underlying device + * : offset in 512 bytes sectors into + * : emulated block size in units of 512 bytes exposed to the upper layer + * []: underlying block size in units of 512 bytes imposed on the lower layer; + * optional, if not supplied, retrieve logical block size from underlying device + */ +static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + unsigned short tmp1; + unsigned long long tmp; + char dummy; + struct ebs_c *ec; + + if (argc < 3 || argc > 4) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL); + if (!ec) { + ti->error = "Cannot allocate ebs context"; + return -ENOMEM; + } + + r = -EINVAL; + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || + tmp != (sector_t)tmp || + (sector_t)tmp >= ti->len) { + ti->error = "Invalid device offset sector"; + goto bad; + } + ec->start = tmp; + + if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 || + !__ebs_check_bs(tmp1) || + to_bytes(tmp1) > PAGE_SIZE) { + ti->error = "Invalid emulated block size"; + goto bad; + } + ec->e_bs = tmp1; + + if (argc > 3) { + if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) { + ti->error = "Invalid underlying block size"; + goto bad; + } + ec->u_bs = tmp1; + ec->u_bs_set = true; + } else + ec->u_bs_set = false; + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev); + if (r) { + ti->error = "Device lookup failed"; + ec->dev = NULL; + goto bad; + } + + r = -EINVAL; + if (!ec->u_bs_set) { + ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev)); + if (!__ebs_check_bs(ec->u_bs)) { + ti->error = "Invalid retrieved underlying block size"; + goto bad; + } + } + + if (!ec->u_bs_set && ec->e_bs == ec->u_bs) + DMINFO("Emulation superfluous: emulated equal to 
underlying block size"); + + if (__block_mod(ec->start, ec->u_bs)) { + ti->error = "Device offset must be multiple of underlying block size"; + goto bad; + } + + ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL); + if (IS_ERR(ec->bufio)) { + ti->error = "Cannot create dm bufio client"; + r = PTR_ERR(ec->bufio); + ec->bufio = NULL; + goto bad; + } + + ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + if (!ec->wq) { + ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue"; + r = -ENOMEM; + goto bad; + } + + ec->block_shift = __ffs(ec->u_bs); + INIT_WORK(&ec->ws, &__ebs_process_bios); + bio_list_init(&ec->bios_in); + spin_lock_init(&ec->lock); + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_secure_erase_bios = 0; + ti->num_write_same_bios = 0; + ti->num_write_zeroes_bios = 0; + return 0; +bad: + ebs_dtr(ti); + return r; +} + +static void ebs_dtr(struct dm_target *ti) +{ + struct ebs_c *ec = ti->private; + + if (ec->wq) + destroy_workqueue(ec->wq); + if (ec->bufio) + dm_bufio_client_destroy(ec->bufio); + if (ec->dev) + dm_put_device(ti, ec->dev); + kfree(ec); +} + +static int ebs_map(struct dm_target *ti, struct bio *bio) +{ + struct ebs_c *ec = ti->private; + + bio_set_dev(bio, ec->dev->bdev); + bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + + if (unlikely(bio->bi_opf & REQ_OP_FLUSH)) + return DM_MAPIO_REMAPPED; + /* + * Only queue for bufio processing in case of partial or overlapping buffers + * -or- + * emulation with ebs == ubs aiming for tests of dm-bufio overhead. + */ + if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) || + __block_mod(bio_end_sector(bio), ec->u_bs) || + ec->e_bs == ec->u_bs)) { + spin_lock_irq(&ec->lock); + bio_list_add(&ec->bios_in, bio); + spin_unlock_irq(&ec->lock); + + queue_work(ec->wq, &ec->ws); + + return DM_MAPIO_SUBMITTED; + } + + /* Forget any buffer content relative to this direct backing device I/O. 
*/ + __ebs_forget_bio(ec, bio); + + return DM_MAPIO_REMAPPED; +} + +static void ebs_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct ebs_c *ec = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + *result = '\0'; + break; + case STATUSTYPE_TABLE: + snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u", + ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs); + break; + } +} + +static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct ebs_c *ec = ti->private; + struct dm_dev *dev = ec->dev; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + *bdev = dev->bdev; + return !!(ec->start || ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT); +} + +static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct ebs_c *ec = ti->private; + + limits->logical_block_size = to_bytes(ec->e_bs); + limits->physical_block_size = to_bytes(ec->u_bs); + limits->alignment_offset = limits->physical_block_size; + blk_limits_io_min(limits, limits->logical_block_size); +} + +static int ebs_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct ebs_c *ec = ti->private; + + return fn(ti, ec->dev, ec->start, ti->len, data); +} + +static struct target_type ebs_target = { + .name = "ebs", + .version = {1, 0, 0}, + .features = DM_TARGET_PASSES_INTEGRITY, + .module = THIS_MODULE, + .ctr = ebs_ctr, + .dtr = ebs_dtr, + .map = ebs_map, + .status = ebs_status, + .io_hints = ebs_io_hints, + .prepare_ioctl = ebs_prepare_ioctl, + .iterate_devices = ebs_iterate_devices, +}; + +static int __init dm_ebs_init(void) +{ + int r = dm_register_target(&ebs_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void dm_ebs_exit(void) +{ + dm_unregister_target(&ebs_target); +} + +module_init(dm_ebs_init); +module_exit(dm_ebs_exit); + 
+MODULE_AUTHOR("Heinz Mauelshagen "); +MODULE_DESCRIPTION(DM_NAME " emulated block size target"); +MODULE_LICENSE("GPL"); From 6fbeb0048e6b93f7b7f195864f3ddc876ac4d42e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 7 Feb 2020 15:59:25 -0500 Subject: [PATCH 102/427] dm bufio: implement discard Add functions dm_bufio_issue_discard and dm_bufio_discard_buffers. dm_bufio_issue_discard sends discard request to the underlying device. dm_bufio_discard_buffers frees buffers in the range and then calls dm_bufio_issue_discard. Also, factor out block_to_sector for reuse in dm_bufio_issue_discard. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 69 +++++++++++++++++++++++++++++++++++++--- include/linux/dm-bufio.h | 12 +++++++ 2 files changed, 76 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 2d519c223562..bf289be1ee3a 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -631,6 +631,19 @@ dmio: submit_bio(bio); } +static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block) +{ + sector_t sector; + + if (likely(c->sectors_per_block_bits >= 0)) + sector = block << c->sectors_per_block_bits; + else + sector = block * (c->block_size >> SECTOR_SHIFT); + sector += c->start; + + return sector; +} + static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t)) { unsigned n_sectors; @@ -639,11 +652,7 @@ static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buff b->end_io = end_io; - if (likely(b->c->sectors_per_block_bits >= 0)) - sector = b->block << b->c->sectors_per_block_bits; - else - sector = b->block * (b->c->block_size >> SECTOR_SHIFT); - sector += b->c->start; + sector = block_to_sector(b->c, b->block); if (rw != REQ_OP_WRITE) { n_sectors = b->c->block_size >> SECTOR_SHIFT; @@ -1325,6 +1334,56 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c) } EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); 
+/* + * Use dm-io to send a discard request to flush the device. + */ +int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count) +{ + struct dm_io_request io_req = { + .bi_op = REQ_OP_DISCARD, + .bi_op_flags = REQ_SYNC, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = NULL, + .client = c->dm_io, + }; + struct dm_io_region io_reg = { + .bdev = c->bdev, + .sector = block_to_sector(c, block), + .count = block_to_sector(c, count), + }; + + BUG_ON(dm_bufio_in_request()); + + return dm_io(&io_req, 1, &io_reg, NULL); +} +EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); + +/* + * Free the specified range of buffers. If a buffer is held by other process, it + * is not freed. If a buffer is dirty, it is discarded without writeback. + * Finally, send the discard request to the device. + */ +int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count) +{ + sector_t i; + + for (i = block; i < block + count; i++) { + struct dm_buffer *b; + dm_bufio_lock(c); + b = __find(c, i); + if (b && likely(!b->hold_count)) { + wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); + __unlink_buffer(b); + __free_buffer_wake(b); + } + dm_bufio_unlock(c); + } + + return dm_bufio_issue_discard(c, block, count); +} +EXPORT_SYMBOL_GPL(dm_bufio_discard_buffers); + /* * We first delete any other buffer that may be at that new location. * diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 3c8b7d274bd9..07e1f163e299 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -118,6 +118,18 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c); */ int dm_bufio_issue_flush(struct dm_bufio_client *c); +/* + * Send a discard request to the underlying device. + */ +int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count); + +/* + * Free the specified range of buffers. If a buffer is held by other process, it + * is not freed. 
If a buffer is dirty, it is discarded without writeback. + * Finally, send the discard request to the device. + */ +int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count); + /* * Like dm_bufio_release but also move the buffer to the new * block. dm_bufio_write_dirty_buffers is needed to commit the new block. From a5089a95d84c1e861e2d1f549ae368c8e89e3674 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 29 Apr 2020 16:47:03 +0200 Subject: [PATCH 103/427] dm ebs: pass discards down to underlying device Make use of dm_bufio_issue_discard() to pass discards down to the underlying device. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-ebs-target.c | 41 +++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index c2bd21fa7002..ae3f5fad3b39 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -132,16 +132,43 @@ static int __ebs_rw_bio(struct ebs_c *ec, int rw, struct bio *bio) return r; } -/* 'Discard' blocks, i.e. release them from the bufio cache. */ -static int __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) +/* + * Discard bio's blocks, i.e. pass discards down. + * + * Avoid discarding partial blocks at beginning and end; + * return 0 in case no blocks can be discarded as a result. + */ +static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio) +{ + sector_t block, blocks, sector = bio->bi_iter.bi_sector; + + block = __sector_to_block(ec, sector); + blocks = __nr_blocks(ec, bio); + + /* + * Partial first underlying block (__nr_blocks() may have + * resulted in one block). + */ + if (__block_mod(sector, ec->u_bs)) { + block++; + blocks--; + } + + /* Partial last underlying block if any. */ + if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs)) + blocks--; + + return blocks ? 
dm_bufio_issue_discard(ec->bufio, block, blocks) : 0; +} + +/* Release blocks them from the bufio cache. */ +static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) { sector_t blocks, sector = bio->bi_iter.bi_sector; blocks = __nr_blocks(ec, bio); for (; blocks--; sector += ec->u_bs) dm_bufio_forget(ec->bufio, __sector_to_block(ec, sector)); - - return 0; } /* Worker funtion to process incoming bios. */ @@ -183,8 +210,8 @@ static void __ebs_process_bios(struct work_struct *ws) write = true; r = __ebs_rw_bio(ec, WRITE, bio); } else if (bio_op(bio) == REQ_OP_DISCARD) { - /* FIXME: (optionally) call dm_bufio_discard_buffers() once upstream. */ - r = __ebs_forget_bio(ec, bio); + __ebs_forget_bio(ec, bio); + r = __ebs_discard_bio(ec, bio); } if (r < 0) @@ -409,7 +436,7 @@ static int ebs_iterate_devices(struct dm_target *ti, static struct target_type ebs_target = { .name = "ebs", - .version = {1, 0, 0}, + .version = {1, 0, 1}, .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = ebs_ctr, From a86fe8be514534363c8fb12a3a38bdba6354316b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 8 Apr 2020 07:29:48 +0000 Subject: [PATCH 104/427] dm integrity: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: drivers/md/dm-integrity.c: In function 'integrity_metadata': drivers/md/dm-integrity.c:1557:12: warning: variable 'save_metadata_offset' set but not used [-Wunused-but-set-variable] drivers/md/dm-integrity.c:1556:12: warning: variable 'save_metadata_block' set but not used [-Wunused-but-set-variable] They are never used, so remove it. 
Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 4094c47eca7f..3726b987151e 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1553,8 +1553,6 @@ static void integrity_metadata(struct work_struct *w) char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; sector_t sector; unsigned sectors_to_process; - sector_t save_metadata_block; - unsigned save_metadata_offset; if (unlikely(ic->mode == 'R')) goto skip_io; @@ -1605,8 +1603,6 @@ static void integrity_metadata(struct work_struct *w) goto skip_io; } - save_metadata_block = dio->metadata_block; - save_metadata_offset = dio->metadata_offset; sector = dio->range.logical_sector; sectors_to_process = dio->range.n_sectors; From 9431cf6efc3659eaa1cdd591e02a09045bc9983f Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Wed, 15 Apr 2020 19:57:31 +0800 Subject: [PATCH 105/427] dm persistent data: switch exit_ro_spine to return void In commit 4c7da06f5a78 ("dm persistent data: eliminate unnecessary return values"), r value in exit_ro_spine will not change, so exit_ro_spine doesn't need a return value. 
Signed-off-by: Zhiqiang Liu Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-btree-internal.h | 2 +- drivers/md/persistent-data/dm-btree-spine.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index a240990a7f33..55a4096f1334 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -68,7 +68,7 @@ struct ro_spine { }; void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info); -int exit_ro_spine(struct ro_spine *s); +void exit_ro_spine(struct ro_spine *s); int ro_step(struct ro_spine *s, dm_block_t new_child); void ro_pop(struct ro_spine *s); struct btree_node *ro_node(struct ro_spine *s); diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index b27b8091a1ca..e03cb9e48773 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -132,15 +132,13 @@ void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info) s->nodes[1] = NULL; } -int exit_ro_spine(struct ro_spine *s) +void exit_ro_spine(struct ro_spine *s) { - int r = 0, i; + int i; for (i = 0; i < s->count; i++) { unlock_block(s->info, s->nodes[i]); } - - return r; } int ro_step(struct ro_spine *s, dm_block_t new_child) From 499c18045eab16656ef4159c35b05865038f9f25 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 19 Apr 2020 04:34:00 -0400 Subject: [PATCH 106/427] dm writecache: remove superfluous test in persistent_memory_claim Remove superfluous test if dax_dev is NULL - dax_direct_access already does this test. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 613c171b1b6d..d29d3e234e01 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -234,10 +234,6 @@ static int persistent_memory_claim(struct dm_writecache *wc) wc->memory_vmapped = false; - if (!wc->ssd_dev->dax_dev) { - r = -EOPNOTSUPP; - goto err1; - } s = wc->memory_map_size; p = s >> PAGE_SHIFT; if (!p) { From 48338daaa00e6137a43fa5d0e54b763aa34f450b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 29 Apr 2020 12:30:03 -0400 Subject: [PATCH 107/427] dm writecache: improve performance on DDR persistent memory (Optane) When testing the dm-writecache target on a real DDR persistent memory (Intel Optane), it turned out that explicit cache flushing using the clflushopt instruction performs better than non-temporal stores for block sizes 1k, 2k and 4k. The dm-writecache target is singlethreaded (all the copying is done while holding the writecache lock), so it benefits from clwb, see: http://lore.kernel.org/r/alpine.LRH.2.02.2004160411460.7833@file01.intranet.prod.int.rdu2.redhat.com Add a new function memcpy_flushcache_optimized() that tests if clflushopt is present - and if it is, we use it instead of memcpy_flushcache. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index d29d3e234e01..74f3c506f084 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1139,6 +1139,42 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, return r; } +static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) +{ + /* + * clflushopt performs better with block size 1024, 2048, 4096 + * non-temporal stores perform better with block size 512 + * + * block size 512 1024 2048 4096 + * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s + * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s + * + * We see that movnti performs better for 512-byte blocks, and + * clflushopt performs better for 1024-byte and larger blocks. So, we + * prefer clflushopt for sizes >= 768. + * + * NOTE: this happens to be the case now (with dm-writecache's single + * threaded model) but re-evaluate this once memcpy_flushcache() is + * enabled to use movdir64b which might invalidate this performance + * advantage seen with cache-allocating-writes plus flushing. 
+ */ +#ifdef CONFIG_X86 + if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) && + likely(boot_cpu_data.x86_clflush_size == 64) && + likely(size >= 768)) { + do { + memcpy((void *)dest, (void *)source, 64); + clflushopt((void *)dest); + dest += 64; + source += 64; + size -= 64; + } while (size >= 64); + return; + } +#endif + memcpy_flushcache(dest, source, size); +} + static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) { void *buf; @@ -1164,7 +1200,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data } } else { flush_dcache_page(bio_page(bio)); - memcpy_flushcache(data, buf, size); + memcpy_flushcache_optimized(data, buf, size); } bvec_kunmap_irq(buf, &flags); From 087615bf3acdafd0ba7c7c9ed5286e7b7c80fe1b Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 30 Apr 2020 16:48:29 -0400 Subject: [PATCH 108/427] dm mpath: pass IO start time to path selector The HST path selector needs this information to perform path prediction. For request-based mpath, struct request's io_start_time_ns is used, while for bio-based, use the start_time stored in dm_io. 
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 9 ++++++--- drivers/md/dm-path-selector.h | 2 +- drivers/md/dm-queue-length.c | 2 +- drivers/md/dm-service-time.c | 2 +- drivers/md/dm.c | 9 +++++++++ include/linux/device-mapper.h | 2 ++ 6 files changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index e0c800cf87a9..74246d7c7d68 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -567,7 +567,8 @@ static void multipath_release_clone(struct request *clone, if (pgpath && pgpath->pg->ps.type->end_io) pgpath->pg->ps.type->end_io(&pgpath->pg->ps, &pgpath->path, - mpio->nr_bytes); + mpio->nr_bytes, + clone->io_start_time_ns); } blk_put_request(clone); @@ -1617,7 +1618,8 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, struct path_selector *ps = &pgpath->pg->ps; if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes, + clone->io_start_time_ns); } return r; @@ -1661,7 +1663,8 @@ done: struct path_selector *ps = &pgpath->pg->ps; if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes, + dm_start_time_ns_from_clone(clone)); } return r; diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index b6eb5365b1a4..c47bc0e20275 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -74,7 +74,7 @@ struct path_selector_type { int (*start_io) (struct path_selector *ps, struct dm_path *path, size_t nr_bytes); int (*end_io) (struct path_selector *ps, struct dm_path *path, - size_t nr_bytes); + size_t nr_bytes, u64 start_time); }; /* Register a path selector */ diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 969c4f1a3633..5fd018d18418 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -227,7 +227,7 @@ 
static int ql_start_io(struct path_selector *ps, struct dm_path *path, } static int ql_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) + size_t nr_bytes, u64 start_time) { struct path_info *pi = path->pscontext; diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index f006a9005593..9cfda665e9eb 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -309,7 +309,7 @@ static int st_start_io(struct path_selector *ps, struct dm_path *path, } static int st_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) + size_t nr_bytes, u64 start_time) { struct path_info *pi = path->pscontext; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index db9e46114653..2fcb932eb4bd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -675,6 +675,15 @@ static bool md_in_flight(struct mapped_device *md) return md_in_flight_bios(md); } +u64 dm_start_time_ns_from_clone(struct bio *bio) +{ + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + struct dm_io *io = tio->io; + + return jiffies_to_nsecs(io->start_time); +} +EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); + static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index af48d9da3916..934037d938b9 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -332,6 +332,8 @@ void *dm_per_bio_data(struct bio *bio, size_t data_size); struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size); unsigned dm_bio_get_target_bio_nr(const struct bio *bio); +u64 dm_start_time_ns_from_clone(struct bio *bio); + int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); From 2613eab11996c8d1439c2a44fbca52807be7faa6 Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Thu, 30 Apr 2020 16:48:30 -0400 Subject: [PATCH 109/427] dm mpath: add Historical Service Time Path 
Selector This new selector keeps an exponential moving average of the service time for each path (loosely defined as delta between start_io and end_io), and uses this along with the number of inflight requests to estimate future service time for a path. Since we don't have a prober to account for temporarily slow paths, re-try "slow" paths every once in a while (num_paths * historical_service_time). To account for fast paths transitioning to slow, if a path has not completed any request within (num_paths * historical_service_time), limit the number of outstanding requests. To account for low volume situations where number of inflight IOs would be zero, the last finish time of each path is factored in. Signed-off-by: Khazhismel Kumykov Co-developed-by: Gabriel Krisman Bertazi Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 11 + drivers/md/Makefile | 1 + drivers/md/dm-historical-service-time.c | 561 ++++++++++++++++++++++++ 3 files changed, 573 insertions(+) create mode 100644 drivers/md/dm-historical-service-time.c diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 6cb6188a61df..6665b56865b7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -452,6 +452,17 @@ config DM_MULTIPATH_ST If unsure, say N. +config DM_MULTIPATH_HST + tristate "I/O Path Selector based on historical service time" + depends on DM_MULTIPATH + help + This path selector is a dynamic load balancer which selects + the path expected to complete the incoming I/O in the shortest + time by comparing estimated service time (based on historical + service time). + + If unsure, say N.
+ config DM_DELAY tristate "I/O delaying target" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 9a2d673f94bc..31840f95cd40 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o +obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o obj-$(CONFIG_DM_SWITCH) += dm-switch.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ diff --git a/drivers/md/dm-historical-service-time.c b/drivers/md/dm-historical-service-time.c new file mode 100644 index 000000000000..186f91e2752c --- /dev/null +++ b/drivers/md/dm-historical-service-time.c @@ -0,0 +1,561 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Historical Service Time + * + * Keeps a time-weighted exponential moving average of the historical + * service time. Estimates future service time based on the historical + * service time and the number of outstanding requests. + * + * Marks paths stale if they have not finished within hst * + * num_paths. If a path is stale and unused, we will send a single + * request to probe in case the path has improved. This situation + * generally arises if the path is so much worse than others that it + * will never have the best estimated service time, or if the entire + * multipath device is unused. If a path is stale and in use, limit the + * number of requests it can receive with the assumption that the path + * has become degraded. + * + * To avoid repeatedly calculating exponents for time weighting, times + * are split into HST_WEIGHT_COUNT buckets each (1 << HST_BUCKET_SHIFT) + * ns, and the weighting is pre-calculated.
+ * + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/module.h> + + +#define DM_MSG_PREFIX "multipath historical-service-time" +#define HST_MIN_IO 1 +#define HST_VERSION "0.1.1" + +#define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ +#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) +#define HST_FIXED_1 (1 << HST_FIXED_SHIFT) +#define HST_FIXED_95 972 +#define HST_MAX_INFLIGHT HST_FIXED_1 +#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ +#define HST_WEIGHT_COUNT 64ULL + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; + int valid_count; + spinlock_t lock; + + unsigned int weights[HST_WEIGHT_COUNT]; + unsigned int threshold_multiplier; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned int repeat_count; + + spinlock_t lock; + + u64 historical_service_time; /* Fixed point */ + + u64 stale_after; + u64 last_finish; + + u64 outstanding; +}; + +/** + * fixed_power - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector.
+ * + * (see: kernel/sched/loadavg.c) + */ +static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } + + return result; +} + +/* + * Calculate the next value of an exponential moving average + * a_1 = a_0 * e + a * (1 - e) + * + * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] + * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] + * @weight: [0, HST_FIXED_1] + * + * Note: + * To account for multiple periods in the same calculation, + * a_n = a_0 * e^n + a * (1 - e^n), + * so call fixed_ema(last, next, pow(weight, N)) + */ +static u64 fixed_ema(u64 last, u64 next, u64 weight) +{ + last *= weight; + last += next * (HST_FIXED_1 - weight); + last += 1ULL << (HST_FIXED_SHIFT - 1); + return last >> HST_FIXED_SHIFT; +} + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); + s->valid_count = 0; + } + + return s; +} + +/* + * Get the weight for a given time span. + */ +static u64 hst_weight(struct path_selector *ps, u64 delta) +{ + struct selector *s = ps->context; + int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, + HST_WEIGHT_COUNT - 1); + + return s->weights[bucket]; +} + +/* + * Set up the weights array. 
+ * + * weights[len-1] = 0 + * weights[n] = base ^ (n + 1) + */ +static void hst_set_weights(struct path_selector *ps, unsigned int base) +{ + struct selector *s = ps->context; + int i; + + if (base >= HST_FIXED_1) + return; + + for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) + s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); + s->weights[HST_WEIGHT_COUNT - 1] = 0; +} + +static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) +{ + struct selector *s; + unsigned int base_weight = HST_FIXED_95; + unsigned int threshold_multiplier = 0; + char dummy; + + /* + * Arguments: [<base_weight> [<threshold_multiplier>]] + * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A + * value of 0 will completely ignore any history. + * If not given, default (HST_FIXED_95) is used. + * <threshold_multiplier>: Minimum threshold multiplier for paths to + * be considered different. That is, a path is + * considered different iff (p1 > N * p2) where p1 + * is the path with higher service time. A threshold + * of 1 or 0 has no effect. Defaults to 0.
+ */ + if (argc > 2) + return -EINVAL; + + if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || + base_weight >= HST_FIXED_1)) { + return -EINVAL; + } + + if (argc > 1 && (sscanf(argv[1], "%u%c", + &threshold_multiplier, &dummy) != 1)) { + return -EINVAL; + } + + s = alloc_selector(); + if (!s) + return -ENOMEM; + + ps->context = s; + + hst_set_weights(ps, base_weight); + s->threshold_multiplier = threshold_multiplier; + return 0; +} + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void hst_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int hst_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + unsigned int sz = 0; + struct path_info *pi; + + if (!path) { + struct selector *s = ps->context; + + DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); + } else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%llu %llu %llu ", pi->historical_service_time, + pi->outstanding, pi->stale_after); + break; + case STATUSTYPE_TABLE: + DMEMIT("0 "); + break; + } + } + + return sz; +} + +static int hst_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned int repeat_count = HST_MIN_IO; + char dummy; + unsigned long flags; + + /* + * Arguments: [<repeat_count>] + * <repeat_count>: The number of I/Os before switching path. + * If not given, default (HST_MIN_IO) is used.
+ */ + if (argc > 1) { + *error = "historical-service-time ps: incorrect number of arguments"; + return -EINVAL; + } + + if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "historical-service-time ps: invalid repeat count"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "historical-service-time ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + + pi->historical_service_time = HST_FIXED_1; + + spin_lock_init(&pi->lock); + pi->outstanding = 0; + + pi->stale_after = 0; + pi->last_finish = 0; + + path->pscontext = pi; + + spin_lock_irqsave(&s->lock, flags); + list_add_tail(&pi->list, &s->valid_paths); + s->valid_count++; + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void hst_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->failed_paths); + s->valid_count--; + spin_unlock_irqrestore(&s->lock, flags); +} + +static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move_tail(&pi->list, &s->valid_paths); + s->valid_count++; + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void hst_fill_compare(struct path_info *pi, u64 *hst, + u64 *out, u64 *stale) +{ + unsigned long flags; + + spin_lock_irqsave(&pi->lock, flags); + *hst = pi->historical_service_time; + *out = pi->outstanding; + *stale = pi->stale_after; + spin_unlock_irqrestore(&pi->lock, flags); +} + +/* + * Compare the estimated service time of 2 paths, pi1 and pi2, + * for the incoming I/O. 
+ * + * Returns: + * < 0 : pi1 is better + * 0 : no difference between pi1 and pi2 + * > 0 : pi2 is better + * + */ +static long long hst_compare(struct path_info *pi1, struct path_info *pi2, + u64 time_now, struct path_selector *ps) +{ + struct selector *s = ps->context; + u64 hst1, hst2; + long long out1, out2, stale1, stale2; + int pi2_better, over_threshold; + + hst_fill_compare(pi1, &hst1, &out1, &stale1); + hst_fill_compare(pi2, &hst2, &out2, &stale2); + + /* Check here if estimated latency for two paths are too similar. + * If this is the case, we skip extra calculation and just compare + * outstanding requests. In this case, any unloaded paths will + * be preferred. + */ + if (hst1 > hst2) + over_threshold = hst1 > (s->threshold_multiplier * hst2); + else + over_threshold = hst2 > (s->threshold_multiplier * hst1); + + if (!over_threshold) + return out1 - out2; + + /* + * If an unloaded path is stale, choose it. If both paths are unloaded, + * choose path that is the most stale. + * (If one path is loaded, choose the other) + */ + if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || + (!out1 && !out2)) + return (!out2 * stale1) - (!out1 * stale2); + + /* Compare estimated service time. If outstanding is the same, we + * don't need to multiply + */ + if (out1 == out2) { + pi2_better = hst1 > hst2; + } else { + /* Potential overflow with out >= 1024 */ + if (unlikely(out1 >= HST_MAX_INFLIGHT || + out2 >= HST_MAX_INFLIGHT)) { + /* If over 1023 in-flights, we may overflow if hst + * is at max. (With this shift we still overflow at + * 1048576 in-flights, which is high enough). + */ + hst1 >>= HST_FIXED_SHIFT; + hst2 >>= HST_FIXED_SHIFT; + } + pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; + } + + /* In the case that the 'winner' is stale, limit to equal usage. 
*/ + if (pi2_better) { + if (stale2 < time_now) + return out1 - out2; + return 1; + } + if (stale1 < time_now) + return out1 - out2; + return -1; +} + +static struct dm_path *hst_select_path(struct path_selector *ps, + size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + u64 time_now = sched_clock(); + struct dm_path *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + if (list_empty(&s->valid_paths)) + goto out; + + list_for_each_entry(pi, &s->valid_paths, list) { + if (!best || (hst_compare(pi, best, time_now, ps) < 0)) + best = pi; + } + + if (!best) + goto out; + + /* Move last used path to end (least preferred in case of ties) */ + list_move_tail(&best->list, &s->valid_paths); + + ret = best->path; + +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; +} + +static int hst_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&pi->lock, flags); + pi->outstanding++; + spin_unlock_irqrestore(&pi->lock, flags); + + return 0; +} + +static u64 path_service_time(struct path_info *pi, u64 start_time) +{ + u64 sched_now = ktime_get_ns(); + + /* if a previous disk request has finished after this IO was + * sent to the hardware, pretend the submission happened + * serially. 
+ */ + if (time_after64(pi->last_finish, start_time)) + start_time = pi->last_finish; + + pi->last_finish = sched_now; + if (time_before64(sched_now, start_time)) + return 0; + + return sched_now - start_time; +} + +static int hst_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes, u64 start_time) +{ + struct path_info *pi = path->pscontext; + struct selector *s = ps->context; + unsigned long flags; + u64 st; + + spin_lock_irqsave(&pi->lock, flags); + + st = path_service_time(pi, start_time); + pi->outstanding--; + pi->historical_service_time = + fixed_ema(pi->historical_service_time, + min(st * HST_FIXED_1, HST_FIXED_MAX), + hst_weight(ps, st)); + + /* + * On request end, mark path as fresh. If a path hasn't + * finished any requests within the fresh period, the estimated + * service time is considered too optimistic and we limit the + * maximum requests on that path. + */ + pi->stale_after = pi->last_finish + + (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); + + spin_unlock_irqrestore(&pi->lock, flags); + + return 0; +} + +static struct path_selector_type hst_ps = { + .name = "historical-service-time", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 3, + .create = hst_create, + .destroy = hst_destroy, + .status = hst_status, + .add_path = hst_add_path, + .fail_path = hst_fail_path, + .reinstate_path = hst_reinstate_path, + .select_path = hst_select_path, + .start_io = hst_start_io, + .end_io = hst_end_io, +}; + +static int __init dm_hst_init(void) +{ + int r = dm_register_path_selector(&hst_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " HST_VERSION " loaded"); + + return r; +} + +static void __exit dm_hst_exit(void) +{ + int r = dm_unregister_path_selector(&hst_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_hst_init); +module_exit(dm_hst_exit); + +MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); +MODULE_AUTHOR("Khazhismel 
Kumykov "); +MODULE_LICENSE("GPL"); From bc3d5717d242a37d2e9ea85d7e7b2e3569324d24 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:16 +0200 Subject: [PATCH 110/427] dm zoned: add 'status' callback Add callback to supply information for 'dmsetup status' and 'dmsetup table'. The output for 'dmsetup status' is 0 <size> zoned <nr_zones> zones <nr_unmap_rnd>/<nr_rnd> random <nr_unmap_seq>/<nr_seq> sequential where <nr_unmap_rnd> is the number of unmapped (ie free) random zones, <nr_rnd> the total number of random zones, <nr_unmap_seq> the number of unmapped sequential zones, and <nr_seq> the total number of sequential zones. Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-zoned.rst | 16 ++++++++++++ drivers/md/dm-zoned-metadata.c | 15 +++++++++++ drivers/md/dm-zoned-target.c | 26 +++++++++++++++++++ drivers/md/dm-zoned.h | 3 +++ 4 files changed, 60 insertions(+) diff --git a/Documentation/admin-guide/device-mapper/dm-zoned.rst b/Documentation/admin-guide/device-mapper/dm-zoned.rst index 07f56ebc1730..4165fbf1aeb6 100644 --- a/Documentation/admin-guide/device-mapper/dm-zoned.rst +++ b/Documentation/admin-guide/device-mapper/dm-zoned.rst @@ -144,3 +144,19 @@ underlying zoned block device name. Ex:: echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \ dmsetup create dmz-`basename ${dev}` + +Information about the internal layout and current usage of the zones can +be obtained with the 'status' callback from dmsetup: + +Ex:: + + dmsetup status /dev/dm-X + +will return a line + + 0 <size> zoned <nr_zones> zones <nr_unmap_rnd>/<nr_rnd> random <nr_unmap_seq>/<nr_seq> sequential + +where <nr_zones> is the total number of zones, <nr_unmap_rnd> is the number +of unmapped (ie free) random zones, <nr_rnd> the total number of zones, +<nr_unmap_seq> the number of unmapped sequential zones, and <nr_seq> the +total number of sequential zones.
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 369de15c4e80..c8787560fa9f 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -202,6 +202,11 @@ sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift; } +unsigned int dmz_nr_zones(struct dmz_metadata *zmd) +{ + return zmd->dev->nr_zones; +} + unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) { return zmd->nr_chunks; @@ -217,6 +222,16 @@ unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd) return atomic_read(&zmd->unmap_nr_rnd); } +unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd) +{ + return zmd->nr_seq; +} + +unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd) +{ + return atomic_read(&zmd->unmap_nr_seq); +} + /* * Lock/unlock mapping table. * The map lock also protects all the zone lists. diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index f4f83d39b3dc..0b4b27d280fb 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -965,6 +965,31 @@ static int dmz_iterate_devices(struct dm_target *ti, return fn(ti, dmz->ddev, 0, capacity, data); } +static void dmz_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + struct dmz_target *dmz = ti->private; + ssize_t sz = 0; + char buf[BDEVNAME_SIZE]; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%u zones %u/%u random %u/%u sequential", + dmz_nr_zones(dmz->metadata), + dmz_nr_unmap_rnd_zones(dmz->metadata), + dmz_nr_rnd_zones(dmz->metadata), + dmz_nr_unmap_seq_zones(dmz->metadata), + dmz_nr_seq_zones(dmz->metadata)); + break; + case STATUSTYPE_TABLE: + format_dev_t(buf, dmz->dev->bdev->bd_dev); + DMEMIT("%s", buf); + break; + } + return; +} + static struct target_type dmz_type = { .name = "zoned", .version = {1, 1, 0}, @@ -978,6 +1003,7 @@ static struct target_type dmz_type = { 
.postsuspend = dmz_suspend, .resume = dmz_resume, .iterate_devices = dmz_iterate_devices, + .status = dmz_status, }; static int __init dmz_init(void) diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 5b5e493d479c..884c0e586082 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -190,8 +190,11 @@ void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone); void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone, unsigned int chunk); void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone); +unsigned int dmz_nr_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd); /* * Activate a zone (increment its reference count). From 90b39d58f39e1f3f3147caee6fb2a71528db74a2 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:17 +0200 Subject: [PATCH 111/427] dm zoned: add 'message' callback Add callback for 'dmsetup message' to allow the reclaim process to be triggered manually. Eg. dmsetup message /dev/dm-X 0 reclaim will start the reclaim process even if the default threshold of 50 percent of free random zones is not reached.
Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-zoned.rst | 12 ++++++++++++ drivers/md/dm-zoned-target.c | 15 +++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/Documentation/admin-guide/device-mapper/dm-zoned.rst b/Documentation/admin-guide/device-mapper/dm-zoned.rst index 4165fbf1aeb6..7547ce635161 100644 --- a/Documentation/admin-guide/device-mapper/dm-zoned.rst +++ b/Documentation/admin-guide/device-mapper/dm-zoned.rst @@ -160,3 +160,15 @@ where <nr_zones> is the total number of zones, <nr_unmap_rnd> is the number of unmapped (ie free) random zones, <nr_rnd> the total number of zones, <nr_unmap_seq> the number of unmapped sequential zones, and <nr_seq> the total number of sequential zones. + +Normally the reclaim process is started once fewer than 50 percent of the +random zones are free. To start the reclaim process manually before this +threshold is reached, the 'dmsetup message' function can be +used: + +Ex:: + + dmsetup message /dev/dm-X 0 reclaim + +will start the reclaim process and random zones will be moved to sequential +zones. 
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 0b4b27d280fb..0bfe34162dbb 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -990,6 +990,20 @@ static void dmz_status(struct dm_target *ti, status_type_t type, return; } +static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + struct dmz_target *dmz = ti->private; + int r = -EINVAL; + + if (!strcasecmp(argv[0], "reclaim")) { + dmz_schedule_reclaim(dmz->reclaim); + r = 0; + } else + DMERR("unrecognized message %s", argv[0]); + return r; +} + static struct target_type dmz_type = { .name = "zoned", .version = {1, 1, 0}, @@ -1004,6 +1018,7 @@ static struct target_type dmz_type = { .resume = dmz_resume, .iterate_devices = dmz_iterate_devices, .status = dmz_status, + .message = dmz_message, }; static int __init dmz_init(void) From b71228739851a9b384a59ba0467259eba508b408 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:18 +0200 Subject: [PATCH 112/427] dm zoned: store zone id within the zone structure and kill dmz_id() Instead of calculating the zone index by the offset within the zone array store the index within the structure itself. With that the helper dmz_id() is pointless and can be replaced with accessing the ->id value directly. 
Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 40 +++++++++++++++------------------- drivers/md/dm-zoned-reclaim.c | 17 +++++++-------- drivers/md/dm-zoned-target.c | 6 ++--- drivers/md/dm-zoned.h | 4 +++- 4 files changed, 31 insertions(+), 36 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index c8787560fa9f..1993eeb26bc1 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -187,19 +187,14 @@ struct dmz_metadata { /* * Various accessors */ -unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone) -{ - return ((unsigned int)(zone - zmd->zones)); -} - sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_sectors_shift; + return (sector_t)zone->id << zmd->dev->zone_nr_sectors_shift; } sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift; + return (sector_t)zone->id << zmd->dev->zone_nr_blocks_shift; } unsigned int dmz_nr_zones(struct dmz_metadata *zmd) @@ -1119,6 +1114,7 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) INIT_LIST_HEAD(&zone->link); atomic_set(&zone->refcount, 0); + zone->id = idx; zone->chunk = DMZ_MAP_UNMAPPED; switch (blkz->type) { @@ -1246,7 +1242,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) ret = -EIO; if (ret < 0) { dmz_dev_err(zmd->dev, "Get zone %u report failed", - dmz_id(zmd, zone)); + zone->id); dmz_check_bdev(zmd->dev); return ret; } @@ -1270,7 +1266,7 @@ static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, return ret; dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)", - dmz_id(zmd, zone), zone->wp_block, wp); + zone->id, zone->wp_block, wp); if (zone->wp_block < wp) { dmz_invalidate_blocks(zmd, 
zone, zone->wp_block, @@ -1309,7 +1305,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) dev->zone_nr_sectors, GFP_NOIO); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", - dmz_id(zmd, zone), ret); + zone->id, ret); return ret; } } @@ -1757,8 +1753,7 @@ again: } /* Update the chunk mapping */ - dmz_set_chunk_mapping(zmd, dzone->chunk, dmz_id(zmd, dzone), - dmz_id(zmd, bzone)); + dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id); set_bit(DMZ_BUF, &bzone->flags); bzone->chunk = dzone->chunk; @@ -1810,7 +1805,7 @@ again: atomic_dec(&zmd->unmap_nr_seq); if (dmz_is_offline(zone)) { - dmz_dev_warn(zmd->dev, "Zone %u is offline", dmz_id(zmd, zone)); + dmz_dev_warn(zmd->dev, "Zone %u is offline", zone->id); zone = NULL; goto again; } @@ -1852,7 +1847,7 @@ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone, unsigned int chunk) { /* Set the chunk mapping */ - dmz_set_chunk_mapping(zmd, chunk, dmz_id(zmd, dzone), + dmz_set_chunk_mapping(zmd, chunk, dzone->id, DMZ_MAP_UNMAPPED); dzone->chunk = chunk; if (dmz_is_rnd(dzone)) @@ -1880,7 +1875,7 @@ void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone) * Unmapping the chunk buffer zone: clear only * the chunk buffer mapping */ - dzone_id = dmz_id(zmd, zone->bzone); + dzone_id = zone->bzone->id; zone->bzone->bzone = NULL; zone->bzone = NULL; @@ -1942,7 +1937,7 @@ static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd, sector_t chunk_block) { sector_t bitmap_block = 1 + zmd->nr_map_blocks + - (sector_t)(dmz_id(zmd, zone) * zmd->zone_nr_bitmap_blocks) + + (sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) + (chunk_block >> DMZ_BLOCK_SHIFT_BITS); return dmz_get_mblock(zmd, bitmap_block); @@ -2022,7 +2017,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, unsigned int n = 0; dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks", - dmz_id(zmd, zone), (unsigned long long)chunk_block, + zone->id, (unsigned long 
long)chunk_block, nr_blocks); WARN_ON(chunk_block + nr_blocks > zone_nr_blocks); @@ -2052,7 +2047,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, zone->weight += n; else { dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u", - dmz_id(zmd, zone), zone->weight, + zone->id, zone->weight, zone_nr_blocks - n); zone->weight = zone_nr_blocks; } @@ -2102,7 +2097,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, unsigned int n = 0; dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks", - dmz_id(zmd, zone), (u64)chunk_block, nr_blocks); + zone->id, (u64)chunk_block, nr_blocks); WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks); @@ -2132,7 +2127,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, zone->weight -= n; else { dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u", - dmz_id(zmd, zone), zone->weight, n); + zone->id, zone->weight, n); zone->weight = 0; } @@ -2378,7 +2373,7 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) { struct dmz_metadata *zmd; - unsigned int i, zid; + unsigned int i; struct dm_zone *zone; int ret; @@ -2419,9 +2414,8 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) goto err; /* Set metadata zones starting from sb_zone */ - zid = dmz_id(zmd, zmd->sb_zone); for (i = 0; i < zmd->nr_meta_zones << 1; i++) { - zone = dmz_get(zmd, zid + i); + zone = dmz_get(zmd, zmd->sb_zone->id + i); if (!dmz_is_rnd(zone)) goto err; set_bit(DMZ_META, &zone->flags); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index e7ace908a9b7..7f57c4299a2f 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -80,7 +80,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, if (ret) { dmz_dev_err(zrc->dev, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", 
- dmz_id(zmd, zone), (unsigned long long)wp_block, + zone->id, (unsigned long long)wp_block, (unsigned long long)block, nr_blocks, ret); dmz_check_bdev(zrc->dev); return ret; @@ -196,8 +196,8 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_dev_debug(zrc->dev, "Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", - dzone->chunk, dmz_id(zmd, bzone), dmz_weight(bzone), - dmz_id(zmd, dzone), dmz_weight(dzone)); + dzone->chunk, bzone->id, dmz_weight(bzone), + dzone->id, dmz_weight(dzone)); /* Flush data zone into the buffer zone */ ret = dmz_reclaim_copy(zrc, bzone, dzone); @@ -235,8 +235,8 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_dev_debug(zrc->dev, "Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", - chunk, dmz_id(zmd, dzone), dmz_weight(dzone), - dmz_id(zmd, bzone), dmz_weight(bzone)); + chunk, dzone->id, dmz_weight(dzone), + bzone->id, dmz_weight(bzone)); /* Flush data zone into the buffer zone */ ret = dmz_reclaim_copy(zrc, dzone, bzone); @@ -287,8 +287,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_dev_debug(zrc->dev, "Chunk %u, move rnd zone %u (weight %u) to seq zone %u", - chunk, dmz_id(zmd, dzone), dmz_weight(dzone), - dmz_id(zmd, szone)); + chunk, dzone->id, dmz_weight(dzone), szone->id); /* Flush the random data zone into the sequential zone */ ret = dmz_reclaim_copy(zrc, dzone, szone); @@ -403,12 +402,12 @@ out: if (ret) { dmz_dev_debug(zrc->dev, "Metadata flush for zone %u failed, err %d\n", - dmz_id(zmd, rzone), ret); + rzone->id, ret); return ret; } dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms", - dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start)); + rzone->id, jiffies_to_msecs(jiffies - start)); return 0; } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 0bfe34162dbb..859ccc30ba7f 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c 
@@ -180,7 +180,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks", (unsigned long long)dmz_bio_chunk(dmz->dev, bio), (dmz_is_rnd(zone) ? "RND" : "SEQ"), - dmz_id(dmz->metadata, zone), + zone->id, (unsigned long long)chunk_block, nr_blocks); /* Check block validity to determine the read location */ @@ -317,7 +317,7 @@ static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone, dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks", (unsigned long long)dmz_bio_chunk(dmz->dev, bio), (dmz_is_rnd(zone) ? "RND" : "SEQ"), - dmz_id(dmz->metadata, zone), + zone->id, (unsigned long long)chunk_block, nr_blocks); if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) { @@ -357,7 +357,7 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks", (unsigned long long)dmz_bio_chunk(dmz->dev, bio), - dmz_id(zmd, zone), + zone->id, (unsigned long long)chunk_block, nr_blocks); /* diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 884c0e586082..30781646741a 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -87,6 +87,9 @@ struct dm_zone { /* Zone activation reference count */ atomic_t refcount; + /* Zone id */ + unsigned int id; + /* Zone write pointer block (relative to the zone start block) */ unsigned int wp_block; @@ -176,7 +179,6 @@ void dmz_lock_flush(struct dmz_metadata *zmd); void dmz_unlock_flush(struct dmz_metadata *zmd); int dmz_flush_metadata(struct dmz_metadata *zmd); -unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_chunks(struct dmz_metadata *zmd); From 735bd7e4cd16270b7b67cb82ff4ba2811bfd8d7b Mon Sep 17 00:00:00 2001 
From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:19 +0200 Subject: [PATCH 113/427] dm zoned: use array for superblock zones Instead of storing just the first superblock zone and calculating the secondary one relative to it, use an array to hold the superblock zones. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 41 +++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 1993eeb26bc1..900b1c1224f5 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -124,6 +124,7 @@ struct dmz_sb { sector_t block; struct dmz_mblock *mblk; struct dmz_super *sb; + struct dm_zone *zone; }; /* @@ -150,7 +151,6 @@ struct dmz_metadata { /* Zone information array */ struct dm_zone *zones; - struct dm_zone *sb_zone; struct dmz_sb sb[2]; unsigned int mblk_primary; u64 sb_gen; @@ -839,8 +839,9 @@ err: /* * Check super block. 
*/ -static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb) +static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) { + struct dmz_super *sb = zmd->sb[set].sb; unsigned int nr_meta_zones, nr_data_zones; struct dmz_dev *dev = zmd->dev; u32 crc, stored_crc; @@ -932,16 +933,20 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) /* Bad first super block: search for the second one */ zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; + zmd->sb[1].zone = zmd->sb[0].zone + 1; for (i = 0; i < zmd->nr_rnd_zones - 1; i++) { if (dmz_read_sb(zmd, 1) != 0) break; - if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) + if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) { + zmd->sb[1].zone += i; return 0; + } zmd->sb[1].block += zone_nr_blocks; } dmz_free_mblock(zmd, mblk); zmd->sb[1].mblk = NULL; + zmd->sb[1].zone = NULL; return -EIO; } @@ -985,11 +990,9 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set); if (dst_set == 0) - zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone); - else { - zmd->sb[1].block = zmd->sb[0].block + - (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); - } + zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); + else + zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); page = alloc_page(GFP_NOIO); if (!page) @@ -1033,21 +1036,27 @@ static int dmz_load_sb(struct dmz_metadata *zmd) u64 sb_gen[2] = {0, 0}; int ret; + if (!zmd->sb[0].zone) { + dmz_dev_err(zmd->dev, "Primary super block zone not set"); + return -ENXIO; + } + /* Read and check the primary super block */ - zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone); + zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); ret = dmz_get_sb(zmd, 0); if (ret) { dmz_dev_err(zmd->dev, "Read primary super block failed"); return ret; } - ret = dmz_check_sb(zmd, zmd->sb[0].sb); + ret = dmz_check_sb(zmd, 0); /* Read and check secondary super 
block */ if (ret == 0) { sb_good[0] = true; - zmd->sb[1].block = zmd->sb[0].block + - (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); + if (!zmd->sb[1].zone) + zmd->sb[1].zone = zmd->sb[0].zone + zmd->nr_meta_zones; + zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); ret = dmz_get_sb(zmd, 1); } else ret = dmz_lookup_secondary_sb(zmd); @@ -1057,7 +1066,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) return ret; } - ret = dmz_check_sb(zmd, zmd->sb[1].sb); + ret = dmz_check_sb(zmd, 1); if (ret == 0) sb_good[1] = true; @@ -1142,9 +1151,9 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) zmd->nr_useable_zones++; if (dmz_is_rnd(zone)) { zmd->nr_rnd_zones++; - if (!zmd->sb_zone) { + if (!zmd->sb[0].zone) { /* Super block zone */ - zmd->sb_zone = zone; + zmd->sb[0].zone = zone; } } } @@ -2415,7 +2424,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) /* Set metadata zones starting from sb_zone */ for (i = 0; i < zmd->nr_meta_zones << 1; i++) { - zone = dmz_get(zmd, zmd->sb_zone->id + i); + zone = dmz_get(zmd, zmd->sb[0].zone->id + i); if (!dmz_is_rnd(zone)) goto err; set_bit(DMZ_META, &zone->flags); From bf28a3ba098676831bde49e8bc47849727d532a5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:20 +0200 Subject: [PATCH 114/427] dm zoned: store device in struct dmz_sb Store the device together with the superblock so that we don't have to recur to the metadata to find it. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 90 ++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 900b1c1224f5..def836e12dd9 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -122,6 +122,7 @@ enum { */ struct dmz_sb { sector_t block; + struct dmz_dev *dev; struct dmz_mblock *mblk; struct dmz_super *sb; struct dm_zone *zone; @@ -197,6 +198,11 @@ sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) return (sector_t)zone->id << zmd->dev->zone_nr_blocks_shift; } +struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + return &zmd->dev[0]; +} + unsigned int dmz_nr_zones(struct dmz_metadata *zmd) { return zmd->dev->nr_zones; @@ -412,9 +418,10 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, { struct dmz_mblock *mblk, *m; sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; + struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev; struct bio *bio; - if (dmz_bdev_is_dying(zmd->dev)) + if (dmz_bdev_is_dying(dev)) return ERR_PTR(-EIO); /* Get a new block and a BIO to read it */ @@ -450,7 +457,7 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, /* Submit read BIO */ bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, zmd->dev->bdev); + bio_set_dev(bio, dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO); @@ -547,6 +554,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, sector_t mblk_no) { struct dmz_mblock *mblk; + struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev; /* Check rbtree */ spin_lock(&zmd->mblk_lock); @@ -565,7 +573,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, 
TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { dmz_release_mblock(zmd, mblk); - dmz_check_bdev(zmd->dev); + dmz_check_bdev(dev); return ERR_PTR(-EIO); } @@ -589,10 +597,11 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, unsigned int set) { + struct dmz_dev *dev = zmd->sb[set].dev; sector_t block = zmd->sb[set].block + mblk->no; struct bio *bio; - if (dmz_bdev_is_dying(zmd->dev)) + if (dmz_bdev_is_dying(dev)) return -EIO; bio = bio_alloc(GFP_NOIO, 1); @@ -604,7 +613,7 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, set_bit(DMZ_META_WRITING, &mblk->state); bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, zmd->dev->bdev); + bio_set_dev(bio, dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); @@ -617,13 +626,13 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, /* * Read/write a metadata block. 
*/ -static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, - struct page *page) +static int dmz_rdwr_block(struct dmz_dev *dev, int op, + sector_t block, struct page *page) { struct bio *bio; int ret; - if (dmz_bdev_is_dying(zmd->dev)) + if (dmz_bdev_is_dying(dev)) return -EIO; bio = bio_alloc(GFP_NOIO, 1); @@ -631,14 +640,14 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, return -ENOMEM; bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, zmd->dev->bdev); + bio_set_dev(bio, dev->bdev); bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); ret = submit_bio_wait(bio); bio_put(bio); if (ret) - dmz_check_bdev(zmd->dev); + dmz_check_bdev(dev); return ret; } @@ -650,6 +659,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) sector_t block = zmd->sb[set].block; struct dmz_mblock *mblk = zmd->sb[set].mblk; struct dmz_super *sb = zmd->sb[set].sb; + struct dmz_dev *dev = zmd->sb[set].dev; u64 sb_gen = zmd->sb_gen + 1; int ret; @@ -669,9 +679,9 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) sb->crc = 0; sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE)); - ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page); + ret = dmz_rdwr_block(dev, REQ_OP_WRITE, block, mblk->page); if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO, NULL); return ret; } @@ -684,6 +694,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, unsigned int set) { struct dmz_mblock *mblk; + struct dmz_dev *dev = zmd->sb[set].dev; struct blk_plug plug; int ret = 0, nr_mblks_submitted = 0; @@ -705,7 +716,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { clear_bit(DMZ_META_ERROR, &mblk->state); - dmz_check_bdev(zmd->dev); + dmz_check_bdev(dev); ret = -EIO; } 
nr_mblks_submitted--; @@ -713,7 +724,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, /* Flush drive cache (this will also sync data) */ if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO, NULL); return ret; } @@ -750,6 +761,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) { struct dmz_mblock *mblk; struct list_head write_list; + struct dmz_dev *dev; int ret; if (WARN_ON(!zmd)) @@ -763,6 +775,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) * from modifying metadata. */ down_write(&zmd->mblk_sem); + dev = zmd->sb[zmd->mblk_primary].dev; /* * This is called from the target flush work and reclaim work. @@ -770,7 +783,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ dmz_lock_flush(zmd); - if (dmz_bdev_is_dying(zmd->dev)) { + if (dmz_bdev_is_dying(dev)) { ret = -EIO; goto out; } @@ -782,7 +795,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO, NULL); goto err; } @@ -831,7 +844,7 @@ err: list_splice(&write_list, &zmd->mblk_dirty_list); spin_unlock(&zmd->mblk_lock); } - if (!dmz_check_bdev(zmd->dev)) + if (!dmz_check_bdev(dev)) ret = -EIO; goto out; } @@ -842,8 +855,8 @@ err: static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) { struct dmz_super *sb = zmd->sb[set].sb; + struct dmz_dev *dev = zmd->sb[set].dev; unsigned int nr_meta_zones, nr_data_zones; - struct dmz_dev *dev = zmd->dev; u32 crc, stored_crc; u64 gen; @@ -908,8 +921,8 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) */ static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) { - return dmz_rdwr_block(zmd, REQ_OP_READ, zmd->sb[set].block, - zmd->sb[set].mblk->page); + return dmz_rdwr_block(zmd->sb[set].dev, REQ_OP_READ, + zmd->sb[set].block, 
zmd->sb[set].mblk->page); } /* @@ -934,6 +947,7 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) /* Bad first super block: search for the second one */ zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; zmd->sb[1].zone = zmd->sb[0].zone + 1; + zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone); for (i = 0; i < zmd->nr_rnd_zones - 1; i++) { if (dmz_read_sb(zmd, 1) != 0) break; @@ -942,11 +956,13 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) return 0; } zmd->sb[1].block += zone_nr_blocks; + zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone + i); } dmz_free_mblock(zmd, mblk); zmd->sb[1].mblk = NULL; zmd->sb[1].zone = NULL; + zmd->sb[1].dev = NULL; return -EIO; } @@ -987,7 +1003,8 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) struct page *page; int i, ret; - dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set); + dmz_dev_warn(zmd->sb[dst_set].dev, + "Metadata set %u invalid: recovering", dst_set); if (dst_set == 0) zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); @@ -1000,11 +1017,11 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) /* Copy metadata blocks */ for (i = 1; i < zmd->nr_meta_blocks; i++) { - ret = dmz_rdwr_block(zmd, REQ_OP_READ, + ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ, zmd->sb[src_set].block + i, page); if (ret) goto out; - ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, + ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE, zmd->sb[dst_set].block + i, page); if (ret) goto out; @@ -1043,9 +1060,10 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* Read and check the primary super block */ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); + zmd->sb[0].dev = dmz_zone_to_dev(zmd, zmd->sb[0].zone); ret = dmz_get_sb(zmd, 0); if (ret) { - dmz_dev_err(zmd->dev, "Read primary super block failed"); + dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed"); return ret; } @@ -1057,12 +1075,13 @@ 
static int dmz_load_sb(struct dmz_metadata *zmd) if (!zmd->sb[1].zone) zmd->sb[1].zone = zmd->sb[0].zone + zmd->nr_meta_zones; zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); + zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone); ret = dmz_get_sb(zmd, 1); } else ret = dmz_lookup_secondary_sb(zmd); if (ret) { - dmz_dev_err(zmd->dev, "Read secondary super block failed"); + dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed"); return ret; } @@ -1078,17 +1097,25 @@ static int dmz_load_sb(struct dmz_metadata *zmd) if (sb_good[0]) sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen); - else + else { ret = dmz_recover_mblocks(zmd, 0); + if (ret) { + dmz_dev_err(zmd->sb[0].dev, + "Recovery of superblock 0 failed"); + return -EIO; + } + } if (sb_good[1]) sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen); - else + else { ret = dmz_recover_mblocks(zmd, 1); - if (ret) { - dmz_dev_err(zmd->dev, "Recovery failed"); - return -EIO; + if (ret) { + dmz_dev_err(zmd->sb[1].dev, + "Recovery of superblock 1 failed"); + return -EIO; + } } if (sb_gen[0] >= sb_gen[1]) { @@ -1099,7 +1126,8 @@ static int dmz_load_sb(struct dmz_metadata *zmd) zmd->mblk_primary = 1; } - dmz_dev_debug(zmd->dev, "Using super block %u (gen %llu)", + dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev, + "Using super block %u (gen %llu)", zmd->mblk_primary, zmd->sb_gen); return 0; From 368205601375bbfb41b07ec8295eab208b6fced5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:21 +0200 Subject: [PATCH 115/427] dm zoned: move fields from struct dmz_dev to dmz_metadata Move fields from the device structure into the metadata structure and provide accessor functions. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 88 +++++++++++++++++++++++----------- drivers/md/dm-zoned-reclaim.c | 8 ++-- drivers/md/dm-zoned-target.c | 48 +++++++++---------- drivers/md/dm-zoned.h | 14 +++--- 4 files changed, 95 insertions(+), 63 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index def836e12dd9..b844ff02ae7b 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -138,9 +138,16 @@ struct dmz_metadata { unsigned int zone_nr_bitmap_blocks; unsigned int zone_bits_per_mblk; + sector_t zone_nr_blocks; + sector_t zone_nr_blocks_shift; + + sector_t zone_nr_sectors; + sector_t zone_nr_sectors_shift; + unsigned int nr_bitmap_blocks; unsigned int nr_map_blocks; + unsigned int nr_zones; unsigned int nr_useable_zones; unsigned int nr_meta_blocks; unsigned int nr_meta_zones; @@ -190,12 +197,12 @@ struct dmz_metadata { */ sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)zone->id << zmd->dev->zone_nr_sectors_shift; + return (sector_t)zone->id << zmd->zone_nr_sectors_shift; } sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)zone->id << zmd->dev->zone_nr_blocks_shift; + return (sector_t)zone->id << zmd->zone_nr_blocks_shift; } struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone) @@ -203,9 +210,29 @@ struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone) return &zmd->dev[0]; } +unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_blocks; +} + +unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_blocks_shift; +} + +unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_sectors; +} + +unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd) +{ + return 
zmd->zone_nr_sectors_shift; +} + unsigned int dmz_nr_zones(struct dmz_metadata *zmd) { - return zmd->dev->nr_zones; + return zmd->nr_zones; } unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) @@ -882,8 +909,8 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) return -ENXIO; } - nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + dev->zone_nr_blocks - 1) - >> dev->zone_nr_blocks_shift; + nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1) + >> zmd->zone_nr_blocks_shift; if (!nr_meta_zones || nr_meta_zones >= zmd->nr_rnd_zones) { dmz_dev_err(dev, "Invalid number of metadata blocks"); @@ -932,7 +959,7 @@ static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) */ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) { - unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks; + unsigned int zone_nr_blocks = zmd->zone_nr_blocks; struct dmz_mblock *mblk; int i; @@ -1143,7 +1170,7 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) struct dmz_dev *dev = zmd->dev; /* Ignore the eventual last runt (smaller) zone */ - if (blkz->len != dev->zone_nr_sectors) { + if (blkz->len != zmd->zone_nr_sectors) { if (blkz->start + blkz->len == dev->capacity) return 0; return -ENXIO; @@ -1208,19 +1235,24 @@ static int dmz_init_zones(struct dmz_metadata *zmd) int ret; /* Init */ - zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3; + zmd->zone_nr_sectors = dev->zone_nr_sectors; + zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors); + zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors); + zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks); + zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3; zmd->zone_nr_bitmap_blocks = max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT); - zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks, + zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks, DMZ_BLOCK_SIZE_BITS); /* Allocate zone array */ - zmd->zones = 
kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); + zmd->nr_zones = dev->nr_zones; + zmd->zones = kcalloc(zmd->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); if (!zmd->zones) return -ENOMEM; dmz_dev_info(dev, "Using %zu B for zone information", - sizeof(struct dm_zone) * dev->nr_zones); + sizeof(struct dm_zone) * zmd->nr_zones); /* * Get zone information and initialize zone descriptors. At the same @@ -1339,7 +1371,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, dmz_start_sect(zmd, zone), - dev->zone_nr_sectors, GFP_NOIO); + zmd->zone_nr_sectors, GFP_NOIO); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", zone->id, ret); @@ -1393,7 +1425,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) if (dzone_id == DMZ_MAP_UNMAPPED) goto next; - if (dzone_id >= dev->nr_zones) { + if (dzone_id >= zmd->nr_zones) { dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u", chunk, dzone_id); return -EIO; @@ -1414,7 +1446,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) if (bzone_id == DMZ_MAP_UNMAPPED) goto next; - if (bzone_id >= dev->nr_zones) { + if (bzone_id >= zmd->nr_zones) { dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u", chunk, bzone_id); return -EIO; @@ -1446,7 +1478,7 @@ next: * fully initialized. All remaining zones are unmapped data * zones. Finish initializing those here. 
*/ - for (i = 0; i < dev->nr_zones; i++) { + for (i = 0; i < zmd->nr_zones; i++) { dzone = dmz_get(zmd, i); if (dmz_is_meta(dzone)) continue; @@ -1990,7 +2022,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, sector_t chunk_block = 0; /* Get the zones bitmap blocks */ - while (chunk_block < zmd->dev->zone_nr_blocks) { + while (chunk_block < zmd->zone_nr_blocks) { from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block); if (IS_ERR(from_mblk)) return PTR_ERR(from_mblk); @@ -2025,7 +2057,7 @@ int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, int ret; /* Get the zones bitmap blocks */ - while (chunk_block < zmd->dev->zone_nr_blocks) { + while (chunk_block < zmd->zone_nr_blocks) { /* Get a valid region from the source zone */ ret = dmz_first_valid_block(zmd, from_zone, &chunk_block); if (ret <= 0) @@ -2049,7 +2081,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, sector_t chunk_block, unsigned int nr_blocks) { unsigned int count, bit, nr_bits; - unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks; + unsigned int zone_nr_blocks = zmd->zone_nr_blocks; struct dmz_mblock *mblk; unsigned int n = 0; @@ -2136,7 +2168,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks", zone->id, (u64)chunk_block, nr_blocks); - WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks); + WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks); while (nr_blocks) { /* Get bitmap block */ @@ -2180,7 +2212,7 @@ static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone, struct dmz_mblock *mblk; int ret; - WARN_ON(chunk_block >= zmd->dev->zone_nr_blocks); + WARN_ON(chunk_block >= zmd->zone_nr_blocks); /* Get bitmap block */ mblk = dmz_get_bitmap(zmd, zone, chunk_block); @@ -2210,7 +2242,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, unsigned long *bitmap; 
int n = 0; - WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks); + WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks); while (nr_blocks) { /* Get bitmap block */ @@ -2254,7 +2286,7 @@ int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone, /* The block is valid: get the number of valid blocks from block */ return dmz_to_next_set_block(zmd, zone, chunk_block, - zmd->dev->zone_nr_blocks - chunk_block, 0); + zmd->zone_nr_blocks - chunk_block, 0); } /* @@ -2270,7 +2302,7 @@ int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone, int ret; ret = dmz_to_next_set_block(zmd, zone, start_block, - zmd->dev->zone_nr_blocks - start_block, 1); + zmd->zone_nr_blocks - start_block, 1); if (ret < 0) return ret; @@ -2278,7 +2310,7 @@ int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone, *chunk_block = start_block; return dmz_to_next_set_block(zmd, zone, start_block, - zmd->dev->zone_nr_blocks - start_block, 0); + zmd->zone_nr_blocks - start_block, 0); } /* @@ -2317,7 +2349,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone) struct dmz_mblock *mblk; sector_t chunk_block = 0; unsigned int bit, nr_bits; - unsigned int nr_blocks = zmd->dev->zone_nr_blocks; + unsigned int nr_blocks = zmd->zone_nr_blocks; void *bitmap; int n = 0; @@ -2488,7 +2520,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) dmz_dev_info(dev, " %llu 512-byte logical sectors", (u64)dev->capacity); dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", - dev->nr_zones, (u64)dev->zone_nr_sectors); + zmd->nr_zones, (u64)zmd->zone_nr_sectors); dmz_dev_info(dev, " %u metadata zones", zmd->nr_meta_zones * 2); dmz_dev_info(dev, " %u data zones for %u chunks", @@ -2541,7 +2573,7 @@ int dmz_resume_metadata(struct dmz_metadata *zmd) int ret; /* Check zones */ - for (i = 0; i < dev->nr_zones; i++) { + for (i = 0; i < zmd->nr_zones; i++) { zone = dmz_get(zmd, i); if (!zone) { dmz_dev_err(dev, 
"Unable to get zone %u", i); @@ -2569,7 +2601,7 @@ int dmz_resume_metadata(struct dmz_metadata *zmd) i, (u64)zone->wp_block, (u64)wp_block); zone->wp_block = wp_block; dmz_invalidate_blocks(zmd, zone, zone->wp_block, - dev->zone_nr_blocks - zone->wp_block); + zmd->zone_nr_blocks - zone->wp_block); } } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 7f57c4299a2f..5aa5e5130fe8 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -128,7 +128,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, if (dmz_is_seq(src_zone)) end_block = src_zone->wp_block; else - end_block = dev->zone_nr_blocks; + end_block = dmz_zone_nr_blocks(zmd); src_zone_block = dmz_start_block(zmd, src_zone); dst_zone_block = dmz_start_block(zmd, dst_zone); @@ -210,7 +210,7 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block); if (ret == 0) { /* Free the buffer zone */ - dmz_invalidate_blocks(zmd, bzone, 0, zrc->dev->zone_nr_blocks); + dmz_invalidate_blocks(zmd, bzone, 0, dmz_zone_nr_blocks(zmd)); dmz_lock_map(zmd); dmz_unmap_zone(zmd, bzone); dmz_unlock_zone_reclaim(dzone); @@ -252,7 +252,7 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) * Free the data zone and remap the chunk to * the buffer zone. 
*/ - dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks); + dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd)); dmz_lock_map(zmd); dmz_unmap_zone(zmd, bzone); dmz_unmap_zone(zmd, dzone); @@ -305,7 +305,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_unlock_map(zmd); } else { /* Free the data zone and remap the chunk */ - dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks); + dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd)); dmz_lock_map(zmd); dmz_unmap_zone(zmd, dzone); dmz_unlock_zone_reclaim(dzone); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 859ccc30ba7f..68c5684d7b01 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -165,7 +165,8 @@ static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio, static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, struct bio *bio) { - sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio)); + struct dmz_metadata *zmd = dmz->metadata; + sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio)); unsigned int nr_blocks = dmz_bio_blocks(bio); sector_t end_block = chunk_block + nr_blocks; struct dm_zone *rzone, *bzone; @@ -178,7 +179,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, } dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(dmz->dev, bio), + (unsigned long long)dmz_bio_chunk(zmd, bio), (dmz_is_rnd(zone) ? 
"RND" : "SEQ"), zone->id, (unsigned long long)chunk_block, nr_blocks); @@ -189,7 +190,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, nr_blocks = 0; if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) { /* Test block validity in the data zone */ - ret = dmz_block_valid(dmz->metadata, zone, chunk_block); + ret = dmz_block_valid(zmd, zone, chunk_block); if (ret < 0) return ret; if (ret > 0) { @@ -204,7 +205,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, * Check the buffer zone, if there is one. */ if (!nr_blocks && bzone) { - ret = dmz_block_valid(dmz->metadata, bzone, chunk_block); + ret = dmz_block_valid(zmd, bzone, chunk_block); if (ret < 0) return ret; if (ret > 0) { @@ -308,14 +309,15 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz, static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone, struct bio *bio) { - sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio)); + struct dmz_metadata *zmd = dmz->metadata; + sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio)); unsigned int nr_blocks = dmz_bio_blocks(bio); if (!zone) return -ENOSPC; dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(dmz->dev, bio), + (unsigned long long)dmz_bio_chunk(zmd, bio), (dmz_is_rnd(zone) ? 
"RND" : "SEQ"), zone->id, (unsigned long long)chunk_block, nr_blocks); @@ -345,7 +347,7 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, struct dmz_metadata *zmd = dmz->metadata; sector_t block = dmz_bio_block(bio); unsigned int nr_blocks = dmz_bio_blocks(bio); - sector_t chunk_block = dmz_chunk_block(dmz->dev, block); + sector_t chunk_block = dmz_chunk_block(zmd, block); int ret = 0; /* For unmapped chunks, there is nothing to do */ @@ -356,7 +358,7 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, return -EROFS; dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(dmz->dev, bio), + (unsigned long long)dmz_bio_chunk(zmd, bio), zone->id, (unsigned long long)chunk_block, nr_blocks); @@ -402,7 +404,7 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, * mapping for read and discard. If a mapping is obtained, + the zone returned will be set to active state. 
*/ - zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(dmz->dev, bio), + zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio), bio_op(bio)); if (IS_ERR(zone)) { ret = PTR_ERR(zone); @@ -525,7 +527,7 @@ static void dmz_flush_work(struct work_struct *work) */ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) { - unsigned int chunk = dmz_bio_chunk(dmz->dev, bio); + unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio); struct dm_chunk_work *cw; int ret = 0; @@ -618,6 +620,7 @@ bool dmz_check_bdev(struct dmz_dev *dmz_dev) static int dmz_map(struct dm_target *ti, struct bio *bio) { struct dmz_target *dmz = ti->private; + struct dmz_metadata *zmd = dmz->metadata; struct dmz_dev *dev = dmz->dev; struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); sector_t sector = bio->bi_iter.bi_sector; @@ -630,8 +633,8 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", bio_op(bio), (unsigned long long)sector, nr_sectors, - (unsigned long long)dmz_bio_chunk(dmz->dev, bio), - (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)), + (unsigned long long)dmz_bio_chunk(zmd, bio), + (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)), (unsigned int)dmz_bio_blocks(bio)); bio_set_dev(bio, dev->bdev); @@ -659,16 +662,16 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) } /* Split zone BIOs to fit entirely into a zone */ - chunk_sector = sector & (dev->zone_nr_sectors - 1); - if (chunk_sector + nr_sectors > dev->zone_nr_sectors) - dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector); + chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1); + if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd)) + dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector); /* Now ready to handle this BIO */ ret = dmz_queue_chunk_work(dmz, bio); if (ret) { dmz_dev_debug(dmz->dev, "BIO op %d, can't process chunk 
%llu, err %i\n", - bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio), + bio_op(bio), (u64)dmz_bio_chunk(zmd, bio), ret); return DM_MAPIO_REQUEUE; } @@ -722,10 +725,6 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) } dev->zone_nr_sectors = blk_queue_zone_sectors(q); - dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors); - - dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors); - dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks); dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk); @@ -790,7 +789,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) } /* Set target (no write same support) */ - ti->max_io_len = dev->zone_nr_sectors << 9; + ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata) << 9; ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_zeroes_bios = 1; @@ -799,7 +798,8 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->discards_supported = true; /* The exposed capacity is the number of chunks that can be mapped */ - ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift; + ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << + dmz_zone_nr_sectors_shift(dmz->metadata); /* Zone BIO */ ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0); @@ -895,7 +895,7 @@ static void dmz_dtr(struct dm_target *ti) static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct dmz_target *dmz = ti->private; - unsigned int chunk_sectors = dmz->dev->zone_nr_sectors; + unsigned int chunk_sectors = dmz_zone_nr_sectors(dmz->metadata); limits->logical_block_size = DMZ_BLOCK_SIZE; limits->physical_block_size = DMZ_BLOCK_SIZE; @@ -960,7 +960,7 @@ static int dmz_iterate_devices(struct dm_target *ti, { struct dmz_target *dmz = ti->private; struct dmz_dev *dev = dmz->dev; - sector_t capacity = dev->capacity & ~(dev->zone_nr_sectors - 1); + sector_t capacity = dev->capacity & ~(dmz_zone_nr_sectors(dmz->metadata) - 1); return fn(ti, dmz->ddev, 0, 
capacity, data); } diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 30781646741a..f997ad62c7b4 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -60,15 +60,11 @@ struct dmz_dev { unsigned int flags; sector_t zone_nr_sectors; - unsigned int zone_nr_sectors_shift; - - sector_t zone_nr_blocks; - sector_t zone_nr_blocks_shift; }; -#define dmz_bio_chunk(dev, bio) ((bio)->bi_iter.bi_sector >> \ - (dev)->zone_nr_sectors_shift) -#define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1)) +#define dmz_bio_chunk(zmd, bio) ((bio)->bi_iter.bi_sector >> \ + dmz_zone_nr_sectors_shift(zmd)) +#define dmz_chunk_block(zmd, b) ((b) & (dmz_zone_nr_blocks(zmd) - 1)) /* Device flags. */ #define DMZ_BDEV_DYING (1 << 0) @@ -197,6 +193,10 @@ unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd); +unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd); +unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd); +unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd); +unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd); /* * Activate a zone (increment its reference count). From 2234e7321dc61f116de1dc913f3ffa7efff02068 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:22 +0200 Subject: [PATCH 116/427] dm zoned: introduce dmz_metadata_label() to format device name Introduce dmz_metadata_label() to format the device-mapper device name and use it instead of the device name of the underlying device. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 11 ++++- drivers/md/dm-zoned-reclaim.c | 15 +++---- drivers/md/dm-zoned-target.c | 74 +++++++++++++++++++--------------- drivers/md/dm-zoned.h | 4 +- 4 files changed, 62 insertions(+), 42 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index b844ff02ae7b..7cda48683c0b 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -134,6 +134,8 @@ struct dmz_sb { struct dmz_metadata { struct dmz_dev *dev; + char devname[BDEVNAME_SIZE]; + sector_t zone_bitmap_size; unsigned int zone_nr_bitmap_blocks; unsigned int zone_bits_per_mblk; @@ -260,6 +262,11 @@ unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd) return atomic_read(&zmd->unmap_nr_seq); } +const char *dmz_metadata_label(struct dmz_metadata *zmd) +{ + return (const char *)zmd->devname; +} + /* * Lock/unlock mapping table. * The map lock also protects all the zone lists. @@ -2439,7 +2446,8 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) /* * Initialize the zoned metadata. 
*/ -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) +int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, + const char *devname) { struct dmz_metadata *zmd; unsigned int i; @@ -2450,6 +2458,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) if (!zmd) return -ENOMEM; + strcpy(zmd->devname, devname); zmd->dev = dev; zmd->mblk_rbtree = RB_ROOT; init_rwsem(&zmd->mblk_sem); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 5aa5e5130fe8..699c4145306e 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -480,15 +480,16 @@ static void dmz_reclaim_work(struct work_struct *work) zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_rnd / 2); } - dmz_dev_debug(zrc->dev, - "Reclaim (%u): %s, %u%% free rnd zones (%u/%u)", - zrc->kc_throttle.throttle, - (dmz_target_idle(zrc) ? "Idle" : "Busy"), - p_unmap_rnd, nr_unmap_rnd, nr_rnd); + DMDEBUG("(%s): Reclaim (%u): %s, %u%% free rnd zones (%u/%u)", + dmz_metadata_label(zmd), + zrc->kc_throttle.throttle, + (dmz_target_idle(zrc) ? 
"Idle" : "Busy"), + p_unmap_rnd, nr_unmap_rnd, nr_rnd); ret = dmz_do_reclaim(zrc); if (ret) { - dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret); + DMDEBUG("(%s): Reclaim error %d\n", + dmz_metadata_label(zmd), ret); if (!dmz_check_bdev(zrc->dev)) return; } @@ -524,7 +525,7 @@ int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd, /* Reclaim work */ INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work); zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM, - dev->name); + dmz_metadata_label(zmd)); if (!zrc->wq) { ret = -ENOMEM; goto err; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 68c5684d7b01..ba5b8c507c98 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -178,11 +178,12 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, return 0; } - dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(zmd, bio), - (dmz_is_rnd(zone) ? "RND" : "SEQ"), - zone->id, - (unsigned long long)chunk_block, nr_blocks); + DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks", + dmz_metadata_label(zmd), + (unsigned long long)dmz_bio_chunk(zmd, bio), + (dmz_is_rnd(zone) ? "RND" : "SEQ"), + zone->id, + (unsigned long long)chunk_block, nr_blocks); /* Check block validity to determine the read location */ bzone = zone->bzone; @@ -316,11 +317,12 @@ static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone, if (!zone) return -ENOSPC; - dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(zmd, bio), - (dmz_is_rnd(zone) ? "RND" : "SEQ"), - zone->id, - (unsigned long long)chunk_block, nr_blocks); + DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks", + dmz_metadata_label(zmd), + (unsigned long long)dmz_bio_chunk(zmd, bio), + (dmz_is_rnd(zone) ? 
"RND" : "SEQ"), + zone->id, + (unsigned long long)chunk_block, nr_blocks); if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) { /* @@ -357,10 +359,11 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, if (dmz_is_readonly(zone)) return -EROFS; - dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(zmd, bio), - zone->id, - (unsigned long long)chunk_block, nr_blocks); + DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks", + dmz_metadata_label(dmz->metadata), + (unsigned long long)dmz_bio_chunk(zmd, bio), + zone->id, + (unsigned long long)chunk_block, nr_blocks); /* * Invalidate blocks in the data zone and its @@ -429,8 +432,8 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, ret = dmz_handle_discard(dmz, zone, bio); break; default: - dmz_dev_err(dmz->dev, "Unsupported BIO operation 0x%x", - bio_op(bio)); + DMERR("(%s): Unsupported BIO operation 0x%x", + dmz_metadata_label(dmz->metadata), bio_op(bio)); ret = -EIO; } @@ -504,7 +507,8 @@ static void dmz_flush_work(struct work_struct *work) /* Flush dirty metadata blocks */ ret = dmz_flush_metadata(dmz->metadata); if (ret) - dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret); + DMDEBUG("(%s): Metadata flush failed, rc=%d\n", + dmz_metadata_label(dmz->metadata), ret); /* Process queued flush requests */ while (1) { @@ -631,11 +635,12 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) if (dmz_bdev_is_dying(dmz->dev)) return DM_MAPIO_KILL; - dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", - bio_op(bio), (unsigned long long)sector, nr_sectors, - (unsigned long long)dmz_bio_chunk(zmd, bio), - (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)), - (unsigned int)dmz_bio_blocks(bio)); + DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", + dmz_metadata_label(zmd), + bio_op(bio), (unsigned long 
long)sector, nr_sectors, + (unsigned long long)dmz_bio_chunk(zmd, bio), + (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)), + (unsigned int)dmz_bio_blocks(bio)); bio_set_dev(bio, dev->bdev); @@ -669,10 +674,10 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) /* Now ready to handle this BIO */ ret = dmz_queue_chunk_work(dmz, bio); if (ret) { - dmz_dev_debug(dmz->dev, - "BIO op %d, can't process chunk %llu, err %i\n", - bio_op(bio), (u64)dmz_bio_chunk(zmd, bio), - ret); + DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i\n", + dmz_metadata_label(zmd), + bio_op(bio), (u64)dmz_bio_chunk(zmd, bio), + ret); return DM_MAPIO_REQUEUE; } @@ -782,7 +787,8 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Initialize metadata */ dev = dmz->dev; - ret = dmz_ctr_metadata(dev, &dmz->metadata); + ret = dmz_ctr_metadata(dev, &dmz->metadata, + dm_table_device_name(ti->table)); if (ret) { ti->error = "Metadata initialization failed"; goto err_dev; @@ -811,8 +817,9 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Chunk BIO work */ mutex_init(&dmz->chunk_lock); INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO); - dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND, - 0, dev->name); + dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0, + dmz_metadata_label(dmz->metadata)); if (!dmz->chunk_wq) { ti->error = "Create chunk workqueue failed"; ret = -ENOMEM; @@ -824,7 +831,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) bio_list_init(&dmz->flush_list); INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work); dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM, - dev->name); + dmz_metadata_label(dmz->metadata)); if (!dmz->flush_wq) { ti->error = "Create flush workqueue failed"; ret = -ENOMEM; @@ -839,9 +846,10 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err_fwq; } - 
dmz_dev_info(dev, "Target device: %llu 512-byte logical sectors (%llu blocks)", - (unsigned long long)ti->len, - (unsigned long long)dmz_sect2blk(ti->len)); + DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)", + dmz_metadata_label(dmz->metadata), + (unsigned long long)ti->len, + (unsigned long long)dmz_sect2blk(ti->len)); return 0; err_fwq: diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index f997ad62c7b4..dd768dc60341 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -163,7 +163,8 @@ struct dmz_reclaim; /* * Functions defined in dm-zoned-metadata.c */ -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd); +int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd, + const char *devname); void dmz_dtr_metadata(struct dmz_metadata *zmd); int dmz_resume_metadata(struct dmz_metadata *zmd); @@ -174,6 +175,7 @@ void dmz_unlock_metadata(struct dmz_metadata *zmd); void dmz_lock_flush(struct dmz_metadata *zmd); void dmz_unlock_flush(struct dmz_metadata *zmd); int dmz_flush_metadata(struct dmz_metadata *zmd); +const char *dmz_metadata_label(struct dmz_metadata *zmd); sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone); From d0e21ce40c7a41df43b70b863cc64395c7787abd Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:23 +0200 Subject: [PATCH 117/427] dm zoned: Introduce dmz_dev_is_dying() and dmz_check_dev() Introduce accessors dmz_dev_is_dying() and dmz_check_dev() to avoid having to reference the devices directly. 
Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 14 ++++++++++++-- drivers/md/dm-zoned-reclaim.c | 4 ++-- drivers/md/dm-zoned-target.c | 2 +- drivers/md/dm-zoned.h | 3 +++ 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 7cda48683c0b..426af738f1ca 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -267,6 +267,16 @@ const char *dmz_metadata_label(struct dmz_metadata *zmd) return (const char *)zmd->devname; } +bool dmz_check_dev(struct dmz_metadata *zmd) +{ + return dmz_check_bdev(&zmd->dev[0]); +} + +bool dmz_dev_is_dying(struct dmz_metadata *zmd) +{ + return dmz_bdev_is_dying(&zmd->dev[0]); +} + /* * Lock/unlock mapping table. * The map lock also protects all the zone lists. @@ -1719,7 +1729,7 @@ again: /* Allocate a random zone */ dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); if (!dzone) { - if (dmz_bdev_is_dying(zmd->dev)) { + if (dmz_dev_is_dying(zmd)) { dzone = ERR_PTR(-EIO); goto out; } @@ -1820,7 +1830,7 @@ again: /* Allocate a random zone */ bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); if (!bzone) { - if (dmz_bdev_is_dying(zmd->dev)) { + if (dmz_dev_is_dying(zmd)) { bzone = ERR_PTR(-EIO); goto out; } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 699c4145306e..5daede0daf92 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -455,7 +455,7 @@ static void dmz_reclaim_work(struct work_struct *work) unsigned int p_unmap_rnd; int ret; - if (dmz_bdev_is_dying(zrc->dev)) + if (dmz_dev_is_dying(zmd)) return; if (!dmz_should_reclaim(zrc)) { @@ -490,7 +490,7 @@ static void dmz_reclaim_work(struct work_struct *work) if (ret) { DMDEBUG("(%s): Reclaim error %d\n", dmz_metadata_label(zmd), ret); - if (!dmz_check_bdev(zrc->dev)) + if (!dmz_check_dev(zmd)) return; } diff --git 
a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index ba5b8c507c98..b32e791b8a5c 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -632,7 +632,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) sector_t chunk_sector; int ret; - if (dmz_bdev_is_dying(dmz->dev)) + if (dmz_dev_is_dying(zmd)) return DM_MAPIO_KILL; DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index dd768dc60341..e0883df8a903 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -181,6 +181,9 @@ sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_chunks(struct dmz_metadata *zmd); +bool dmz_check_dev(struct dmz_metadata *zmd); +bool dmz_dev_is_dying(struct dmz_metadata *zmd); + #define DMZ_ALLOC_RND 0x01 #define DMZ_ALLOC_RECLAIM 0x02 From 1574051e52cb4b5b7f7509cfd729b76ca1117808 Mon Sep 17 00:00:00 2001 From: Xiaochun Lee Date: Thu, 14 May 2020 23:31:07 -0400 Subject: [PATCH 118/427] x86/PCI: Mark Intel C620 MROMs as having non-compliant BARs The Intel C620 Platform Controller Hub has MROM functions that have non-PCI registers (undocumented in the public spec) where BAR 0 is supposed to be, which results in messages like this: pci 0000:00:11.0: [Firmware Bug]: reg 0x30: invalid BAR (can't size) Mark these MROM functions as having non-compliant BARs so we don't try to probe any of them. There are no other BARs on these devices. See the Intel C620 Series Chipset Platform Controller Hub Datasheet, May 2019, Document Number 336067-007US, sec 2.1, 35.5, 35.6. 
[bhelgaas: commit log, add 0xa26d] Link: https://lore.kernel.org/r/1589513467-17070-1-git-send-email-lixiaochun.2888@163.com Signed-off-by: Xiaochun Lee Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org --- arch/x86/pci/fixup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index e723559c386a..0c67a5a94de3 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -572,6 +572,10 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ec, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ed, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26c, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26d, pci_invalid_bar); /* * Device [1022:7808] From f044baaff1eb7ae5aa7a36f1b7ad5bd8eeb672c4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 15 May 2020 14:31:16 -0500 Subject: [PATCH 119/427] PCI/PM: Adjust pcie_wait_for_link_delay() for caller delay The caller of pcie_wait_for_link_delay() specifies the time to wait after the link becomes active. When the downstream port doesn't support link active reporting, obviously we can't tell when the link becomes active, so we waited the worst-case time (1000 ms) plus 100 ms, ignoring the delay from the caller. Instead, wait for 1000 ms + the delay from the caller. 
Fixes: 4827d63891b6 ("PCI/PM: Add pcie_wait_for_link_delay()") Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index dfa7ec008963..a4efc7e0061f 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4675,10 +4675,10 @@ static bool pcie_wait_for_link_delay(struct pci_dev *pdev, bool active, /* * Some controllers might not implement link active reporting. In this - * case, we wait for 1000 + 100 ms. + * case, we wait for 1000 ms + any delay requested by the caller. */ if (!pdev->link_active_reporting) { - msleep(1100); + msleep(timeout + delay); return true; } From ec411e02b7a2e785a4ed9ed283207cd14f48699d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 14 May 2020 16:30:43 +0300 Subject: [PATCH 120/427] PCI/PM: Assume ports without DLL Link Active train links in 100 ms Kai-Heng Feng reported that it takes a long time (> 1 s) to resume Thunderbolt-connected devices from both runtime suspend and system sleep (s2idle). This was because some Downstream Ports that support > 5 GT/s do not also support Data Link Layer Link Active reporting. Per PCIe r5.0 sec 6.6.1: With a Downstream Port that supports Link speeds greater than 5.0 GT/s, software must wait a minimum of 100 ms after Link training completes before sending a Configuration Request to the device immediately below that Port. Software can determine when Link training completes by polling the Data Link Layer Link Active bit or by setting up an associated interrupt (see Section 6.7.3.3). Sec 7.5.3.6 requires such Ports to support DLL Link Active reporting, but at least the Intel JHL6240 Thunderbolt 3 Bridge [8086:15c0] and the Intel JHL7540 Thunderbolt 3 Bridge [8086:15ea] do not. Previously we tried to wait for Link training to complete, but since there was no DLL Link Active reporting, all we could do was wait the worst-case 1000 ms, then another 100 ms. 
Instead of using the supported speeds to determine whether to wait for Link training, check whether the port supports DLL Link Active reporting. The Ports in question do not, so we'll wait only the 100 ms required for Ports that support Link speeds <= 5 GT/s. This of course assumes these Ports always train the Link within 100 ms even if they are operating at > 5 GT/s, which is not required by the spec. [bhelgaas: commit log, comment] Link: https://bugzilla.kernel.org/show_bug.cgi?id=206837 Link: https://lore.kernel.org/r/20200514133043.27429-1-mika.westerberg@linux.intel.com Reported-by: Kai-Heng Feng Tested-by: Kai-Heng Feng Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index a4efc7e0061f..d4758518a97b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4662,7 +4662,8 @@ static int pci_pm_reset(struct pci_dev *dev, int probe) * pcie_wait_for_link_delay - Wait until link is active or inactive * @pdev: Bridge device * @active: waiting for active or inactive? - * @delay: Delay to wait after link has become active (in ms) + * @delay: Delay to wait after link has become active (in ms). Specify %0 + * for no delay. * * Use this to wait till link becomes active or inactive. 
*/ @@ -4703,7 +4704,7 @@ static bool pcie_wait_for_link_delay(struct pci_dev *pdev, bool active, msleep(10); timeout -= 10; } - if (active && ret) + if (active && ret && delay) msleep(delay); else if (ret != active) pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n", @@ -4824,17 +4825,28 @@ void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) if (!pcie_downstream_port(dev)) return; - if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) { - pci_dbg(dev, "waiting %d ms for downstream link\n", delay); - msleep(delay); - } else { - pci_dbg(dev, "waiting %d ms for downstream link, after activation\n", - delay); - if (!pcie_wait_for_link_delay(dev, true, delay)) { + /* + * Per PCIe r5.0, sec 6.6.1, for downstream ports that support + * speeds > 5 GT/s, we must wait for link training to complete + * before the mandatory delay. + * + * We can only tell when link training completes via DLL Link + * Active, which is required for downstream ports that support + * speeds > 5 GT/s (sec 7.5.3.6). Unfortunately some common + * devices do not implement Link Active reporting even when it's + * required, so we'll check for that directly instead of checking + * the supported link speed. We assume devices without Link Active + * reporting can train in 100 ms regardless of speed. + */ + if (dev->link_active_reporting) { + pci_dbg(dev, "waiting for link to train\n"); + if (!pcie_wait_for_link_delay(dev, true, 0)) { /* Did not train, no need to wait any further */ return; } } + pci_dbg(child, "waiting %d ms to become accessible\n", delay); + msleep(delay); if (!pci_device_is_present(child)) { pci_dbg(child, "waiting additional %d ms to become accessible\n", delay); From 59721d4eb7f66f27440ad74f875b97e64133ee3b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 28 Apr 2020 01:03:57 +0900 Subject: [PATCH 121/427] kbuild: warn if always, hostprogs-y, or hostprogs-m is used always, hostprogs-y, and hostprogs-m are deprecated. 
There is no user in upstream code, but I will keep them for external modules. I want to remove them entirely someday. Prompt downstream users for the migration. Signed-off-by: Masahiro Yamada --- scripts/Makefile.lib | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index cd52a8c6428f..52299d5dba28 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -4,8 +4,18 @@ asflags-y += $(EXTRA_AFLAGS) ccflags-y += $(EXTRA_CFLAGS) cppflags-y += $(EXTRA_CPPFLAGS) ldflags-y += $(EXTRA_LDFLAGS) +ifneq ($(always),) +$(warning 'always' is deprecated. Please use 'always-y' instead) always-y += $(always) -hostprogs += $(hostprogs-y) $(hostprogs-m) +endif +ifneq ($(hostprogs-y),) +$(warning 'hostprogs-y' is deprecated. Please use 'hostprogs' instead) +hostprogs += $(hostprogs-y) +endif +ifneq ($(hostprogs-m),) +$(warning 'hostprogs-m' is deprecated. Please use 'hostprogs' instead) +hostprogs += $(hostprogs-m) +endif # flags that take effect in current and sub directories KBUILD_AFLAGS += $(subdir-asflags-y) From 9371f86ecb60f6f1f120e3d93fe892bbb70d04c0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:13 +0900 Subject: [PATCH 122/427] bpfilter: match bit size of bpfilter_umh to that of the kernel bpfilter_umh is built for the default machine bit of the compiler, which may not match to the bit size of the kernel. This happens in the scenario below: You can use biarch GCC that defaults to 64-bit for building the 32-bit kernel. In this case, Kbuild passes -m32 to teach the compiler to produce 32-bit kernel space objects. However, it is missing when building bpfilter_umh. It is built as a 64-bit ELF, and then embedded into the 32-bit kernel. The 32-bit kernel and 64-bit umh is a bad combination. In theory, we can have 32-bit umh running on 64-bit kernel, but we do not have a good reason to support such a usecase. The best is to match the bit size between them. 
Pass -m32 or -m64 to the umh build command if it is found in $(KBUILD_CFLAGS). Evaluate CC_CAN_LINK against the kernel bit-size. Signed-off-by: Masahiro Yamada --- init/Kconfig | 4 +++- net/bpfilter/Makefile | 5 +++-- usr/include/Makefile | 4 ++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index ed1d82c9f1df..b2ce83c2e84c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -45,7 +45,9 @@ config CLANG_VERSION default $(shell,$(srctree)/scripts/clang-version.sh $(CC)) config CC_CAN_LINK - def_bool $(success,$(srctree)/scripts/cc-can-link.sh $(CC)) + bool + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(m64-flag)) if 64BIT + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(m32-flag)) config CC_HAS_ASM_GOTO def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC)) diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index 36580301da70..f6209e4827b9 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -5,14 +5,15 @@ hostprogs := bpfilter_umh bpfilter_umh-objs := main.o -KBUILD_HOSTCFLAGS += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi +KBUILD_HOSTCFLAGS += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ + $(filter -m32 -m64, $(KBUILD_CFLAGS)) HOSTCC := $(CC) ifeq ($(CONFIG_BPFILTER_UMH), y) # builtin bpfilter_umh should be compiled with -static # since rootfs isn't mounted at the time of __init # function is called and do_execv won't find elf interpreter -KBUILD_HOSTLDFLAGS += -static +KBUILD_HOSTLDFLAGS += -static $(filter -m32 -m64, $(KBUILD_CFLAGS)) endif $(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh diff --git a/usr/include/Makefile b/usr/include/Makefile index 5a7ee3e5ed86..55362f3ab393 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -8,6 +8,10 @@ # We cannot go as far as adding -Wpedantic since it emits too many warnings. 
UAPI_CFLAGS := -std=c90 -Wall -Werror=implicit-function-declaration +# In theory, we do not care -m32 or -m64 for header compile tests. +# It is here just because CONFIG_CC_CAN_LINK is tested with -m32 or -m64. +UAPI_CFLAGS += $(filter -m32 -m64, $(KBUILD_CFLAGS)) + override c_flags = $(UAPI_CFLAGS) -Wp,-MMD,$(depfile) -I$(objtree)/usr/include # The following are excluded for now because they fail to build. From b1183b6dca3e0d59ce8fa81767def6ea6188e8ec Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 9 May 2020 16:39:15 +0900 Subject: [PATCH 123/427] bpfilter: check if $(CC) can link static libc in Kconfig On Fedora, linking static glibc requires the glibc-static RPM package, which is not part of the glibc-devel package. CONFIG_CC_CAN_LINK does not check the capability of static linking, so you can enable CONFIG_BPFILTER_UMH, then fail to build: HOSTLD net/bpfilter/bpfilter_umh /usr/bin/ld: cannot find -lc collect2: error: ld returned 1 exit status Add CONFIG_CC_CAN_LINK_STATIC, and make CONFIG_BPFILTER_UMH depend on it. 
Reported-by: Valdis Kletnieks Signed-off-by: Masahiro Yamada Acked-by: Alexei Starovoitov --- init/Kconfig | 5 +++++ net/bpfilter/Kconfig | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index b2ce83c2e84c..fb8ab85de7f8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -49,6 +49,11 @@ config CC_CAN_LINK default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(m64-flag)) if 64BIT default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(m32-flag)) +config CC_CAN_LINK_STATIC + bool + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) -static $(m64-flag)) if 64BIT + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) -static $(m32-flag)) + config CC_HAS_ASM_GOTO def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC)) diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig index fed9290e3b41..045144d4a42c 100644 --- a/net/bpfilter/Kconfig +++ b/net/bpfilter/Kconfig @@ -9,7 +9,7 @@ menuconfig BPFILTER if BPFILTER config BPFILTER_UMH tristate "bpfilter kernel module with user mode helper" - depends on CC_CAN_LINK + depends on CC_CAN_LINK_STATIC default m help This builds bpfilter kernel module with embedded user mode helper From 7f3a59db274c3e3d884c785e363a054110f1c266 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:14 +0900 Subject: [PATCH 124/427] kbuild: add infrastructure to build userspace programs Kbuild supports the infrastructure to build host programs, but there was no support to build userspace programs for the target architecture (i.e. the same architecture as the kernel). Sam Ravnborg worked on this in 2014 (https://lkml.org/lkml/2014/7/13/154), but it was not merged. One problem at that time was, there was no good way to know whether $(CC) can link standalone programs. In fact, pre-built kernel.org toolchains [1] are often used for building the kernel, but they do not provide libc. 
Now, we can handle this cleanly because the compiler capability is evaluated at Kconfig time. If $(CC) cannot link standalone programs, the relevant options are hidden by 'depends on CC_CAN_LINK'. The implementation just mimics scripts/Makefile.host. The userspace programs are compiled with the same flags as the host programs. In addition, it uses -m32 or -m64 if it is found in $(KBUILD_CFLAGS). This new syntax has two use cases. - Sample programs Several userspace programs under samples/ include UAPI headers installed in usr/include. Most of them were previously built for the host architecture just to use the 'hostprogs' syntax. However, 'make headers' always works for the target architecture. This caused the arch mismatch in cross-compiling. To fix this distortion, sample code should be built for the target architecture. - Bpfilter net/bpfilter/Makefile compiles bpfilter_umh as the user mode helper, and embeds it into the kernel. Currently, it overrides HOSTCC with CC to use the 'hostprogs' syntax. This hack should go away.
[1]: https://mirrors.edge.kernel.org/pub/tools/crosstool/ Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- Makefile | 13 ++++++++--- scripts/Makefile.build | 6 +++++ scripts/Makefile.clean | 2 +- scripts/Makefile.userprogs | 45 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 scripts/Makefile.userprogs diff --git a/Makefile b/Makefile index 9671fa09c83a..faec37f23c48 100644 --- a/Makefile +++ b/Makefile @@ -406,9 +406,12 @@ else HOSTCC = gcc HOSTCXX = g++ endif -KBUILD_HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 \ - -fomit-frame-pointer -std=gnu89 $(HOST_LFS_CFLAGS) \ - $(HOSTCFLAGS) + +export KBUILD_USERCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \ + -O2 -fomit-frame-pointer -std=gnu89 +export KBUILD_USERLDFLAGS := + +KBUILD_HOSTCFLAGS := $(KBUILD_USERCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS) KBUILD_HOSTCXXFLAGS := -Wall -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS) KBUILD_HOSTLDFLAGS := $(HOST_LFS_LDFLAGS) $(HOSTLDFLAGS) KBUILD_HOSTLDLIBS := $(HOST_LFS_LIBS) $(HOSTLDLIBS) @@ -944,6 +947,10 @@ ifeq ($(CONFIG_RELR),y) LDFLAGS_vmlinux += --pack-dyn-relocs=relr endif +# Align the bit size of userspace programs with the kernel +KBUILD_USERCFLAGS += $(filter -m32 -m64, $(KBUILD_CFLAGS)) +KBUILD_USERLDFLAGS += $(filter -m32 -m64, $(KBUILD_CFLAGS)) + # make the checker run with the right architecture CHECKFLAGS += --arch=$(ARCH) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 9fcbfac15d1d..3665b1a0bc8e 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -50,6 +50,12 @@ ifneq ($(hostprogs)$(hostcxxlibs-y)$(hostcxxlibs-m),) include scripts/Makefile.host endif +# Do not include userprogs rules unless needed. 
+userprogs := $(sort $(userprogs)) +ifneq ($(userprogs),) +include scripts/Makefile.userprogs +endif + ifndef obj $(warning kbuild: Makefile.build is included improperly) endif diff --git a/scripts/Makefile.clean b/scripts/Makefile.clean index 075f0cc2d8d7..e2c76122319d 100644 --- a/scripts/Makefile.clean +++ b/scripts/Makefile.clean @@ -29,7 +29,7 @@ subdir-ymn := $(addprefix $(obj)/,$(subdir-ymn)) __clean-files := $(extra-y) $(extra-m) $(extra-) \ $(always) $(always-y) $(always-m) $(always-) $(targets) $(clean-files) \ - $(hostprogs) $(hostprogs-y) $(hostprogs-m) $(hostprogs-) \ + $(hostprogs) $(hostprogs-y) $(hostprogs-m) $(hostprogs-) $(userprogs) \ $(hostcxxlibs-y) $(hostcxxlibs-m) __clean-files := $(filter-out $(no-clean-files), $(__clean-files)) diff --git a/scripts/Makefile.userprogs b/scripts/Makefile.userprogs new file mode 100644 index 000000000000..fb415297337a --- /dev/null +++ b/scripts/Makefile.userprogs @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Build userspace programs for the target system +# + +# Executables compiled from a single .c file +user-csingle := $(foreach m, $(userprogs), $(if $($(m)-objs),,$(m))) + +# Executables linked based on several .o files +user-cmulti := $(foreach m, $(userprogs), $(if $($(m)-objs),$(m))) + +# Objects compiled from .c files +user-cobjs := $(sort $(foreach m, $(userprogs), $($(m)-objs))) + +user-csingle := $(addprefix $(obj)/, $(user-csingle)) +user-cmulti := $(addprefix $(obj)/, $(user-cmulti)) +user-cobjs := $(addprefix $(obj)/, $(user-cobjs)) + +user_ccflags = -Wp,-MMD,$(depfile) $(KBUILD_USERCFLAGS) $(userccflags) \ + $($(target-stem)-userccflags) +user_ldflags = $(KBUILD_USERLDFLAGS) $(userldflags) $($(target-stem)-userldflags) + +# Create an executable from a single .c file +quiet_cmd_user_cc_c = CC [U] $@ + cmd_user_cc_c = $(CC) $(user_ccflags) $(user_ldflags) -o $@ $< \ + $($(target-stem)-userldlibs) +$(user-csingle): $(obj)/%: $(src)/%.c FORCE + $(call if_changed_dep,user_cc_c) + +# 
Link an executable based on list of .o files +quiet_cmd_user_ld = LD [U] $@ + cmd_user_ld = $(CC) $(user_ldflags) -o $@ \ + $(addprefix $(obj)/, $($(target-stem)-objs)) \ + $($(target-stem)-userldlibs) +$(user-cmulti): FORCE + $(call if_changed,user_ld) +$(call multi_depend, $(user-cmulti), , -objs) + +# Create .o file from a .c file +quiet_cmd_user_cc_o_c = CC [U] $@ + cmd_user_cc_o_c = $(CC) $(user_ccflags) -c -o $@ $< +$(user-cobjs): $(obj)/%.o: $(src)/%.c FORCE + $(call if_changed_dep,user_cc_o_c) + +targets += $(user-csingle) $(user-cmulti) $(user-cobjs) From 8a2cc0505cc4313e6ce2bc8fc1ce3607b410e114 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:15 +0900 Subject: [PATCH 125/427] bpfilter: use 'userprogs' syntax to build bpfilter_umh The user mode helper should be compiled for the same architecture as the kernel. This Makefile reused the 'hostprogs' syntax by overriding HOSTCC with CC. Use the new syntax 'userprogs' to fix the Makefile mess. Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- net/bpfilter/Makefile | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index f6209e4827b9..f23b53294fba 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -3,18 +3,14 @@ # Makefile for the Linux BPFILTER layer. 
# -hostprogs := bpfilter_umh +userprogs := bpfilter_umh bpfilter_umh-objs := main.o -KBUILD_HOSTCFLAGS += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ - $(filter -m32 -m64, $(KBUILD_CFLAGS)) -HOSTCC := $(CC) +userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi -ifeq ($(CONFIG_BPFILTER_UMH), y) -# builtin bpfilter_umh should be compiled with -static +# builtin bpfilter_umh should be linked with -static # since rootfs isn't mounted at the time of __init # function is called and do_execv won't find elf interpreter -KBUILD_HOSTLDFLAGS += -static $(filter -m32 -m64, $(KBUILD_CFLAGS)) -endif +userldflags += -static $(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh From f59e76687742fc2d6226e0d6449ae09c5942e9fc Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:16 +0900 Subject: [PATCH 126/427] samples: seccomp: build sample programs for target architecture These userspace programs include UAPI headers exported to usr/include/. 'make headers' always works for the target architecture (i.e. the same architecture as the kernel), so the sample programs should be built for the target as well. Kbuild now supports 'userprogs' for that. I also guarded the CONFIG option by 'depends on CC_CAN_LINK' because $(CC) may not provide libc. The 'ifndef CROSS_COMPILE' is no longer needed. BTW, the -m31 for s390 is left-over code. Commit 5a79859ae0f3 ("s390: remove 31 bit support") killed it. 
Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- samples/Kconfig | 2 +- samples/seccomp/Makefile | 42 +++------------------------------------- 2 files changed, 4 insertions(+), 40 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 9d236c346de5..8949e9646125 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -126,7 +126,7 @@ config SAMPLE_PIDFD config SAMPLE_SECCOMP bool "Build seccomp sample code" - depends on SECCOMP_FILTER && HEADERS_INSTALL + depends on SECCOMP_FILTER && CC_CAN_LINK && HEADERS_INSTALL help Build samples of seccomp filters using various methods of BPF filter construction. diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile index 89279e8b87df..75916c23e416 100644 --- a/samples/seccomp/Makefile +++ b/samples/seccomp/Makefile @@ -1,44 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 -ifndef CROSS_COMPILE -hostprogs := bpf-fancy dropper bpf-direct user-trap +userprogs := bpf-fancy dropper bpf-direct user-trap -HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include -HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include -HOSTCFLAGS_bpf-helper.o += -I$(objtree)/usr/include -HOSTCFLAGS_bpf-helper.o += -idirafter $(objtree)/include bpf-fancy-objs := bpf-fancy.o bpf-helper.o -HOSTCFLAGS_dropper.o += -I$(objtree)/usr/include -HOSTCFLAGS_dropper.o += -idirafter $(objtree)/include -dropper-objs := dropper.o +userccflags += -I usr/include -HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include -HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include -bpf-direct-objs := bpf-direct.o - -HOSTCFLAGS_user-trap.o += -I$(objtree)/usr/include -HOSTCFLAGS_user-trap.o += -idirafter $(objtree)/include -user-trap-objs := user-trap.o - -# Try to match the kernel target. 
-ifndef CONFIG_64BIT - -# s390 has -m31 flag to build 31 bit binaries -ifndef CONFIG_S390 -MFLAG = -m32 -else -MFLAG = -m31 -endif - -HOSTCFLAGS_bpf-direct.o += $(MFLAG) -HOSTCFLAGS_dropper.o += $(MFLAG) -HOSTCFLAGS_bpf-helper.o += $(MFLAG) -HOSTCFLAGS_bpf-fancy.o += $(MFLAG) -HOSTCFLAGS_user-trap.o += $(MFLAG) -HOSTLDLIBS_bpf-direct += $(MFLAG) -HOSTLDLIBS_bpf-fancy += $(MFLAG) -HOSTLDLIBS_dropper += $(MFLAG) -HOSTLDLIBS_user-trap += $(MFLAG) -endif -always-y := $(hostprogs) -endif +always-y := $(userprogs) From e079a08c60744af54eed7b7f957d6c87b163f25e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:17 +0900 Subject: [PATCH 127/427] kbuild: doc: document the new syntax 'userprogs' Kbuild now supports the syntax 'userprogs' to compile userspace programs for the same architecture as the kernel. Insert the section '5 Userspace Program support' to explain it. I copy-pasted '4 Host Program support' and fixed it up. Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- Documentation/kbuild/makefiles.rst | 177 +++++++++++++++++++++-------- 1 file changed, 132 insertions(+), 45 deletions(-) diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index b80257a03830..2a18aea7c043 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -29,31 +29,37 @@ This document describes the Linux kernel Makefiles. 
--- 4.4 Controlling compiler options for host programs --- 4.5 When host programs are actually built - === 5 Kbuild clean infrastructure + === 5 Userspace Program support + --- 5.1 Simple Userspace Program + --- 5.2 Composite Userspace Programs + --- 5.3 Controlling compiler options for userspace programs + --- 5.4 When userspace programs are actually built - === 6 Architecture Makefiles - --- 6.1 Set variables to tweak the build to the architecture - --- 6.2 Add prerequisites to archheaders: - --- 6.3 Add prerequisites to archprepare: - --- 6.4 List directories to visit when descending - --- 6.5 Architecture-specific boot images - --- 6.6 Building non-kbuild targets - --- 6.7 Commands useful for building a boot image - --- 6.8 Custom kbuild commands - --- 6.9 Preprocessing linker scripts - --- 6.10 Generic header files - --- 6.11 Post-link pass + === 6 Kbuild clean infrastructure - === 7 Kbuild syntax for exported headers - --- 7.1 no-export-headers - --- 7.2 generic-y - --- 7.3 generated-y - --- 7.4 mandatory-y + === 7 Architecture Makefiles + --- 7.1 Set variables to tweak the build to the architecture + --- 7.2 Add prerequisites to archheaders: + --- 7.3 Add prerequisites to archprepare: + --- 7.4 List directories to visit when descending + --- 7.5 Architecture-specific boot images + --- 7.6 Building non-kbuild targets + --- 7.7 Commands useful for building a boot image + --- 7.8 Custom kbuild commands + --- 7.9 Preprocessing linker scripts + --- 7.10 Generic header files + --- 7.11 Post-link pass - === 8 Kbuild Variables - === 9 Makefile language - === 10 Credits - === 11 TODO + === 8 Kbuild syntax for exported headers + --- 8.1 no-export-headers + --- 8.2 generic-y + --- 8.3 generated-y + --- 8.4 mandatory-y + + === 9 Kbuild Variables + === 10 Makefile language + === 11 Credits + === 12 TODO 1 Overview ========== @@ -732,7 +738,88 @@ Both possibilities are described in the following. This will tell kbuild to build lxdialog even if not referenced in any rule. 
-5 Kbuild clean infrastructure +5 Userspace Program support +=========================== + +Just like host programs, Kbuild also supports building userspace executables +for the target architecture (i.e. the same architecture as you are building +the kernel for). + +The syntax is quite similar. The difference is to use "userprogs" instead of +"hostprogs". + +5.1 Simple Userspace Program +---------------------------- + + The following line tells kbuild that the program bpf-direct shall be + built for the target architecture. + + Example:: + + userprogs := bpf-direct + + Kbuild assumes in the above example that bpf-direct is made from a + single C source file named bpf-direct.c located in the same directory + as the Makefile. + +5.2 Composite Userspace Programs +-------------------------------- + + Userspace programs can be made up based on composite objects. + The syntax used to define composite objects for userspace programs is + similar to the syntax used for kernel objects. + $(-objs) lists all objects used to link the final + executable. + + Example:: + + #samples/seccomp/Makefile + userprogs := bpf-fancy + bpf-fancy-objs := bpf-fancy.o bpf-helper.o + + Objects with extension .o are compiled from the corresponding .c + files. In the above example, bpf-fancy.c is compiled to bpf-fancy.o + and bpf-helper.c is compiled to bpf-helper.o. + + Finally, the two .o files are linked to the executable, bpf-fancy. + Note: The syntax -y is not permitted for userspace programs. + +5.3 Controlling compiler options for userspace programs +------------------------------------------------------- + + When compiling userspace programs, it is possible to set specific flags. + The programs will always be compiled utilising $(CC) passed + the options specified in $(KBUILD_USERCFLAGS). + To set flags that will take effect for all userspace programs created + in that Makefile, use the variable userccflags. 
+ + Example:: + + # samples/seccomp/Makefile + userccflags += -I usr/include + + To set specific flags for a single file the following construction + is used: + + Example:: + + bpf-helper-userccflags += -I usr/include + + It is also possible to specify additional options to the linker. + + Example:: + + # net/bpfilter/Makefile + bpfilter_umh-userldflags += -static + + When linking bpfilter_umh, it will be passed the extra option -static. + +5.4 When userspace programs are actually built +---------------------------------------------- + + Same as "When host programs are actually built". + +6 Kbuild clean infrastructure ============================= "make clean" deletes most generated files in the obj tree where the kernel @@ -790,7 +877,7 @@ is not operational at that point. Note 2: All directories listed in core-y, libs-y, drivers-y and net-y will be visited during "make clean". -6 Architecture Makefiles +7 Architecture Makefiles ======================== The top level Makefile sets up the environment and does the preparation, @@ -820,7 +907,7 @@ When kbuild executes, the following steps are followed (roughly): - Preparing initrd images and the like -6.1 Set variables to tweak the build to the architecture +7.1 Set variables to tweak the build to the architecture -------------------------------------------------------- LDFLAGS @@ -967,7 +1054,7 @@ When kbuild executes, the following steps are followed (roughly): KBUILD_VMLINUX_LIBS together specify all the object files used to link vmlinux. -6.2 Add prerequisites to archheaders +7.2 Add prerequisites to archheaders ------------------------------------ The archheaders: rule is used to generate header files that @@ -977,7 +1064,7 @@ When kbuild executes, the following steps are followed (roughly): architecture itself.
-6.3 Add prerequisites to archprepare +7.3 Add prerequisites to archprepare ------------------------------------ The archprepare: rule is used to list prerequisites that need to be @@ -995,7 +1082,7 @@ When kbuild executes, the following steps are followed (roughly): generating offset header files. -6.4 List directories to visit when descending +7.4 List directories to visit when descending --------------------------------------------- An arch Makefile cooperates with the top Makefile to define variables @@ -1030,7 +1117,7 @@ When kbuild executes, the following steps are followed (roughly): drivers-$(CONFIG_OPROFILE) += arch/sparc64/oprofile/ -6.5 Architecture-specific boot images +7.5 Architecture-specific boot images ------------------------------------- An arch Makefile specifies goals that take the vmlinux file, compress @@ -1085,7 +1172,7 @@ When kbuild executes, the following steps are followed (roughly): When "make" is executed without arguments, bzImage will be built. -6.6 Building non-kbuild targets +7.6 Building non-kbuild targets ------------------------------- extra-y @@ -1108,7 +1195,7 @@ When kbuild executes, the following steps are followed (roughly): In this example, extra-y is used to list object files that shall be built, but shall not be linked as part of built-in.a. -6.7 Commands useful for building a boot image +7.7 Commands useful for building a boot image --------------------------------------------- Kbuild provides a few macros that are useful when building a @@ -1211,7 +1298,7 @@ When kbuild executes, the following steps are followed (roughly): targets += $(dtb-y) DTC_FLAGS ?= -p 1024 -6.8 Custom kbuild commands +7.8 Custom kbuild commands -------------------------- When kbuild is executing with KBUILD_VERBOSE=0, then only a shorthand @@ -1241,7 +1328,7 @@ When kbuild executes, the following steps are followed (roughly): will be displayed with "make KBUILD_VERBOSE=0". 
-6.9 Preprocessing linker scripts +7.9 Preprocessing linker scripts -------------------------------- When the vmlinux image is built, the linker script @@ -1274,7 +1361,7 @@ When kbuild executes, the following steps are followed (roughly): The kbuild infrastructure for `*lds` files is used in several architecture-specific files. -6.10 Generic header files +7.10 Generic header files ------------------------- The directory include/asm-generic contains the header files @@ -1283,7 +1370,7 @@ When kbuild executes, the following steps are followed (roughly): to list the file in the Kbuild file. See "7.2 generic-y" for further info on syntax etc. -6.11 Post-link pass +7.11 Post-link pass ------------------- If the file arch/xxx/Makefile.postlink exists, this makefile @@ -1299,7 +1386,7 @@ When kbuild executes, the following steps are followed (roughly): For example, powerpc uses this to check relocation sanity of the linked vmlinux file. -7 Kbuild syntax for exported headers +8 Kbuild syntax for exported headers ------------------------------------ The kernel includes a set of headers that is exported to userspace. @@ -1319,14 +1406,14 @@ A Kbuild file may be defined under arch//include/uapi/asm/ and arch//include/asm/ to list asm files coming from asm-generic. See subsequent chapter for the syntax of the Kbuild file. -7.1 no-export-headers +8.1 no-export-headers --------------------- no-export-headers is essentially used by include/uapi/linux/Kbuild to avoid exporting specific headers (e.g. kvm.h) on architectures that do not support it. It should be avoided as much as possible. -7.2 generic-y +8.2 generic-y ------------- If an architecture uses a verbatim copy of a header from @@ -1356,7 +1443,7 @@ See subsequent chapter for the syntax of the Kbuild file. #include -7.3 generated-y +8.3 generated-y --------------- If an architecture generates other header files alongside generic-y @@ -1370,7 +1457,7 @@ See subsequent chapter for the syntax of the Kbuild file. 
#arch/x86/include/asm/Kbuild generated-y += syscalls_32.h -7.4 mandatory-y +8.4 mandatory-y --------------- mandatory-y is essentially used by include/(uapi/)asm-generic/Kbuild @@ -1380,7 +1467,7 @@ See subsequent chapter for the syntax of the Kbuild file. in arch/$(ARCH)/include/(uapi/)/asm, Kbuild will automatically generate a wrapper of the asm-generic one. -8 Kbuild Variables +9 Kbuild Variables ================== The top Makefile exports the following variables: @@ -1438,8 +1525,8 @@ The top Makefile exports the following variables: command. -9 Makefile language -=================== +10 Makefile language +==================== The kernel Makefiles are designed to be run with GNU Make. The Makefiles use only the documented features of GNU Make, but they do use many @@ -1458,7 +1545,7 @@ time the left-hand side is used. There are some cases where "=" is appropriate. Usually, though, ":=" is the right choice. -10 Credits +11 Credits ========== - Original version made by Michael Elizabeth Chastain, @@ -1466,7 +1553,7 @@ is the right choice. - Updates by Sam Ravnborg - Language QA by Jan Engelhardt -11 TODO +12 TODO ======= - Describe how kbuild supports shipped files with _shipped. From 8a45fe7053a2c33b786706bdcd77d7ccf1531ef7 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2020 12:45:18 +0900 Subject: [PATCH 128/427] samples: uhid: fix warnings in uhid-example Fix warnings seen when building for 32-bit architecture. Use "%zd" and "%zu" for arguments of type ssize_t and size_t to fix the warnings.
Signed-off-by: Sam Ravnborg Signed-off-by: Masahiro Yamada --- samples/uhid/uhid-example.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/uhid/uhid-example.c b/samples/uhid/uhid-example.c index b72d645ce828..015cb06a241e 100644 --- a/samples/uhid/uhid-example.c +++ b/samples/uhid/uhid-example.c @@ -165,7 +165,7 @@ static int uhid_write(int fd, const struct uhid_event *ev) fprintf(stderr, "Cannot write to uhid: %m\n"); return -errno; } else if (ret != sizeof(*ev)) { - fprintf(stderr, "Wrong size written to uhid: %ld != %lu\n", + fprintf(stderr, "Wrong size written to uhid: %zd != %zu\n", ret, sizeof(ev)); return -EFAULT; } else { @@ -236,7 +236,7 @@ static int event(int fd) fprintf(stderr, "Cannot read uhid-cdev: %m\n"); return -errno; } else if (ret != sizeof(ev)) { - fprintf(stderr, "Invalid size read from uhid-dev: %ld != %lu\n", + fprintf(stderr, "Invalid size read from uhid-dev: %zd != %zu\n", ret, sizeof(ev)); return -EFAULT; } From 87ecdf4fc9fd98f2a4156716d75c342b5bd35c74 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:19 +0900 Subject: [PATCH 129/427] samples: uhid: build sample program for target architecture This userspace program includes UAPI headers exported to usr/include/. 'make headers' always works for the target architecture (i.e. the same architecture as the kernel), so the sample program should be built for the target as well. Kbuild now supports 'userprogs' for that. Add the entry to samples/Makefile to put this into the build bot coverage. I also added the CONFIG option guarded by 'depends on CC_CAN_LINK' because $(CC) may not provide libc. 
Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- samples/Kconfig | 6 ++++++ samples/Makefile | 1 + samples/uhid/.gitignore | 2 ++ samples/uhid/Makefile | 9 +++------ 4 files changed, 12 insertions(+), 6 deletions(-) create mode 100644 samples/uhid/.gitignore diff --git a/samples/Kconfig b/samples/Kconfig index 8949e9646125..2560e87c9cae 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -131,6 +131,12 @@ config SAMPLE_SECCOMP Build samples of seccomp filters using various methods of BPF filter construction. +config SAMPLE_UHID + bool "UHID sample" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build UHID sample program. + config SAMPLE_VFIO_MDEV_MTTY tristate "Build VFIO mtty example mediated device sample code -- loadable modules only" depends on VFIO_MDEV_DEVICE && m diff --git a/samples/Makefile b/samples/Makefile index 5ce50ef0f2b2..bdc168405452 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/ obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/ obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/ +subdir-$(CONFIG_SAMPLE_UHID) += uhid obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/ obj-y += vfio-mdev/ subdir-$(CONFIG_SAMPLE_VFS) += vfs diff --git a/samples/uhid/.gitignore b/samples/uhid/.gitignore new file mode 100644 index 000000000000..0e0a5a929f5d --- /dev/null +++ b/samples/uhid/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/uhid-example diff --git a/samples/uhid/Makefile b/samples/uhid/Makefile index 5f44ea40d6d5..9e652fc34103 100644 --- a/samples/uhid/Makefile +++ b/samples/uhid/Makefile @@ -1,8 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -# List of programs to build -hostprogs := uhid-example +userprogs := uhid-example +always-y := $(userprogs) -# Tell kbuild to always build the programs -always-y := $(hostprogs) - -HOSTCFLAGS_uhid-example.o += -I$(objtree)/usr/include +userccflags += -I usr/include From 
82d00a9392ecccc3828656e574bd62354a7259c9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:20 +0900 Subject: [PATCH 130/427] samples: hidraw: build sample program for target architecture This userspace program includes UAPI headers exported to usr/include/. 'make headers' always works for the target architecture (i.e. the same architecture as the kernel), so the sample program should be built for the target as well. Kbuild now supports 'userprogs' for that. I also guarded the CONFIG option by 'depends on CC_CAN_LINK' because $(CC) may not provide libc. Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- samples/Kconfig | 2 +- samples/hidraw/Makefile | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 2560e87c9cae..08f2d025ca05 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -118,7 +118,7 @@ config SAMPLE_CONNECTOR config SAMPLE_HIDRAW bool "hidraw sample" - depends on HEADERS_INSTALL + depends on CC_CAN_LINK && HEADERS_INSTALL config SAMPLE_PIDFD bool "pidfd sample" diff --git a/samples/hidraw/Makefile b/samples/hidraw/Makefile index 8bd25f77671f..d2c77ed60b39 100644 --- a/samples/hidraw/Makefile +++ b/samples/hidraw/Makefile @@ -1,8 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# List of programs to build -hostprogs := hid-example -always-y := $(hostprogs) +userprogs := hid-example +always-y := $(userprogs) -HOSTCFLAGS_hid-example.o += -I$(objtree)/usr/include - -all: hid-example +userccflags += -I usr/include From 37249f5945b77d009519efca9544151ea2a78596 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:21 +0900 Subject: [PATCH 131/427] samples: connector: build sample program for target architecture This userspace program includes UAPI headers exported to usr/include/. 'make headers' always works for the target architecture (i.e. the same architecture as the kernel), so the sample program should be built for the target as well. 
Kbuild now supports 'userprogs' for that. $(CC) can always compile cn_test.o since it is the kernel-space code, but building ucon requires libc. I guarded it by: always-$(CONFIG_CC_CAN_LINK) := $(userprogs) Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- samples/connector/Makefile | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/samples/connector/Makefile b/samples/connector/Makefile index b785cbde5ffa..50cb40e09a7b 100644 --- a/samples/connector/Makefile +++ b/samples/connector/Makefile @@ -1,13 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_SAMPLE_CONNECTOR) += cn_test.o -# List of programs to build -hostprogs := ucon -always-y := $(hostprogs) +userprogs := ucon +always-$(CONFIG_CC_CAN_LINK) := $(userprogs) -HOSTCFLAGS_ucon.o += -I$(objtree)/usr/include - -all: modules - -modules clean: - $(MAKE) -C ../.. M=$(CURDIR) $@ +userccflags += -I usr/include From 28949b84b2cb2473507ec2fed06728f995dd7942 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:22 +0900 Subject: [PATCH 132/427] samples: vfs: build sample programs for target architecture These userspace programs include UAPI headers exported to usr/include/. 'make headers' always works for the target architecture (i.e. the same architecture as the kernel), so the sample programs should be built for the target as well. Kbuild now supports 'userprogs' for that. I also guarded the CONFIG option by 'depends on CC_CAN_LINK' because $(CC) may not provide libc.
Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- samples/Kconfig | 2 +- samples/vfs/Makefile | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 08f2d025ca05..831a7ecd3352 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -184,7 +184,7 @@ config SAMPLE_ANDROID_BINDERFS config SAMPLE_VFS bool "Build example programs that use new VFS system calls" - depends on HEADERS_INSTALL + depends on CC_CAN_LINK && HEADERS_INSTALL help Build example userspace programs that use new VFS system calls such as mount API and statx(). Note that this is restricted to the x86 diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile index 65acdde5c117..00b6824f9237 100644 --- a/samples/vfs/Makefile +++ b/samples/vfs/Makefile @@ -1,10 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -# List of programs to build -hostprogs := \ - test-fsmount \ - test-statx +userprogs := test-fsmount test-statx +always-y := $(userprogs) -always-y := $(hostprogs) - -HOSTCFLAGS_test-fsmount.o += -I$(objtree)/usr/include -HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include +userccflags += -I usr/include From 60fb0b1239719df2ec92606fa037f7c116810762 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:23 +0900 Subject: [PATCH 133/427] samples: pidfd: build sample program for target architecture This userspace program includes UAPI headers exported to usr/include/. 'make headers' always works for the target architecture (i.e. the same architecture as the kernel), so the sample program should be built for the target as well. Kbuild now supports 'userprogs' for that. I also guarded the CONFIG option by 'depends on CC_CAN_LINK' because $(CC) may not provide libc. 
Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- samples/Kconfig | 2 +- samples/pidfd/Makefile | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 831a7ecd3352..c68d391c0602 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -122,7 +122,7 @@ config SAMPLE_HIDRAW config SAMPLE_PIDFD bool "pidfd sample" - depends on HEADERS_INSTALL + depends on CC_CAN_LINK && HEADERS_INSTALL config SAMPLE_SECCOMP bool "Build seccomp sample code" diff --git a/samples/pidfd/Makefile b/samples/pidfd/Makefile index ee2979849d92..6e5b67e648c2 100644 --- a/samples/pidfd/Makefile +++ b/samples/pidfd/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -hostprogs := pidfd-metadata -always-y := $(hostprogs) -HOSTCFLAGS_pidfd-metadata.o += -I$(objtree)/usr/include -all: pidfd-metadata +userprogs := pidfd-metadata +always-y := $(userprogs) + +userccflags += -I usr/include From c4c10996b1940e197b45f827b99f40a4b82daebb Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:24 +0900 Subject: [PATCH 134/427] samples: mei: build sample program for target architecture This userspace program includes UAPI headers exported to usr/include/. 'make headers' always works for the target architecture (i.e. the same architecture as the kernel), so the sample program should be built for the target as well. Kbuild now supports 'userprogs' for that. I also guarded the CONFIG option by 'depends on CC_CAN_LINK' because $(CC) may not provide libc.
Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- samples/Kconfig | 1 + samples/mei/Makefile | 9 +++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index c68d391c0602..69699db49675 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -193,6 +193,7 @@ config SAMPLE_VFS config SAMPLE_INTEL_MEI bool "Build example program working with intel mei driver" depends on INTEL_MEI + depends on CC_CAN_LINK && HEADERS_INSTALL help Build a sample program to work with mei device. diff --git a/samples/mei/Makefile b/samples/mei/Makefile index f5b9d02be2cd..329411f82369 100644 --- a/samples/mei/Makefile +++ b/samples/mei/Makefile @@ -1,10 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright (c) 2012-2019, Intel Corporation. All rights reserved. -hostprogs := mei-amt-version +userprogs := mei-amt-version +always-y := $(userprogs) -HOSTCFLAGS_mei-amt-version.o += -I$(objtree)/usr/include - -always-y := $(hostprogs) - -all: mei-amt-version +userccflags += -I usr/include From 87ffbba9a9a1a74da44917a207c7e57dac98f2f8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:25 +0900 Subject: [PATCH 135/427] samples: auxdisplay: use 'userprogs' syntax Kbuild now supports the 'userprogs' syntax to compile userspace programs for the same architecture as the kernel. Add the entry to samples/Makefile to put this into the build bot coverage. I also added the CONFIG option guarded by 'depends on CC_CAN_LINK' because $(CC) may not provide libc. 
Signed-off-by: Masahiro Yamada Acked-by: Miguel Ojeda Acked-by: Sam Ravnborg --- samples/Kconfig | 4 ++++ samples/Makefile | 1 + samples/auxdisplay/Makefile | 11 ++--------- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 69699db49675..2322e11e8b80 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -6,6 +6,10 @@ menuconfig SAMPLES if SAMPLES +config SAMPLE_AUXDISPLAY + bool "auxdisplay sample" + depends on CC_CAN_LINK + config SAMPLE_TRACE_EVENTS tristate "Build trace_events examples -- loadable modules only" depends on EVENT_TRACING && m diff --git a/samples/Makefile b/samples/Makefile index bdc168405452..0c43b5d34b15 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for Linux samples code +subdir-$(CONFIG_SAMPLE_AUXDISPLAY) += auxdisplay obj-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs/ obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/ obj-$(CONFIG_SAMPLE_CONNECTOR) += connector/ diff --git a/samples/auxdisplay/Makefile b/samples/auxdisplay/Makefile index 0273bab27233..dbdf939af94a 100644 --- a/samples/auxdisplay/Makefile +++ b/samples/auxdisplay/Makefile @@ -1,10 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -CC := $(CROSS_COMPILE)gcc -CFLAGS := -I../../usr/include - -PROGS := cfag12864b-example - -all: $(PROGS) - -clean: - rm -fr $(PROGS) +userprogs := cfag12864b-example +always-y := $(userprogs) From b98ccc7150c7bfce60671ad1928d5f1220446b39 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:26 +0900 Subject: [PATCH 136/427] samples: timers: use 'userprogs' syntax Kbuild now supports the 'userprogs' syntax to compile userspace programs for the same architecture as the kernel. Add the entry to samples/Makefile to put this into the build bot coverage. I also added the CONFIG option guarded by 'depends on CC_CAN_LINK' because $(CC) may not provide libc. 
Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- samples/Kconfig | 4 ++++ samples/Makefile | 1 + samples/timers/Makefile | 17 +++-------------- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 2322e11e8b80..a8629a0d4f96 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -135,6 +135,10 @@ config SAMPLE_SECCOMP Build samples of seccomp filters using various methods of BPF filter construction. +config SAMPLE_TIMER + bool "Timer sample" + depends on CC_CAN_LINK && HEADERS_INSTALL + config SAMPLE_UHID bool "UHID sample" depends on CC_CAN_LINK && HEADERS_INSTALL diff --git a/samples/Makefile b/samples/Makefile index 0c43b5d34b15..042208326689 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -16,6 +16,7 @@ subdir-$(CONFIG_SAMPLE_PIDFD) += pidfd obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi/ obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg/ subdir-$(CONFIG_SAMPLE_SECCOMP) += seccomp +subdir-$(CONFIG_SAMPLE_TIMER) += timers obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/ obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/ diff --git a/samples/timers/Makefile b/samples/timers/Makefile index f9fa07460802..15c7ddbc8c51 100644 --- a/samples/timers/Makefile +++ b/samples/timers/Makefile @@ -1,16 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ifndef CROSS_COMPILE -uname_M := $(shell uname -m 2>/dev/null || echo not) -ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) +userprogs := hpet_example +always-y := $(userprogs) -ifeq ($(ARCH),x86) -CC := $(CROSS_COMPILE)gcc -PROGS := hpet_example - -all: $(PROGS) - -clean: - rm -fr $(PROGS) - -endif -endif +userccflags += -I usr/include From 88a8e278ff0b6b461bf39d4ace17384e976a3f3f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Apr 2020 12:45:27 +0900 Subject: [PATCH 137/427] samples: watchdog: use 'userprogs' syntax Kbuild now supports the 'userprogs' syntax to compile userspace programs for the same 
architecture as the kernel. Add the entry to samples/Makefile to put this into the build bot coverage. I also added the CONFIG option guarded by 'depends on CC_CAN_LINK' because $(CC) may not provide libc. Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- samples/Kconfig | 3 +++ samples/Makefile | 1 + samples/watchdog/Makefile | 10 ++-------- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index a8629a0d4f96..5005f74ac0eb 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -205,5 +205,8 @@ config SAMPLE_INTEL_MEI help Build a sample program to work with mei device. +config SAMPLE_WATCHDOG + bool "watchdog sample" + depends on CC_CAN_LINK endif # SAMPLES diff --git a/samples/Makefile b/samples/Makefile index 042208326689..29c66aadd954 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -26,3 +26,4 @@ obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/ obj-y += vfio-mdev/ subdir-$(CONFIG_SAMPLE_VFS) += vfs obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/ +subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog diff --git a/samples/watchdog/Makefile b/samples/watchdog/Makefile index a9430fa60253..17384cfb387e 100644 --- a/samples/watchdog/Makefile +++ b/samples/watchdog/Makefile @@ -1,9 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -CC := $(CROSS_COMPILE)gcc -PROGS := watchdog-simple - -all: $(PROGS) - -clean: - rm -fr $(PROGS) - +userprogs := watchdog-simple +always-y := $(userprogs) From 6964494582f56a3882c2c53b0edbfe99eb32b2e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 30 Apr 2020 10:06:14 +0200 Subject: [PATCH 138/427] PCI: aardvark: Train link immediately after enabling training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding even 100ms (PCI_PM_D3COLD_WAIT) delay between enabling link training and starting link training causes detection issues with some buggy cards (such as Compex WLE900VX). 
Move the code which enables link training immediately before the one which starts link training. This fixes detection issues of Compex WLE900VX card on Turris MOX after cold boot. Link: https://lore.kernel.org/r/20200430080625.26070-2-pali@kernel.org Fixes: f4c7d053d7f7 ("PCI: aardvark: Wait for endpoint to be ready...") Tested-by: Tomasz Maciej Nowak Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring Acked-by: Thomas Petazzoni --- drivers/pci/controller/pci-aardvark.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 2a20b649f40c..f9955b494267 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -300,11 +300,6 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg |= LANE_COUNT_1; advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* Enable link training */ - reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); - reg |= LINK_TRAINING_EN; - advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* Enable MSI */ reg = advk_readl(pcie, PCIE_CORE_CTRL2_REG); reg |= PCIE_CORE_CTRL2_MSI_ENABLE; @@ -346,7 +341,15 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) */ msleep(PCI_PM_D3COLD_WAIT); - /* Start link training */ + /* Enable link training */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg |= LINK_TRAINING_EN; + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* + * Start link training immediately after enabling it. + * This solves problems for some buggy cards.
+ */ reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); reg |= PCIE_CORE_LINK_TRAINING; advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG); From 90c6cb4a355e7befcb557d217d1d8b8bd5875a05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 30 Apr 2020 10:06:15 +0200 Subject: [PATCH 139/427] PCI: aardvark: Don't blindly enable ASPM L0s and don't write to read-only register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trying to change Link Status register does not have any effect as this is a read-only register. Trying to overwrite bits for Negotiated Link Width does not make sense. In future proper change of link width can be done via Lane Count Select bits in PCIe Control 0 register. Trying to unconditionally enable ASPM L0s via ASPM Control bits in Link Control register is wrong. There should be at least some detection if endpoint supports L0s, as it isn't mandatory. Moreover ASPM Control bits in Link Control register are controlled by pcie/aspm.c code which sets it according to system ASPM settings, immediately after aardvark driver probes. So setting these bits by aardvark driver has no long running effect. Remove code which touches ASPM L0s bits from this driver and let the kernel's ASPM implementation set the ASPM state properly. Some users are reporting issues that this code is problematic for some Intel wifi cards and removing it fixes them, see e.g.: https://bugzilla.kernel.org/show_bug.cgi?id=196339 If problems with Intel wifi cards occur even after this commit, then pcie/aspm.c code could be modified / hooked to not enable ASPM L0s state for affected problematic cards.
Link: https://lore.kernel.org/r/20200430080625.26070-3-pali@kernel.org Tested-by: Tomasz Maciej Nowak Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring Acked-by: Thomas Petazzoni --- drivers/pci/controller/pci-aardvark.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index f9955b494267..74b90940a9d4 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -356,10 +356,6 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) advk_pcie_wait_for_link(pcie); - reg = PCIE_CORE_LINK_L0S_ENTRY | - (1 << PCIE_CORE_LINK_WIDTH_SHIFT); - advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG); - reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); reg |= PCIE_CORE_CMD_MEM_ACCESS_EN | PCIE_CORE_CMD_IO_ACCESS_EN | From 2dd9072e8fb0af4af47c912244f6c16fc57d4fbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 30 Apr 2020 10:06:16 +0200 Subject: [PATCH 140/427] PCI: of: Zero max-link-speed value is invalid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interpret zero value of max-link-speed property as invalid, as the device tree bindings documentation specifies. 
Link: https://lore.kernel.org/r/20200430080625.26070-4-pali@kernel.org Tested-by: Tomasz Maciej Nowak Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring Acked-by: Thomas Petazzoni --- drivers/pci/of.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/of.c b/drivers/pci/of.c index 81ceeaa6f1d5..27839cd2459f 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -592,7 +592,7 @@ int of_pci_get_max_link_speed(struct device_node *node) u32 max_link_speed; if (of_property_read_u32(node, "max-link-speed", &max_link_speed) || - max_link_speed > 4) + max_link_speed == 0 || max_link_speed > 4) return -EINVAL; return max_link_speed; From 43fc679ced18006b12d918d7a8a4af392b7fbfe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 30 Apr 2020 10:06:17 +0200 Subject: [PATCH 141/427] PCI: aardvark: Improve link training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the aardvark driver trains link in PCIe gen2 mode. This may cause some buggy gen1 cards (such as Compex WLE900VX) to be unstable or even not detected. Moreover when ASPM code tries to retrain link second time, these cards may stop responding and link goes down. If gen1 is used this does not happen. Unconditionally forcing gen1 is not a good solution since it may have performance impact on gen2 cards. To overcome this, read 'max-link-speed' property (as defined in PCI device tree bindings) and use this as max gen mode. Then iteratively try link training at this mode or lower until successful. After successful link training choose final controller gen based on Negotiated Link Speed from Link Status register, which should match card speed. 
Link: https://lore.kernel.org/r/20200430080625.26070-5-pali@kernel.org Tested-by: Tomasz Maciej Nowak Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Thomas Petazzoni --- drivers/pci/controller/pci-aardvark.c | 114 ++++++++++++++++++++------ 1 file changed, 89 insertions(+), 25 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 74b90940a9d4..e202f954eb84 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -40,6 +40,7 @@ #define PCIE_CORE_LINK_CTRL_STAT_REG 0xd0 #define PCIE_CORE_LINK_L0S_ENTRY BIT(0) #define PCIE_CORE_LINK_TRAINING BIT(5) +#define PCIE_CORE_LINK_SPEED_SHIFT 16 #define PCIE_CORE_LINK_WIDTH_SHIFT 20 #define PCIE_CORE_ERR_CAPCTL_REG 0x118 #define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX BIT(5) @@ -201,6 +202,7 @@ struct advk_pcie { struct mutex msi_used_lock; u16 msi_msg; int root_bus_nr; + int link_gen; struct pci_bridge_emul bridge; }; @@ -225,20 +227,16 @@ static int advk_pcie_link_up(struct advk_pcie *pcie) static int advk_pcie_wait_for_link(struct advk_pcie *pcie) { - struct device *dev = &pcie->pdev->dev; int retries; /* check if the link is up or not */ for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) { - if (advk_pcie_link_up(pcie)) { - dev_info(dev, "link up\n"); + if (advk_pcie_link_up(pcie)) return 0; - } usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX); } - dev_err(dev, "link never came up\n"); return -ETIMEDOUT; } @@ -253,6 +251,85 @@ static void advk_pcie_wait_for_retrain(struct advk_pcie *pcie) } } +static int advk_pcie_train_at_gen(struct advk_pcie *pcie, int gen) +{ + int ret, neg_gen; + u32 reg; + + /* Setup link speed */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg &= ~PCIE_GEN_SEL_MSK; + if (gen == 3) + reg |= SPEED_GEN_3; + else if (gen == 2) + reg |= SPEED_GEN_2; + else + reg |= SPEED_GEN_1; + advk_writel(pcie, reg, 
PCIE_CORE_CTRL0_REG); + + /* + * Enable link training. This is not needed in every call to this + * function, just once suffices, but it does not break anything either. + */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg |= LINK_TRAINING_EN; + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* + * Start link training immediately after enabling it. + * This solves problems for some buggy cards. + */ + reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); + reg |= PCIE_CORE_LINK_TRAINING; + advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG); + + ret = advk_pcie_wait_for_link(pcie); + if (ret) + return ret; + + reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); + neg_gen = (reg >> PCIE_CORE_LINK_SPEED_SHIFT) & 0xf; + + return neg_gen; +} + +static void advk_pcie_train_link(struct advk_pcie *pcie) +{ + struct device *dev = &pcie->pdev->dev; + int neg_gen = -1, gen; + + /* + * Try link training at link gen specified by device tree property + * 'max-link-speed'. If this fails, iteratively train at lower gen. + */ + for (gen = pcie->link_gen; gen > 0; --gen) { + neg_gen = advk_pcie_train_at_gen(pcie, gen); + if (neg_gen > 0) + break; + } + + if (neg_gen < 0) + goto err; + + /* + * After successful training if negotiated gen is lower than requested, + * train again on negotiated gen. This solves some stability issues for + * some buggy gen1 cards. 
+ */ + if (neg_gen < gen) { + gen = neg_gen; + neg_gen = advk_pcie_train_at_gen(pcie, gen); + } + + if (neg_gen == gen) { + dev_info(dev, "link up at gen %i\n", gen); + return; + } + +err: + dev_err(dev, "link never came up\n"); +} + static void advk_pcie_setup_hw(struct advk_pcie *pcie) { u32 reg; @@ -288,12 +365,6 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) PCIE_CORE_CTRL2_TD_ENABLE; advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG); - /* Set GEN2 */ - reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); - reg &= ~PCIE_GEN_SEL_MSK; - reg |= SPEED_GEN_2; - advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* Set lane X1 */ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); reg &= ~LANE_CNT_MSK; @@ -341,20 +412,7 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) */ msleep(PCI_PM_D3COLD_WAIT); - /* Enable link training */ - reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); - reg |= LINK_TRAINING_EN; - advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - - /* - * Start link training immediately after enabling it. - * This solves problems for some buggy cards. 
- */ - reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); - reg |= PCIE_CORE_LINK_TRAINING; - advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG); - - advk_pcie_wait_for_link(pcie); + advk_pcie_train_link(pcie); reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); reg |= PCIE_CORE_CMD_MEM_ACCESS_EN | @@ -988,6 +1046,12 @@ static int advk_pcie_probe(struct platform_device *pdev) } pcie->root_bus_nr = bus->start; + ret = of_pci_get_max_link_speed(dev->of_node); + if (ret <= 0 || ret > 3) + pcie->link_gen = 3; + else + pcie->link_gen = ret; + advk_pcie_setup_hw(pcie); advk_sw_pci_bridge_init(pcie); From 5169a9851daaa2782a7bd2bb83d5b1bd224b2879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 30 Apr 2020 10:06:18 +0200 Subject: [PATCH 142/427] PCI: aardvark: Issue PERST via GPIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for issuing PERST via GPIO specified in 'reset-gpios' property (as described in PCI device tree bindings). Some buggy cards (e.g. Compex WLE900VX or WLE1216) are not detected after reboot when PERST is not issued during driver initialization. If bootloader already enabled link training then issuing PERST has no effect for some buggy cards (e.g. Compex WLE900VX) and these cards are not detected. We therefore clear the LINK_TRAINING_EN register before. It was observed that Compex WLE900VX card needs to be in PERST reset for at least 10ms if bootloader enabled link training. Tested on Turris MOX. 
Link: https://lore.kernel.org/r/20200430080625.26070-6-pali@kernel.org Tested-by: Tomasz Maciej Nowak Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Thomas Petazzoni --- drivers/pci/controller/pci-aardvark.c | 43 ++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index e202f954eb84..2ecc79c03ade 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include "../pci.h" @@ -204,6 +206,7 @@ struct advk_pcie { int root_bus_nr; int link_gen; struct pci_bridge_emul bridge; + struct gpio_desc *reset_gpio; }; static inline void advk_writel(struct advk_pcie *pcie, u32 val, u64 reg) @@ -330,10 +333,31 @@ err: dev_err(dev, "link never came up\n"); } +static void advk_pcie_issue_perst(struct advk_pcie *pcie) +{ + u32 reg; + + if (!pcie->reset_gpio) + return; + + /* PERST does not work for some cards when link training is enabled */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg &= ~LINK_TRAINING_EN; + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* 10ms delay is needed for some cards */ + dev_info(&pcie->pdev->dev, "issuing PERST via reset GPIO for 10ms\n"); + gpiod_set_value_cansleep(pcie->reset_gpio, 1); + usleep_range(10000, 11000); + gpiod_set_value_cansleep(pcie->reset_gpio, 0); +} + static void advk_pcie_setup_hw(struct advk_pcie *pcie) { u32 reg; + advk_pcie_issue_perst(pcie); + /* Set to Direct mode */ reg = advk_readl(pcie, CTRL_CONFIG_REG); reg &= ~(CTRL_MODE_MASK << CTRL_MODE_SHIFT); @@ -406,7 +430,8 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) /* * PERST# signal could have been asserted by pinctrl subsystem before - * probe() callback has been called, making the endpoint going into + * probe() callback has been called or issued explicitly 
by reset gpio + * function advk_pcie_issue_perst(), making the endpoint going into * fundamental reset. As required by PCI Express spec a delay for at * least 100ms after such a reset before link training is needed. */ @@ -1046,6 +1071,22 @@ static int advk_pcie_probe(struct platform_device *pdev) } pcie->root_bus_nr = bus->start; + pcie->reset_gpio = devm_gpiod_get_from_of_node(dev, dev->of_node, + "reset-gpios", 0, + GPIOD_OUT_LOW, + "pcie1-reset"); + ret = PTR_ERR_OR_ZERO(pcie->reset_gpio); + if (ret) { + if (ret == -ENOENT) { + pcie->reset_gpio = NULL; + } else { + if (ret != -EPROBE_DEFER) + dev_err(dev, "Failed to get reset-gpio: %i\n", + ret); + return ret; + } + } + ret = of_pci_get_max_link_speed(dev->of_node); if (ret <= 0 || ret > 3) pcie->link_gen = 3; From b2a56469d550cdcbbaeacba86fdf8bcf6af4d084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 30 Apr 2020 10:06:19 +0200 Subject: [PATCH 143/427] PCI: aardvark: Add FIXME comment for PCIE_CORE_CMD_STATUS_REG access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This register is applicable only when the controller is configured for Endpoint mode, which is not the case for the current version of this driver. Attempting to remove this code though caused some ath10k cards to stop working, so for some unknown reason it is needed here. This should be investigated and a comment explaining this should be put before the code, so we add a FIXME comment for now. 
Link: https://lore.kernel.org/r/20200430080625.26070-7-pali@kernel.org Tested-by: Tomasz Maciej Nowak Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring Acked-by: Thomas Petazzoni --- drivers/pci/controller/pci-aardvark.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 2ecc79c03ade..8332c71d69fa 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -439,6 +439,13 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) advk_pcie_train_link(pcie); + /* + * FIXME: The following register update is suspicious. This register is + * applicable only when the PCI controller is configured for Endpoint + * mode, not as a Root Complex. But apparently when this code is + * removed, some cards stop working. This should be investigated and + * a comment explaining this should be put here. + */ reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); reg |= PCIE_CORE_CMD_MEM_ACCESS_EN | PCIE_CORE_CMD_IO_ACCESS_EN | From 366697018c9a2aa67d457bfdc495115cface6ae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 30 Apr 2020 10:06:20 +0200 Subject: [PATCH 144/427] PCI: aardvark: Add PHY support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With recent proposed changes for U-Boot it is possible that bootloader won't initialize the PHY for this controller (currently the PHY is initialized regardless whether PCI is used in U-Boot, but with these proposed changes the PHY is initialized only on request). Since the mvebu-a3700-comphy driver by Miquèl Raynal supports enabling PCIe PHY, and since Linux' functionality should be independent on what bootloader did, add code for enabling generic PHY if found in device OF node. The mvebu-a3700-comphy driver does PHY powering via SMC calls to ARM Trusted Firmware. 
The corresponding code in ARM Trusted Firmware skips one register write which U-Boot does not: step 7 ("Enable TX"), see [1]. Instead ARM Trusted Firmware expects PCIe driver to do this step, probably because the register is in PCIe controller address space, instead of PHY address space. We therefore add this step into the advk_pcie_setup_hw function. [1] https://git.trustedfirmware.org/TF-A/trusted-firmware-a.git/tree/drivers/marvell/comphy/phy-comphy-3700.c?h=v2.3-rc2#n836 Link: https://lore.kernel.org/r/20200430080625.26070-8-pali@kernel.org Tested-by: Tomasz Maciej Nowak Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Thomas Petazzoni Cc: Miquèl Raynal --- drivers/pci/controller/pci-aardvark.c | 69 +++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 8332c71d69fa..053ae6c19a3d 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,8 @@ #define PCIE_CORE_CTRL2_STRICT_ORDER_ENABLE BIT(5) #define PCIE_CORE_CTRL2_OB_WIN_ENABLE BIT(6) #define PCIE_CORE_CTRL2_MSI_ENABLE BIT(10) +#define PCIE_CORE_REF_CLK_REG (CONTROL_BASE_ADDR + 0x14) +#define PCIE_CORE_REF_CLK_TX_ENABLE BIT(1) #define PCIE_MSG_LOG_REG (CONTROL_BASE_ADDR + 0x30) #define PCIE_ISR0_REG (CONTROL_BASE_ADDR + 0x40) #define PCIE_MSG_PM_PME_MASK BIT(7) @@ -207,6 +210,7 @@ struct advk_pcie { int link_gen; struct pci_bridge_emul bridge; struct gpio_desc *reset_gpio; + struct phy *phy; }; static inline void advk_writel(struct advk_pcie *pcie, u32 val, u64 reg) @@ -358,6 +362,11 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) advk_pcie_issue_perst(pcie); + /* Enable TX */ + reg = advk_readl(pcie, PCIE_CORE_REF_CLK_REG); + reg |= PCIE_CORE_REF_CLK_TX_ENABLE; + advk_writel(pcie, reg, PCIE_CORE_REF_CLK_REG); + /* Set to 
Direct mode */ reg = advk_readl(pcie, CTRL_CONFIG_REG); reg &= ~(CTRL_MODE_MASK << CTRL_MODE_SHIFT); @@ -1041,6 +1050,62 @@ static irqreturn_t advk_pcie_irq_handler(int irq, void *arg) return IRQ_HANDLED; } +static void __maybe_unused advk_pcie_disable_phy(struct advk_pcie *pcie) +{ + phy_power_off(pcie->phy); + phy_exit(pcie->phy); +} + +static int advk_pcie_enable_phy(struct advk_pcie *pcie) +{ + int ret; + + if (!pcie->phy) + return 0; + + ret = phy_init(pcie->phy); + if (ret) + return ret; + + ret = phy_set_mode(pcie->phy, PHY_MODE_PCIE); + if (ret) { + phy_exit(pcie->phy); + return ret; + } + + ret = phy_power_on(pcie->phy); + if (ret) { + phy_exit(pcie->phy); + return ret; + } + + return 0; +} + +static int advk_pcie_setup_phy(struct advk_pcie *pcie) +{ + struct device *dev = &pcie->pdev->dev; + struct device_node *node = dev->of_node; + int ret = 0; + + pcie->phy = devm_of_phy_get(dev, node, NULL); + if (IS_ERR(pcie->phy) && (PTR_ERR(pcie->phy) == -EPROBE_DEFER)) + return PTR_ERR(pcie->phy); + + /* Old bindings miss the PHY handle */ + if (IS_ERR(pcie->phy)) { + dev_warn(dev, "PHY unavailable (%ld)\n", PTR_ERR(pcie->phy)); + pcie->phy = NULL; + return 0; + } + + ret = advk_pcie_enable_phy(pcie); + if (ret) + dev_err(dev, "Failed to initialize PHY (%d)\n", ret); + + return ret; +} + static int advk_pcie_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -1100,6 +1165,10 @@ static int advk_pcie_probe(struct platform_device *pdev) else pcie->link_gen = ret; + ret = advk_pcie_setup_phy(pcie); + if (ret) + return ret; + advk_pcie_setup_hw(pcie); advk_sw_pci_bridge_init(pcie); From 96be36dbffacea0aa9e6ec4839583e79faa141a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 30 Apr 2020 10:06:21 +0200 Subject: [PATCH 145/427] PCI: aardvark: Replace custom macros by standard linux/pci_regs.h macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI-E capability macros are already 
defined in linux/pci_regs.h. Remove their reimplementation in pcie-aardvark. Link: https://lore.kernel.org/r/20200430080625.26070-9-pali@kernel.org Tested-by: Tomasz Maciej Nowak Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Thomas Petazzoni --- drivers/pci/controller/pci-aardvark.c | 41 ++++++++++++--------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 053ae6c19a3d..c53ae2511a9c 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -34,17 +34,6 @@ #define PCIE_CORE_CMD_MEM_IO_REQ_EN BIT(2) #define PCIE_CORE_DEV_REV_REG 0x8 #define PCIE_CORE_PCIEXP_CAP 0xc0 -#define PCIE_CORE_DEV_CTRL_STATS_REG 0xc8 -#define PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE (0 << 4) -#define PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT 5 -#define PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE (0 << 11) -#define PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT 12 -#define PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SZ 0x2 -#define PCIE_CORE_LINK_CTRL_STAT_REG 0xd0 -#define PCIE_CORE_LINK_L0S_ENTRY BIT(0) -#define PCIE_CORE_LINK_TRAINING BIT(5) -#define PCIE_CORE_LINK_SPEED_SHIFT 16 -#define PCIE_CORE_LINK_WIDTH_SHIFT 20 #define PCIE_CORE_ERR_CAPCTL_REG 0x118 #define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX BIT(5) #define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN BIT(6) @@ -223,6 +212,11 @@ static inline u32 advk_readl(struct advk_pcie *pcie, u64 reg) return readl(pcie->base + reg); } +static inline u16 advk_read16(struct advk_pcie *pcie, u64 reg) +{ + return advk_readl(pcie, (reg & ~0x3)) >> ((reg & 0x3) * 8); +} + static int advk_pcie_link_up(struct advk_pcie *pcie) { u32 val, ltssm_state; @@ -286,16 +280,16 @@ static int advk_pcie_train_at_gen(struct advk_pcie *pcie, int gen) * Start link training immediately after enabling it. * This solves problems for some buggy cards. 
*/ - reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); - reg |= PCIE_CORE_LINK_TRAINING; - advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG); + reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL); + reg |= PCI_EXP_LNKCTL_RL; + advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL); ret = advk_pcie_wait_for_link(pcie); if (ret) return ret; - reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); - neg_gen = (reg >> PCIE_CORE_LINK_SPEED_SHIFT) & 0xf; + reg = advk_read16(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKSTA); + neg_gen = reg & PCI_EXP_LNKSTA_CLS; return neg_gen; } @@ -385,13 +379,14 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) PCIE_CORE_ERR_CAPCTL_ECRC_CHCK_RCV; advk_writel(pcie, reg, PCIE_CORE_ERR_CAPCTL_REG); - /* Set PCIe Device Control and Status 1 PF0 register */ - reg = PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE | - (7 << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) | - PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE | - (PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SZ << - PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT); - advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG); + /* Set PCIe Device Control register */ + reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL); + reg &= ~PCI_EXP_DEVCTL_RELAX_EN; + reg &= ~PCI_EXP_DEVCTL_NOSNOOP_EN; + reg &= ~PCI_EXP_DEVCTL_READRQ; + reg |= PCI_EXP_DEVCTL_PAYLOAD; /* Set max payload size */ + reg |= PCI_EXP_DEVCTL_READRQ_512B; + advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL); /* Program PCIe Control 2 to disable strict ordering */ reg = PCIE_CORE_CTRL2_RESERVED | From e89897c9dec7f859a93b8364709851c3a7418ac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 30 Apr 2020 10:06:22 +0200 Subject: [PATCH 146/427] dt-bindings: PCI: aardvark: Describe new properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the possibility to reference a PHY and reset-gpios and to set max-link-speed property. 
Link: https://lore.kernel.org/r/20200430080625.26070-10-pali@kernel.org Tested-by: Tomasz Maciej Nowak Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Thomas Petazzoni Cc: Rob Herring Cc: devicetree@vger.kernel.org --- Documentation/devicetree/bindings/pci/aardvark-pci.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/aardvark-pci.txt b/Documentation/devicetree/bindings/pci/aardvark-pci.txt index 310ef7145c47..2b8ca920a7fa 100644 --- a/Documentation/devicetree/bindings/pci/aardvark-pci.txt +++ b/Documentation/devicetree/bindings/pci/aardvark-pci.txt @@ -19,6 +19,9 @@ contain the following properties: - interrupt-map-mask and interrupt-map: standard PCI properties to define the mapping of the PCIe interface to interrupt numbers. - bus-range: PCI bus numbers covered + - phys: the PCIe PHY handle + - max-link-speed: see pci.txt + - reset-gpios: see pci.txt In addition, the Device Tree describing an Aardvark PCIe controller must include a sub-node that describes the legacy interrupt controller @@ -48,6 +51,7 @@ Example: <0 0 0 2 &pcie_intc 1>, <0 0 0 3 &pcie_intc 2>, <0 0 0 4 &pcie_intc 3>; + phys = <&comphy1 0>; pcie_intc: interrupt-controller { interrupt-controller; #interrupt-cells = <1>; From fb5f8f3ca5f853568a1872c9aeb432e1743ebd18 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Fri, 8 May 2020 18:36:43 +0530 Subject: [PATCH 147/427] dt-bindings: PCI: cadence: Deprecate inbound/outbound specific bindings Deprecate cdns,max-outbound-regions and cdns,no-bar-match-nbits for host mode as both these could be derived from "ranges" and "dma-ranges" property. "cdns,max-outbound-regions" property would still be required for EP mode. 
Link: https://lore.kernel.org/r/20200508130646.23939-2-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Tom Joseph --- .../bindings/pci/cdns,cdns-pcie-ep.yaml | 2 +- .../bindings/pci/cdns,cdns-pcie-host.yaml | 3 +-- .../devicetree/bindings/pci/cdns-pcie-ep.yaml | 25 +++++++++++++++++++ .../bindings/pci/cdns-pcie-host.yaml | 10 ++++++++ .../devicetree/bindings/pci/cdns-pcie.yaml | 8 ------ 5 files changed, 37 insertions(+), 11 deletions(-) create mode 100644 Documentation/devicetree/bindings/pci/cdns-pcie-ep.yaml diff --git a/Documentation/devicetree/bindings/pci/cdns,cdns-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/cdns,cdns-pcie-ep.yaml index 2996f8d4777c..50ce5d79d2c7 100644 --- a/Documentation/devicetree/bindings/pci/cdns,cdns-pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/cdns,cdns-pcie-ep.yaml @@ -10,7 +10,7 @@ maintainers: - Tom Joseph allOf: - - $ref: "cdns-pcie.yaml#" + - $ref: "cdns-pcie-ep.yaml#" - $ref: "pci-ep.yaml#" properties: diff --git a/Documentation/devicetree/bindings/pci/cdns,cdns-pcie-host.yaml b/Documentation/devicetree/bindings/pci/cdns,cdns-pcie-host.yaml index cabbe46ff578..84a8f095d031 100644 --- a/Documentation/devicetree/bindings/pci/cdns,cdns-pcie-host.yaml +++ b/Documentation/devicetree/bindings/pci/cdns,cdns-pcie-host.yaml @@ -45,8 +45,6 @@ examples: #size-cells = <2>; bus-range = <0x0 0xff>; linux,pci-domain = <0>; - cdns,max-outbound-regions = <16>; - cdns,no-bar-match-nbits = <32>; vendor-id = <0x17cd>; device-id = <0x0200>; @@ -57,6 +55,7 @@ examples: ranges = <0x02000000 0x0 0x42000000 0x0 0x42000000 0x0 0x1000000>, <0x01000000 0x0 0x43000000 0x0 0x43000000 0x0 0x0010000>; + dma-ranges = <0x02000000 0x0 0x0 0x0 0x0 0x1 0x00000000>; #interrupt-cells = <0x1>; diff --git a/Documentation/devicetree/bindings/pci/cdns-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/cdns-pcie-ep.yaml new file mode 100644 index 000000000000..6150a7a7bdbf --- 
/dev/null +++ b/Documentation/devicetree/bindings/pci/cdns-pcie-ep.yaml @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/pci/cdns-pcie-ep.yaml#" +$schema: "http://devicetree.org/meta-schemas/core.yaml#" + +title: Cadence PCIe Device + +maintainers: + - Tom Joseph + +allOf: + - $ref: "cdns-pcie.yaml#" + +properties: + cdns,max-outbound-regions: + description: maximum number of outbound regions + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 1 + maximum: 32 + default: 32 + +required: + - cdns,max-outbound-regions diff --git a/Documentation/devicetree/bindings/pci/cdns-pcie-host.yaml b/Documentation/devicetree/bindings/pci/cdns-pcie-host.yaml index ab6e43b636ec..3d64f85aeb39 100644 --- a/Documentation/devicetree/bindings/pci/cdns-pcie-host.yaml +++ b/Documentation/devicetree/bindings/pci/cdns-pcie-host.yaml @@ -14,6 +14,15 @@ allOf: - $ref: "cdns-pcie.yaml#" properties: + cdns,max-outbound-regions: + description: maximum number of outbound regions + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 1 + maximum: 32 + default: 32 + deprecated: true + cdns,no-bar-match-nbits: description: Set into the no BAR match register to configure the number of least @@ -23,5 +32,6 @@ properties: minimum: 0 maximum: 64 default: 32 + deprecated: true msi-parent: true diff --git a/Documentation/devicetree/bindings/pci/cdns-pcie.yaml b/Documentation/devicetree/bindings/pci/cdns-pcie.yaml index 6887ccc339cc..02553d5e6c51 100644 --- a/Documentation/devicetree/bindings/pci/cdns-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/cdns-pcie.yaml @@ -10,14 +10,6 @@ maintainers: - Tom Joseph properties: - cdns,max-outbound-regions: - description: maximum number of outbound regions - allOf: - - $ref: /schemas/types.yaml#/definitions/uint32 - minimum: 1 - maximum: 32 - default: 32 - phys: description: One per lane if more than one in the list. 
If only one PHY listed it must From 9e2618c3f1a9499a921131a913b25d1347f16261 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Fri, 8 May 2020 18:36:44 +0530 Subject: [PATCH 148/427] PCI: cadence: Remove "cdns,max-outbound-regions" DT property "cdns,max-outbound-regions" device tree property provides the maximum number of outbound regions supported by the Host PCIe controller. However the outbound regions are configured based on what is populated in the "ranges" DT property. Avoid using two properties for configuring outbound regions and use only "ranges" property instead. Link: https://lore.kernel.org/r/20200508130646.23939-3-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring Acked-by: Tom Joseph --- drivers/pci/controller/cadence/pcie-cadence-host.c | 6 ------ drivers/pci/controller/cadence/pcie-cadence.h | 2 -- 2 files changed, 8 deletions(-) diff --git a/drivers/pci/controller/cadence/pcie-cadence-host.c b/drivers/pci/controller/cadence/pcie-cadence-host.c index 9b1c3966414b..e5e9a3293579 100644 --- a/drivers/pci/controller/cadence/pcie-cadence-host.c +++ b/drivers/pci/controller/cadence/pcie-cadence-host.c @@ -140,9 +140,6 @@ static int cdns_pcie_host_init_address_translation(struct cdns_pcie_rc *rc) for_each_of_pci_range(&parser, &range) { bool is_io; - if (r >= rc->max_regions) - break; - if ((range.flags & IORESOURCE_TYPE_BITS) == IORESOURCE_MEM) is_io = false; else if ((range.flags & IORESOURCE_TYPE_BITS) == IORESOURCE_IO) @@ -219,9 +216,6 @@ int cdns_pcie_host_setup(struct cdns_pcie_rc *rc) pcie = &rc->pcie; pcie->is_rc = true; - rc->max_regions = 32; - of_property_read_u32(np, "cdns,max-outbound-regions", &rc->max_regions); - rc->no_bar_nbits = 32; of_property_read_u32(np, "cdns,no-bar-match-nbits", &rc->no_bar_nbits); diff --git a/drivers/pci/controller/cadence/pcie-cadence.h b/drivers/pci/controller/cadence/pcie-cadence.h index a2b28b912ca4..6bd89a21bb1c 100644 --- 
a/drivers/pci/controller/cadence/pcie-cadence.h +++ b/drivers/pci/controller/cadence/pcie-cadence.h @@ -251,7 +251,6 @@ struct cdns_pcie { * @bus_range: first/last buses behind the PCIe host controller * @cfg_base: IO mapped window to access the PCI configuration space of a * single function at a time - * @max_regions: maximum number of regions supported by the hardware * @no_bar_nbits: Number of bits to keep for inbound (PCIe -> CPU) address * translation (nbits sets into the "no BAR match" register) * @vendor_id: PCI vendor ID @@ -262,7 +261,6 @@ struct cdns_pcie_rc { struct resource *cfg_res; struct resource *bus_range; void __iomem *cfg_base; - u32 max_regions; u32 no_bar_nbits; u16 vendor_id; u16 device_id; From 7fb39bf2a1de9dc9e0846a1e3fe74e959a693a0d Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Fri, 8 May 2020 18:36:45 +0530 Subject: [PATCH 149/427] PCI: cadence: Fix to read 32-bit Vendor ID/Device ID property from DT The PCI Bus Binding specification (IEEE Std 1275-1994 Revision 2.1 [1]) defines both Vendor ID and Device ID to be 32-bits. Fix pcie-cadence-host.c driver to read 32-bit Vendor ID and Device ID properties from device tree. 
[1] -> https://www.devicetree.org/open-firmware/bindings/pci/pci2_1.pdf Link: https://lore.kernel.org/r/20200508130646.23939-4-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring Acked-by: Tom Joseph --- drivers/pci/controller/cadence/pcie-cadence-host.c | 4 ++-- drivers/pci/controller/cadence/pcie-cadence.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/cadence/pcie-cadence-host.c b/drivers/pci/controller/cadence/pcie-cadence-host.c index e5e9a3293579..8c2543f28ba0 100644 --- a/drivers/pci/controller/cadence/pcie-cadence-host.c +++ b/drivers/pci/controller/cadence/pcie-cadence-host.c @@ -220,10 +220,10 @@ int cdns_pcie_host_setup(struct cdns_pcie_rc *rc) of_property_read_u32(np, "cdns,no-bar-match-nbits", &rc->no_bar_nbits); rc->vendor_id = 0xffff; - of_property_read_u16(np, "vendor-id", &rc->vendor_id); + of_property_read_u32(np, "vendor-id", &rc->vendor_id); rc->device_id = 0xffff; - of_property_read_u16(np, "device-id", &rc->device_id); + of_property_read_u32(np, "device-id", &rc->device_id); res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "reg"); pcie->reg_base = devm_ioremap_resource(dev, res); diff --git a/drivers/pci/controller/cadence/pcie-cadence.h b/drivers/pci/controller/cadence/pcie-cadence.h index 6bd89a21bb1c..df14ad002fe9 100644 --- a/drivers/pci/controller/cadence/pcie-cadence.h +++ b/drivers/pci/controller/cadence/pcie-cadence.h @@ -262,8 +262,8 @@ struct cdns_pcie_rc { struct resource *bus_range; void __iomem *cfg_base; u32 no_bar_nbits; - u16 vendor_id; - u16 device_id; + u32 vendor_id; + u32 device_id; }; /** From 6c805f77f161d65364cfc4e4734f7057a621fee4 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:24 +0200 Subject: [PATCH 150/427] dm zoned: remove 'dev' argument from reclaim Use the dmz_zone_to_dev() mapping function to remove the 'dev' argument from reclaim. 
Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-reclaim.c | 56 ++++++++++++++++++----------------- drivers/md/dm-zoned-target.c | 2 +- drivers/md/dm-zoned.h | 4 +-- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 5daede0daf92..3c8847d49e5a 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -13,7 +13,6 @@ struct dmz_reclaim { struct dmz_metadata *metadata; - struct dmz_dev *dev; struct delayed_work work; struct workqueue_struct *wq; @@ -59,6 +58,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, sector_t block) { struct dmz_metadata *zmd = zrc->metadata; + struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); sector_t wp_block = zone->wp_block; unsigned int nr_blocks; int ret; @@ -74,15 +74,15 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, * pointer and the requested position. 
*/ nr_blocks = block - wp_block; - ret = blkdev_issue_zeroout(zrc->dev->bdev, + ret = blkdev_issue_zeroout(dev->bdev, dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), dmz_blk2sect(nr_blocks), GFP_NOIO, 0); if (ret) { - dmz_dev_err(zrc->dev, + dmz_dev_err(dev, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", zone->id, (unsigned long long)wp_block, (unsigned long long)block, nr_blocks, ret); - dmz_check_bdev(zrc->dev); + dmz_check_bdev(dev); return ret; } @@ -116,7 +116,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, struct dm_zone *src_zone, struct dm_zone *dst_zone) { struct dmz_metadata *zmd = zrc->metadata; - struct dmz_dev *dev = zrc->dev; + struct dmz_dev *src_dev, *dst_dev; struct dm_io_region src, dst; sector_t block = 0, end_block; sector_t nr_blocks; @@ -130,13 +130,17 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, else end_block = dmz_zone_nr_blocks(zmd); src_zone_block = dmz_start_block(zmd, src_zone); + src_dev = dmz_zone_to_dev(zmd, src_zone); dst_zone_block = dmz_start_block(zmd, dst_zone); + dst_dev = dmz_zone_to_dev(zmd, dst_zone); if (dmz_is_seq(dst_zone)) set_bit(DM_KCOPYD_WRITE_SEQ, &flags); while (block < end_block) { - if (dev->flags & DMZ_BDEV_DYING) + if (src_dev->flags & DMZ_BDEV_DYING) + return -EIO; + if (dst_dev->flags & DMZ_BDEV_DYING) return -EIO; /* Get a valid region from the source zone */ @@ -156,11 +160,11 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, return ret; } - src.bdev = dev->bdev; + src.bdev = src_dev->bdev; src.sector = dmz_blk2sect(src_zone_block + block); src.count = dmz_blk2sect(nr_blocks); - dst.bdev = dev->bdev; + dst.bdev = dst_dev->bdev; dst.sector = dmz_blk2sect(dst_zone_block + block); dst.count = src.count; @@ -194,10 +198,10 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dmz_metadata *zmd = zrc->metadata; int ret; - dmz_dev_debug(zrc->dev, - "Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", - dzone->chunk, 
bzone->id, dmz_weight(bzone), - dzone->id, dmz_weight(dzone)); + DMDEBUG("(%s): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", + dmz_metadata_label(zmd), + dzone->chunk, bzone->id, dmz_weight(bzone), + dzone->id, dmz_weight(dzone)); /* Flush data zone into the buffer zone */ ret = dmz_reclaim_copy(zrc, bzone, dzone); @@ -233,10 +237,10 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dmz_metadata *zmd = zrc->metadata; int ret = 0; - dmz_dev_debug(zrc->dev, - "Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", - chunk, dzone->id, dmz_weight(dzone), - bzone->id, dmz_weight(bzone)); + DMDEBUG("(%s): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", + dmz_metadata_label(zmd), + chunk, dzone->id, dmz_weight(dzone), + bzone->id, dmz_weight(bzone)); /* Flush data zone into the buffer zone */ ret = dmz_reclaim_copy(zrc, dzone, bzone); @@ -285,9 +289,9 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) if (!szone) return -ENOSPC; - dmz_dev_debug(zrc->dev, - "Chunk %u, move rnd zone %u (weight %u) to seq zone %u", - chunk, dzone->id, dmz_weight(dzone), szone->id); + DMDEBUG("(%s): Chunk %u, move rnd zone %u (weight %u) to seq zone %u", + dmz_metadata_label(zmd), + chunk, dzone->id, dmz_weight(dzone), szone->id); /* Flush the random data zone into the sequential zone */ ret = dmz_reclaim_copy(zrc, dzone, szone); @@ -352,7 +356,6 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) return PTR_ERR(dzone); start = jiffies; - if (dmz_is_rnd(dzone)) { if (!dmz_weight(dzone)) { /* Empty zone */ @@ -400,14 +403,14 @@ out: ret = dmz_flush_metadata(zrc->metadata); if (ret) { - dmz_dev_debug(zrc->dev, - "Metadata flush for zone %u failed, err %d\n", - rzone->id, ret); + DMDEBUG("(%s): Metadata flush for zone %u failed, err %d\n", + dmz_metadata_label(zmd), rzone->id, ret); return ret; } - dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms", - rzone->id, 
jiffies_to_msecs(jiffies - start)); + DMDEBUG("(%s): Reclaimed zone %u in %u ms", + dmz_metadata_label(zmd), + rzone->id, jiffies_to_msecs(jiffies - start)); return 0; } @@ -500,7 +503,7 @@ static void dmz_reclaim_work(struct work_struct *work) /* * Initialize reclaim. */ -int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd, +int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **reclaim) { struct dmz_reclaim *zrc; @@ -510,7 +513,6 @@ int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd, if (!zrc) return -ENOMEM; - zrc->dev = dev; zrc->metadata = zmd; zrc->atime = jiffies; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index b32e791b8a5c..520e55df627b 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -840,7 +840,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD); /* Initialize reclaim */ - ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim); + ret = dmz_ctr_reclaim(dmz->metadata, &dmz->reclaim); if (ret) { ti->error = "Zone reclaim initialization failed"; goto err_fwq; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index e0883df8a903..2629bd51fa26 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -180,6 +180,7 @@ const char *dmz_metadata_label(struct dmz_metadata *zmd); sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_chunks(struct dmz_metadata *zmd); +struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone); bool dmz_check_dev(struct dmz_metadata *zmd); bool dmz_dev_is_dying(struct dmz_metadata *zmd); @@ -254,8 +255,7 @@ int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, /* * Functions defined in dm-zoned-reclaim.c */ -int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata 
*zmd, - struct dmz_reclaim **zrc); +int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **zrc); void dmz_dtr_reclaim(struct dmz_reclaim *zrc); void dmz_suspend_reclaim(struct dmz_reclaim *zrc); void dmz_resume_reclaim(struct dmz_reclaim *zrc); From 52d6775888c65be66f3577ccd0f14b51691df7f9 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:25 +0200 Subject: [PATCH 151/427] dm zoned: replace 'target' pointer in the bio context Replace the 'target' pointer in the bio context with the device pointer as this is what's actually used. Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-target.c | 44 ++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 520e55df627b..a09fb78ffe88 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -17,7 +17,7 @@ * Zone BIO context. 
*/ struct dmz_bioctx { - struct dmz_target *target; + struct dmz_dev *dev; struct dm_zone *zone; struct bio *bio; refcount_t ref; @@ -76,12 +76,13 @@ struct dmz_target { */ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status) { - struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + struct dmz_bioctx *bioctx = + dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK) bio->bi_status = status; if (bio->bi_status != BLK_STS_OK) - bioctx->target->dev->flags |= DMZ_CHECK_BDEV; + bioctx->dev->flags |= DMZ_CHECK_BDEV; if (refcount_dec_and_test(&bioctx->ref)) { struct dm_zone *zone = bioctx->zone; @@ -118,14 +119,20 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone, struct bio *bio, sector_t chunk_block, unsigned int nr_blocks) { - struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + struct dmz_bioctx *bioctx = + dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + struct dmz_dev *dev = dmz_zone_to_dev(dmz->metadata, zone); struct bio *clone; + if (dev->flags & DMZ_BDEV_DYING) + return -EIO; + clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set); if (!clone) return -ENOMEM; - bio_set_dev(clone, dmz->dev->bdev); + bio_set_dev(clone, dev->bdev); + bioctx->dev = dev; clone->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT; @@ -218,8 +225,10 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, if (nr_blocks) { /* Valid blocks found: read them */ - nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block); - ret = dmz_submit_bio(dmz, rzone, bio, chunk_block, nr_blocks); + nr_blocks = min_t(unsigned int, nr_blocks, + end_block - chunk_block); + ret = dmz_submit_bio(dmz, rzone, bio, + chunk_block, nr_blocks); if (ret) return ret; chunk_block += nr_blocks; @@ -330,7 +339,8 @@ static int dmz_handle_write(struct 
dmz_target *dmz, struct dm_zone *zone, * and the BIO is aligned to the zone write pointer: * direct write the zone. */ - return dmz_handle_direct_write(dmz, zone, bio, chunk_block, nr_blocks); + return dmz_handle_direct_write(dmz, zone, bio, + chunk_block, nr_blocks); } /* @@ -383,7 +393,8 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, struct bio *bio) { - struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + struct dmz_bioctx *bioctx = + dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); struct dmz_metadata *zmd = dmz->metadata; struct dm_zone *zone; int ret; @@ -397,11 +408,6 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, dmz_lock_metadata(zmd); - if (dmz->dev->flags & DMZ_BDEV_DYING) { - ret = -EIO; - goto out; - } - /* * Get the data zone mapping the chunk. There may be no * mapping for read and discard. If a mapping is obtained, @@ -625,7 +631,6 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) { struct dmz_target *dmz = ti->private; struct dmz_metadata *zmd = dmz->metadata; - struct dmz_dev *dev = dmz->dev; struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); sector_t sector = bio->bi_iter.bi_sector; unsigned int nr_sectors = bio_sectors(bio); @@ -642,8 +647,6 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)), (unsigned int)dmz_bio_blocks(bio)); - bio_set_dev(bio, dev->bdev); - if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE) return DM_MAPIO_REMAPPED; @@ -652,7 +655,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_KILL; /* Initialize the BIO context */ - bioctx->target = dmz; + bioctx->dev = NULL; bioctx->zone = NULL; bioctx->bio = bio; refcount_set(&bioctx->ref, 1); @@ -931,11 +934,12 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) 
static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dmz_target *dmz = ti->private; + struct dmz_dev *dev = &dmz->dev[0]; - if (!dmz_check_bdev(dmz->dev)) + if (!dmz_check_bdev(dev)) return -EIO; - *bdev = dmz->dev->bdev; + *bdev = dev->bdev; return 0; } From aa821c8dc0d76fa9f827becf1186bfd824f1fcfb Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:26 +0200 Subject: [PATCH 152/427] dm zoned: use dmz_zone_to_dev() when handling metadata I/O Use accessors to retrieve the device pointer in preparation for adding an additional block device. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 426af738f1ca..312194be4cb0 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1310,6 +1310,7 @@ static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx, */ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) { + struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); unsigned int noio_flag; int ret; @@ -1320,16 +1321,16 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) * GFP_NOIO was specified. 
*/ noio_flag = memalloc_noio_save(); - ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), 1, + ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1, dmz_update_zone_cb, zone); memalloc_noio_restore(noio_flag); if (ret == 0) ret = -EIO; if (ret < 0) { - dmz_dev_err(zmd->dev, "Get zone %u report failed", + dmz_dev_err(dev, "Get zone %u report failed", zone->id); - dmz_check_bdev(zmd->dev); + dmz_check_bdev(dev); return ret; } @@ -1343,6 +1344,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, struct dm_zone *zone) { + struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); unsigned int wp = 0; int ret; @@ -1351,7 +1353,7 @@ static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, if (ret) return ret; - dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)", + dmz_dev_warn(dev, "Processing zone %u write error (zone wp %u/%u)", zone->id, zone->wp_block, wp); if (zone->wp_block < wp) { @@ -1384,7 +1386,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) return 0; if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { - struct dmz_dev *dev = zmd->dev; + struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, dmz_start_sect(zmd, zone), From ca1a70450a969c63dd19f0a34504fa1bd227e730 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:27 +0200 Subject: [PATCH 153/427] dm zoned: add metadata logging functions Use the metadata label for logging and not the underlying device. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 106 +++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 44 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 312194be4cb0..0e7122867fd8 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -194,6 +194,17 @@ struct dmz_metadata { wait_queue_head_t free_wq; }; +#define dmz_zmd_info(zmd, format, args...) \ + DMINFO("(%s): " format, (zmd)->devname, ## args) + +#define dmz_zmd_err(zmd, format, args...) \ + DMERR("(%s): " format, (zmd)->devname, ## args) + +#define dmz_zmd_warn(zmd, format, args...) \ + DMWARN("(%s): " format, (zmd)->devname, ## args) + +#define dmz_zmd_debug(zmd, format, args...) \ + DMDEBUG("(%s): " format, (zmd)->devname, ## args) /* * Various accessors */ @@ -1098,7 +1109,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) int ret; if (!zmd->sb[0].zone) { - dmz_dev_err(zmd->dev, "Primary super block zone not set"); + dmz_zmd_err(zmd, "Primary super block zone not set"); return -ENXIO; } @@ -1135,7 +1146,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* Use highest generation sb first */ if (!sb_good[0] && !sb_good[1]) { - dmz_dev_err(zmd->dev, "No valid super block found"); + dmz_zmd_err(zmd, "No valid super block found"); return -EIO; } @@ -1248,7 +1259,7 @@ static void dmz_drop_zones(struct dmz_metadata *zmd) */ static int dmz_init_zones(struct dmz_metadata *zmd) { - struct dmz_dev *dev = zmd->dev; + struct dmz_dev *dev = &zmd->dev[0]; int ret; /* Init */ @@ -1268,8 +1279,8 @@ static int dmz_init_zones(struct dmz_metadata *zmd) if (!zmd->zones) return -ENOMEM; - dmz_dev_info(dev, "Using %zu B for zone information", - sizeof(struct dm_zone) * zmd->nr_zones); + DMINFO("(%s): Using %zu B for zone information", + zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); /* * Get zone information and initialize 
zone descriptors. At the same @@ -1412,7 +1423,6 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone); */ static int dmz_load_mapping(struct dmz_metadata *zmd) { - struct dmz_dev *dev = zmd->dev; struct dm_zone *dzone, *bzone; struct dmz_mblock *dmap_mblk = NULL; struct dmz_map *dmap; @@ -1445,7 +1455,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) goto next; if (dzone_id >= zmd->nr_zones) { - dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u", + dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u", chunk, dzone_id); return -EIO; } @@ -1466,14 +1476,14 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) goto next; if (bzone_id >= zmd->nr_zones) { - dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u", + dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone ID %u", chunk, bzone_id); return -EIO; } bzone = dmz_get(zmd, bzone_id); if (!dmz_is_rnd(bzone)) { - dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone %u", + dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u", chunk, bzone_id); return -EIO; } @@ -1893,7 +1903,7 @@ again: atomic_dec(&zmd->unmap_nr_seq); if (dmz_is_offline(zone)) { - dmz_dev_warn(zmd->dev, "Zone %u is offline", zone->id); + dmz_zmd_warn(zmd, "Zone %u is offline", zone->id); zone = NULL; goto again; } @@ -2104,7 +2114,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, struct dmz_mblock *mblk; unsigned int n = 0; - dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks", + dmz_zmd_debug(zmd, "=> VALIDATE zone %u, block %llu, %u blocks", zone->id, (unsigned long long)chunk_block, nr_blocks); @@ -2134,7 +2144,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, if (likely(zone->weight + n <= zone_nr_blocks)) zone->weight += n; else { - dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u", + dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u", zone->id, zone->weight, zone_nr_blocks - n); 
zone->weight = zone_nr_blocks; @@ -2184,7 +2194,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, struct dmz_mblock *mblk; unsigned int n = 0; - dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks", + dmz_zmd_debug(zmd, "=> INVALIDATE zone %u, block %llu, %u blocks", zone->id, (u64)chunk_block, nr_blocks); WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks); @@ -2214,7 +2224,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, if (zone->weight >= n) zone->weight -= n; else { - dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u", + dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u", zone->id, zone->weight, n); zone->weight = 0; } @@ -2424,7 +2434,7 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) while (!list_empty(&zmd->mblk_dirty_list)) { mblk = list_first_entry(&zmd->mblk_dirty_list, struct dmz_mblock, link); - dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)", + dmz_zmd_warn(zmd, "mblock %llu still in dirty list (ref %u)", (u64)mblk->no, mblk->ref); list_del_init(&mblk->link); rb_erase(&mblk->node, &zmd->mblk_rbtree); @@ -2442,7 +2452,7 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) /* Sanity checks: the mblock rbtree should now be empty */ root = &zmd->mblk_rbtree; rbtree_postorder_for_each_entry_safe(mblk, next, root, node) { - dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree", + dmz_zmd_warn(zmd, "mblock %llu ref %u still in rbtree", (u64)mblk->no, mblk->ref); mblk->ref = 0; dmz_free_mblock(zmd, mblk); @@ -2455,6 +2465,19 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) mutex_destroy(&zmd->map_lock); } +static void dmz_print_dev(struct dmz_metadata *zmd, int num) +{ + struct dmz_dev *dev = &zmd->dev[num]; + + dmz_dev_info(dev, "Host-%s zoned block device", + bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? 
+ "aware" : "managed"); + dmz_dev_info(dev, " %llu 512-byte logical sectors", + (u64)dev->capacity); + dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", + dev->nr_zones, (u64)zmd->zone_nr_sectors); +} + /* * Initialize the zoned metadata. */ @@ -2531,34 +2554,31 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, /* Metadata cache shrinker */ ret = register_shrinker(&zmd->mblk_shrinker); if (ret) { - dmz_dev_err(dev, "Register metadata cache shrinker failed"); + dmz_zmd_err(zmd, "Register metadata cache shrinker failed"); goto err; } - dmz_dev_info(dev, "Host-%s zoned block device", - bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? - "aware" : "managed"); - dmz_dev_info(dev, " %llu 512-byte logical sectors", - (u64)dev->capacity); - dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", - zmd->nr_zones, (u64)zmd->zone_nr_sectors); - dmz_dev_info(dev, " %u metadata zones", - zmd->nr_meta_zones * 2); - dmz_dev_info(dev, " %u data zones for %u chunks", - zmd->nr_data_zones, zmd->nr_chunks); - dmz_dev_info(dev, " %u random zones (%u unmapped)", - zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); - dmz_dev_info(dev, " %u sequential zones (%u unmapped)", - zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq)); - dmz_dev_info(dev, " %u reserved sequential data zones", - zmd->nr_reserved_seq); + dmz_zmd_info(zmd, "DM-Zoned metadata version %d", DMZ_META_VER); + dmz_print_dev(zmd, 0); - dmz_dev_debug(dev, "Format:"); - dmz_dev_debug(dev, "%u metadata blocks per set (%u max cache)", + dmz_zmd_info(zmd, " %u zones of %llu 512-byte logical sectors", + zmd->nr_zones, (u64)zmd->zone_nr_sectors); + dmz_zmd_info(zmd, " %u metadata zones", + zmd->nr_meta_zones * 2); + dmz_zmd_info(zmd, " %u data zones for %u chunks", + zmd->nr_data_zones, zmd->nr_chunks); + dmz_zmd_info(zmd, " %u random zones (%u unmapped)", + zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); + dmz_zmd_info(zmd, " %u sequential zones (%u unmapped)", + zmd->nr_seq, 
atomic_read(&zmd->unmap_nr_seq)); + dmz_zmd_info(zmd, " %u reserved sequential data zones", + zmd->nr_reserved_seq); + dmz_zmd_debug(zmd, "Format:"); + dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)", zmd->nr_meta_blocks, zmd->max_nr_mblks); - dmz_dev_debug(dev, " %u data zone mapping blocks", + dmz_zmd_debug(zmd, " %u data zone mapping blocks", zmd->nr_map_blocks); - dmz_dev_debug(dev, " %u bitmap blocks", + dmz_zmd_debug(zmd, " %u bitmap blocks", zmd->nr_bitmap_blocks); *metadata = zmd; @@ -2587,7 +2607,6 @@ void dmz_dtr_metadata(struct dmz_metadata *zmd) */ int dmz_resume_metadata(struct dmz_metadata *zmd) { - struct dmz_dev *dev = zmd->dev; struct dm_zone *zone; sector_t wp_block; unsigned int i; @@ -2597,20 +2616,19 @@ int dmz_resume_metadata(struct dmz_metadata *zmd) for (i = 0; i < zmd->nr_zones; i++) { zone = dmz_get(zmd, i); if (!zone) { - dmz_dev_err(dev, "Unable to get zone %u", i); + dmz_zmd_err(zmd, "Unable to get zone %u", i); return -EIO; } - wp_block = zone->wp_block; ret = dmz_update_zone(zmd, zone); if (ret) { - dmz_dev_err(dev, "Broken zone %u", i); + dmz_zmd_err(zmd, "Broken zone %u", i); return ret; } if (dmz_is_offline(zone)) { - dmz_dev_warn(dev, "Zone %u is offline", i); + dmz_zmd_warn(zmd, "Zone %u is offline", i); continue; } @@ -2618,7 +2636,7 @@ int dmz_resume_metadata(struct dmz_metadata *zmd) if (!dmz_is_seq(zone)) zone->wp_block = 0; else if (zone->wp_block != wp_block) { - dmz_dev_err(dev, "Zone %u: Invalid wp (%llu / %llu)", + dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)", i, (u64)zone->wp_block, (u64)wp_block); zone->wp_block = wp_block; dmz_invalidate_blocks(zmd, zone, zone->wp_block, From ae3c1f1171467f83849c7e8c5e0e632c5078ca2f Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:28 +0200 Subject: [PATCH 154/427] dm zoned: Reduce logging output on startup dm-zoned is becoming quite chatty during startup; reduce the noise by moving some information to 'debug' level. 
Suggested-by: Mike Snitzer Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 0e7122867fd8..1290728c197e 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1279,8 +1279,8 @@ static int dmz_init_zones(struct dmz_metadata *zmd) if (!zmd->zones) return -ENOMEM; - DMINFO("(%s): Using %zu B for zone information", - zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); + DMDEBUG("(%s): Using %zu B for zone information", + zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); /* * Get zone information and initialize zone descriptors. At the same @@ -2563,16 +2563,16 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, dmz_zmd_info(zmd, " %u zones of %llu 512-byte logical sectors", zmd->nr_zones, (u64)zmd->zone_nr_sectors); - dmz_zmd_info(zmd, " %u metadata zones", - zmd->nr_meta_zones * 2); - dmz_zmd_info(zmd, " %u data zones for %u chunks", - zmd->nr_data_zones, zmd->nr_chunks); - dmz_zmd_info(zmd, " %u random zones (%u unmapped)", - zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); - dmz_zmd_info(zmd, " %u sequential zones (%u unmapped)", - zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq)); - dmz_zmd_info(zmd, " %u reserved sequential data zones", - zmd->nr_reserved_seq); + dmz_zmd_debug(zmd, " %u metadata zones", + zmd->nr_meta_zones * 2); + dmz_zmd_debug(zmd, " %u data zones for %u chunks", + zmd->nr_data_zones, zmd->nr_chunks); + dmz_zmd_debug(zmd, " %u random zones (%u unmapped)", + zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); + dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)", + zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq)); + dmz_zmd_debug(zmd, " %u reserved sequential data zones", + zmd->nr_reserved_seq); dmz_zmd_debug(zmd, "Format:"); dmz_zmd_debug(zmd, "%u metadata 
blocks per set (%u max cache)", zmd->nr_meta_blocks, zmd->max_nr_mblks); From dc076c838f65723325001c977b39e55fc6ba0fa7 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:29 +0200 Subject: [PATCH 155/427] dm zoned: ignore metadata zone in dmz_alloc_zone() When looking up zones in dmz_alloc_zone() we need to ignore metadata zones so as not to accidentally overwrite metadata. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 1290728c197e..939deed1606a 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1907,7 +1907,13 @@ again: zone = NULL; goto again; } + if (dmz_is_meta(zone)) { + struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); + dmz_dev_warn(dev, "Zone %u has metadata", zone->id); + zone = NULL; + goto again; + } return zone; } From bd5c40313a1467e4683d92456fc5219d94823f24 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:30 +0200 Subject: [PATCH 156/427] dm zoned: metadata version 2 Implement handling for metadata version 2. The new metadata adds a label and UUID for the device mapper device, and additional UUID for the underlying block devices. It also allows for an additional regular drive to be used for emulating random access zones. The emulated zones will be placed logically in front of the zones from the zoned block device, causing the superblocks and metadata to be stored on that device. The first zone of the original zoned device will be used to hold another, tertiary copy of the metadata; this copy carries a generation number of 0 and is never updated; it's just used for identification. 
Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-zoned.rst | 34 +- drivers/md/dm-zoned-metadata.c | 310 +++++++++++++++--- drivers/md/dm-zoned-target.c | 183 ++++++++--- drivers/md/dm-zoned.h | 7 +- 4 files changed, 426 insertions(+), 108 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/dm-zoned.rst b/Documentation/admin-guide/device-mapper/dm-zoned.rst index 7547ce635161..553752ea2521 100644 --- a/Documentation/admin-guide/device-mapper/dm-zoned.rst +++ b/Documentation/admin-guide/device-mapper/dm-zoned.rst @@ -37,9 +37,13 @@ Algorithm dm-zoned implements an on-disk buffering scheme to handle non-sequential write accesses to the sequential zones of a zoned block device. Conventional zones are used for caching as well as for storing internal -metadata. +metadata. It can also use a regular block device together with the zoned +block device; in that case the regular block device will be split logically +in zones with the same size as the zoned block device. These zones will be +placed in front of the zones from the zoned block device and will be handled +just like conventional zones. -The zones of the device are separated into 2 types: +The zones of the device(s) are separated into 2 types: 1) Metadata zones: these are conventional zones used to store metadata. Metadata zones are not reported as useable capacity to the user. @@ -127,6 +131,13 @@ resumed. Flushing metadata thus only temporarily delays write and discard requests. Read requests can be processed concurrently while metadata flush is being executed. +If a regular device is used in conjunction with the zoned block device, +a third set of metadata (without the zone bitmaps) is written to the +start of the zoned block device. This metadata has a generation counter of +'0' and will never be updated during normal operation; it just serves for +identification purposes. 
The first and second copy of the metadata +are located at the start of the regular block device. + Usage ===== @@ -138,12 +149,21 @@ Ex:: dmzadm --format /dev/sdxx -For a formatted device, the target can be created normally with the -dmsetup utility. The only parameter that dm-zoned requires is the -underlying zoned block device name. Ex:: - echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \ - dmsetup create dmz-`basename ${dev}` +If two drives are to be used, both devices must be specified, with the +regular block device as the first device. + +Ex:: + + dmzadm --format /dev/sdxx /dev/sdyy + + +Formatted device(s) can be started with the dmzadm utility, too: + +Ex:: + + dmzadm --start /dev/sdxx /dev/sdyy + Information about the internal layout and current usage of the zones can be obtained with the 'status' callback from dmsetup: diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 939deed1606a..fba690dd37f5 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -16,7 +16,7 @@ /* * Metadata version. */ -#define DMZ_META_VER 1 +#define DMZ_META_VER 2 /* * On-disk super block magic. 
@@ -69,8 +69,17 @@ struct dmz_super { /* Checksum */ __le32 crc; /* 48 */ + /* DM-Zoned label */ + u8 dmz_label[32]; /* 80 */ + + /* DM-Zoned UUID */ + u8 dmz_uuid[16]; /* 96 */ + + /* Device UUID */ + u8 dev_uuid[16]; /* 112 */ + /* Padding to full 512B sector */ - u8 reserved[464]; /* 512 */ + u8 reserved[400]; /* 512 */ }; /* @@ -133,8 +142,11 @@ struct dmz_sb { */ struct dmz_metadata { struct dmz_dev *dev; + unsigned int nr_devs; char devname[BDEVNAME_SIZE]; + char label[BDEVNAME_SIZE]; + uuid_t uuid; sector_t zone_bitmap_size; unsigned int zone_nr_bitmap_blocks; @@ -161,8 +173,9 @@ struct dmz_metadata { /* Zone information array */ struct dm_zone *zones; - struct dmz_sb sb[2]; + struct dmz_sb sb[3]; unsigned int mblk_primary; + unsigned int sb_version; u64 sb_gen; unsigned int min_nr_mblks; unsigned int max_nr_mblks; @@ -195,31 +208,56 @@ struct dmz_metadata { }; #define dmz_zmd_info(zmd, format, args...) \ - DMINFO("(%s): " format, (zmd)->devname, ## args) + DMINFO("(%s): " format, (zmd)->label, ## args) #define dmz_zmd_err(zmd, format, args...) \ - DMERR("(%s): " format, (zmd)->devname, ## args) + DMERR("(%s): " format, (zmd)->label, ## args) #define dmz_zmd_warn(zmd, format, args...) \ - DMWARN("(%s): " format, (zmd)->devname, ## args) + DMWARN("(%s): " format, (zmd)->label, ## args) #define dmz_zmd_debug(zmd, format, args...) 
\ - DMDEBUG("(%s): " format, (zmd)->devname, ## args) + DMDEBUG("(%s): " format, (zmd)->label, ## args) /* * Various accessors */ +static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + unsigned int zone_id; + + if (WARN_ON(!zone)) + return 0; + + zone_id = zone->id; + if (zmd->nr_devs > 1 && + (zone_id >= zmd->dev[1].zone_offset)) + zone_id -= zmd->dev[1].zone_offset; + return zone_id; +} + sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)zone->id << zmd->zone_nr_sectors_shift; + unsigned int zone_id = dmz_dev_zone_id(zmd, zone); + + return (sector_t)zone_id << zmd->zone_nr_sectors_shift; } sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)zone->id << zmd->zone_nr_blocks_shift; + unsigned int zone_id = dmz_dev_zone_id(zmd, zone); + + return (sector_t)zone_id << zmd->zone_nr_blocks_shift; } struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone) { + if (WARN_ON(!zone)) + return &zmd->dev[0]; + + if (zmd->nr_devs > 1 && + zone->id >= zmd->dev[1].zone_offset) + return &zmd->dev[1]; + return &zmd->dev[0]; } @@ -275,17 +313,29 @@ unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd) const char *dmz_metadata_label(struct dmz_metadata *zmd) { - return (const char *)zmd->devname; + return (const char *)zmd->label; } bool dmz_check_dev(struct dmz_metadata *zmd) { - return dmz_check_bdev(&zmd->dev[0]); + unsigned int i; + + for (i = 0; i < zmd->nr_devs; i++) { + if (!dmz_check_bdev(&zmd->dev[i])) + return false; + } + return true; } bool dmz_dev_is_dying(struct dmz_metadata *zmd) { - return dmz_bdev_is_dying(&zmd->dev[0]); + unsigned int i; + + for (i = 0; i < zmd->nr_devs; i++) { + if (dmz_bdev_is_dying(&zmd->dev[i])) + return true; + } + return false; } /* @@ -687,6 +737,9 @@ static int dmz_rdwr_block(struct dmz_dev *dev, int op, struct bio *bio; int ret; + if (WARN_ON(!dev)) + return -EIO; + if (dmz_bdev_is_dying(dev)) 
return -EIO; @@ -711,19 +764,32 @@ static int dmz_rdwr_block(struct dmz_dev *dev, int op, */ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) { - sector_t block = zmd->sb[set].block; struct dmz_mblock *mblk = zmd->sb[set].mblk; struct dmz_super *sb = zmd->sb[set].sb; struct dmz_dev *dev = zmd->sb[set].dev; + sector_t sb_block; u64 sb_gen = zmd->sb_gen + 1; int ret; sb->magic = cpu_to_le32(DMZ_MAGIC); - sb->version = cpu_to_le32(DMZ_META_VER); + + sb->version = cpu_to_le32(zmd->sb_version); + if (zmd->sb_version > 1) { + BUILD_BUG_ON(UUID_SIZE != 16); + export_uuid(sb->dmz_uuid, &zmd->uuid); + memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE); + export_uuid(sb->dev_uuid, &dev->uuid); + } sb->gen = cpu_to_le64(sb_gen); - sb->sb_block = cpu_to_le64(block); + /* + * The metadata always references the absolute block address, + * ie relative to the entire block range, not the per-device + * block address. + */ + sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift; + sb->sb_block = cpu_to_le64(sb_block); sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks); sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq); sb->nr_chunks = cpu_to_le32(zmd->nr_chunks); @@ -734,7 +800,8 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) sb->crc = 0; sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE)); - ret = dmz_rdwr_block(dev, REQ_OP_WRITE, block, mblk->page); + ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block, + mblk->page); if (ret == 0) ret = blkdev_issue_flush(dev->bdev, GFP_NOIO, NULL); @@ -915,6 +982,23 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) u32 crc, stored_crc; u64 gen; + if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { + dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", + DMZ_MAGIC, le32_to_cpu(sb->magic)); + return -ENXIO; + } + + zmd->sb_version = le32_to_cpu(sb->version); + if (zmd->sb_version > DMZ_META_VER) { + dmz_dev_err(dev, "Invalid meta 
version (needed %d, got %d)", + DMZ_META_VER, zmd->sb_version); + return -EINVAL; + } + if ((zmd->sb_version < 1) && (set == 2)) { + dmz_dev_err(dev, "Tertiary superblocks are not supported"); + return -EINVAL; + } + gen = le64_to_cpu(sb->gen); stored_crc = le32_to_cpu(sb->crc); sb->crc = 0; @@ -925,16 +1009,45 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) return -ENXIO; } - if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { - dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", - DMZ_MAGIC, le32_to_cpu(sb->magic)); - return -ENXIO; - } + if (zmd->sb_version > 1) { + uuid_t sb_uuid; - if (le32_to_cpu(sb->version) != DMZ_META_VER) { - dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)", - DMZ_META_VER, le32_to_cpu(sb->version)); - return -ENXIO; + import_uuid(&sb_uuid, sb->dmz_uuid); + if (uuid_is_null(&sb_uuid)) { + dmz_dev_err(dev, "NULL DM-Zoned uuid"); + return -ENXIO; + } else if (uuid_is_null(&zmd->uuid)) { + uuid_copy(&zmd->uuid, &sb_uuid); + } else if (!uuid_equal(&zmd->uuid, &sb_uuid)) { + dmz_dev_err(dev, "mismatching DM-Zoned uuid, " + "is %pUl expected %pUl", + &sb_uuid, &zmd->uuid); + return -ENXIO; + } + if (!strlen(zmd->label)) + memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE); + else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) { + dmz_dev_err(dev, "mismatching DM-Zoned label, " + "is %s expected %s", + sb->dmz_label, zmd->label); + return -ENXIO; + } + import_uuid(&dev->uuid, sb->dev_uuid); + if (uuid_is_null(&dev->uuid)) { + dmz_dev_err(dev, "NULL device uuid"); + return -ENXIO; + } + + if (set == 2) { + /* + * Generation number should be 0, but it doesn't + * really matter if it isn't. 
+ */ + if (gen != 0) + dmz_dev_warn(dev, "Invalid generation %llu", + gen); + return 0; + } } nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1) @@ -1185,21 +1298,38 @@ static int dmz_load_sb(struct dmz_metadata *zmd) "Using super block %u (gen %llu)", zmd->mblk_primary, zmd->sb_gen); + if ((zmd->sb_version > 1) && zmd->sb[2].zone) { + zmd->sb[2].block = dmz_start_block(zmd, zmd->sb[2].zone); + zmd->sb[2].dev = dmz_zone_to_dev(zmd, zmd->sb[2].zone); + ret = dmz_get_sb(zmd, 2); + if (ret) { + dmz_dev_err(zmd->sb[2].dev, + "Read tertiary super block failed"); + return ret; + } + ret = dmz_check_sb(zmd, 2); + if (ret == -EINVAL) + return ret; + } return 0; } /* * Initialize a zone descriptor. */ -static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) +static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) { struct dmz_metadata *zmd = data; + struct dmz_dev *dev = zmd->nr_devs > 1 ? &zmd->dev[1] : &zmd->dev[0]; + int idx = num + dev->zone_offset; struct dm_zone *zone = &zmd->zones[idx]; - struct dmz_dev *dev = zmd->dev; - /* Ignore the eventual last runt (smaller) zone */ if (blkz->len != zmd->zone_nr_sectors) { - if (blkz->start + blkz->len == dev->capacity) + if (zmd->sb_version > 1) { + /* Ignore the eventual runt (smaller) zone */ + set_bit(DMZ_OFFLINE, &zone->flags); + return 0; + } else if (blkz->start + blkz->len == dev->capacity) return 0; return -ENXIO; } @@ -1234,16 +1364,45 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) zmd->nr_useable_zones++; if (dmz_is_rnd(zone)) { zmd->nr_rnd_zones++; - if (!zmd->sb[0].zone) { - /* Super block zone */ + if (zmd->nr_devs == 1 && !zmd->sb[0].zone) { + /* Primary super block zone */ zmd->sb[0].zone = zone; } } + if (zmd->nr_devs > 1 && !zmd->sb[2].zone) { + /* Tertiary superblock zone */ + zmd->sb[2].zone = zone; + } } return 0; } +static void dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) +{ + int idx; 
+ sector_t zone_offset = 0; + + for(idx = 0; idx < dev->nr_zones; idx++) { + struct dm_zone *zone = &zmd->zones[idx]; + + INIT_LIST_HEAD(&zone->link); + atomic_set(&zone->refcount, 0); + zone->id = idx; + zone->chunk = DMZ_MAP_UNMAPPED; + set_bit(DMZ_RND, &zone->flags); + zone->wp_block = 0; + zmd->nr_rnd_zones++; + zmd->nr_useable_zones++; + if (dev->capacity - zone_offset < zmd->zone_nr_sectors) { + /* Disable runt zone */ + set_bit(DMZ_OFFLINE, &zone->flags); + break; + } + zone_offset += zmd->zone_nr_sectors; + } +} + /* * Free zones descriptors. */ @@ -1259,11 +1418,11 @@ static void dmz_drop_zones(struct dmz_metadata *zmd) */ static int dmz_init_zones(struct dmz_metadata *zmd) { - struct dmz_dev *dev = &zmd->dev[0]; - int ret; + int i, ret; + struct dmz_dev *zoned_dev = &zmd->dev[0]; /* Init */ - zmd->zone_nr_sectors = dev->zone_nr_sectors; + zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors; zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors); zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors); zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks); @@ -1274,7 +1433,14 @@ static int dmz_init_zones(struct dmz_metadata *zmd) DMZ_BLOCK_SIZE_BITS); /* Allocate zone array */ - zmd->nr_zones = dev->nr_zones; + zmd->nr_zones = 0; + for (i = 0; i < zmd->nr_devs; i++) + zmd->nr_zones += zmd->dev[i].nr_zones; + + if (!zmd->nr_zones) { + DMERR("(%s): No zones found", zmd->devname); + return -ENXIO; + } zmd->zones = kcalloc(zmd->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); if (!zmd->zones) return -ENOMEM; @@ -1282,14 +1448,27 @@ static int dmz_init_zones(struct dmz_metadata *zmd) DMDEBUG("(%s): Using %zu B for zone information", zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); + if (zmd->nr_devs > 1) { + dmz_emulate_zones(zmd, &zmd->dev[0]); + /* + * Primary superblock zone is always at zone 0 when multiple + * drives are present. 
+ */ + zmd->sb[0].zone = &zmd->zones[0]; + + zoned_dev = &zmd->dev[1]; + } + /* * Get zone information and initialize zone descriptors. At the same * time, determine where the super block should be: first block of the * first randomly writable zone. */ - ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES, dmz_init_zone, - zmd); + ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES, + dmz_init_zone, zmd); if (ret < 0) { + DMDEBUG("(%s): Failed to report zones, error %d", + zmd->devname, ret); dmz_drop_zones(zmd); return ret; } @@ -1325,6 +1504,9 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) unsigned int noio_flag; int ret; + if (dev->flags & DMZ_BDEV_REGULAR) + return 0; + /* * Get zone information from disk. Since blkdev_report_zones() uses * GFP_KERNEL by default for memory allocations, set the per-task @@ -2475,19 +2657,34 @@ static void dmz_print_dev(struct dmz_metadata *zmd, int num) { struct dmz_dev *dev = &zmd->dev[num]; - dmz_dev_info(dev, "Host-%s zoned block device", - bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? - "aware" : "managed"); - dmz_dev_info(dev, " %llu 512-byte logical sectors", - (u64)dev->capacity); - dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", - dev->nr_zones, (u64)zmd->zone_nr_sectors); + if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) + dmz_dev_info(dev, "Regular block device"); + else + dmz_dev_info(dev, "Host-%s zoned block device", + bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? 
+ "aware" : "managed"); + if (zmd->sb_version > 1) { + sector_t sector_offset = + dev->zone_offset << zmd->zone_nr_sectors_shift; + + dmz_dev_info(dev, " %llu 512-byte logical sectors (offset %llu)", + (u64)dev->capacity, (u64)sector_offset); + dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors (offset %llu)", + dev->nr_zones, (u64)zmd->zone_nr_sectors, + (u64)dev->zone_offset); + } else { + dmz_dev_info(dev, " %llu 512-byte logical sectors", + (u64)dev->capacity); + dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", + dev->nr_zones, (u64)zmd->zone_nr_sectors); + } } /* * Initialize the zoned metadata. */ -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, +int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, + struct dmz_metadata **metadata, const char *devname) { struct dmz_metadata *zmd; @@ -2501,6 +2698,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, strcpy(zmd->devname, devname); zmd->dev = dev; + zmd->nr_devs = num_dev; zmd->mblk_rbtree = RB_ROOT; init_rwsem(&zmd->mblk_sem); mutex_init(&zmd->mblk_flush_lock); @@ -2535,11 +2733,24 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, /* Set metadata zones starting from sb_zone */ for (i = 0; i < zmd->nr_meta_zones << 1; i++) { zone = dmz_get(zmd, zmd->sb[0].zone->id + i); - if (!dmz_is_rnd(zone)) + if (!dmz_is_rnd(zone)) { + dmz_zmd_err(zmd, + "metadata zone %d is not random", i); + ret = -ENXIO; goto err; + } + set_bit(DMZ_META, &zone->flags); + } + if (zmd->sb[2].zone) { + zone = dmz_get(zmd, zmd->sb[2].zone->id); + if (!zone) { + dmz_zmd_err(zmd, + "Tertiary metadata zone not present"); + ret = -ENXIO; + goto err; + } set_bit(DMZ_META, &zone->flags); } - /* Load mapping table */ ret = dmz_load_mapping(zmd); if (ret) @@ -2564,8 +2775,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, goto err; } - dmz_zmd_info(zmd, "DM-Zoned metadata version %d", DMZ_META_VER); - 
dmz_print_dev(zmd, 0); + dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version); + for (i = 0; i < zmd->nr_devs; i++) + dmz_print_dev(zmd, i); dmz_zmd_info(zmd, " %u zones of %llu 512-byte logical sectors", zmd->nr_zones, (u64)zmd->zone_nr_sectors); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index a09fb78ffe88..ea43f6892ced 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -13,6 +13,8 @@ #define DMZ_MIN_BIOS 8192 +#define DMZ_MAX_DEVS 2 + /* * Zone BIO context. */ @@ -38,7 +40,7 @@ struct dm_chunk_work { * Target descriptor. */ struct dmz_target { - struct dm_dev *ddev; + struct dm_dev *ddev[DMZ_MAX_DEVS]; unsigned long flags; @@ -81,7 +83,7 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status) if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK) bio->bi_status = status; - if (bio->bi_status != BLK_STS_OK) + if (bioctx->dev && bio->bi_status != BLK_STS_OK) bioctx->dev->flags |= DMZ_CHECK_BDEV; if (refcount_dec_and_test(&bioctx->ref)) { @@ -690,60 +692,64 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) /* * Get zoned device information. 
*/ -static int dmz_get_zoned_device(struct dm_target *ti, char *path) +static int dmz_get_zoned_device(struct dm_target *ti, char *path, + int idx, int nr_devs) { struct dmz_target *dmz = ti->private; - struct request_queue *q; + struct dm_dev *ddev; struct dmz_dev *dev; - sector_t aligned_capacity; int ret; + struct block_device *bdev; /* Get the target device */ - ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &dmz->ddev); + ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev); if (ret) { ti->error = "Get target device failed"; - dmz->ddev = NULL; return ret; } - dev = kzalloc(sizeof(struct dmz_dev), GFP_KERNEL); - if (!dev) { - ret = -ENOMEM; - goto err; + bdev = ddev->bdev; + if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) { + if (nr_devs == 1) { + ti->error = "Invalid regular device"; + goto err; + } + if (idx != 0) { + ti->error = "First device must be a regular device"; + goto err; + } + if (dmz->ddev[0]) { + ti->error = "Too many regular devices"; + goto err; + } + dev = &dmz->dev[idx]; + dev->flags = DMZ_BDEV_REGULAR; + } else { + if (dmz->ddev[idx]) { + ti->error = "Too many zoned devices"; + goto err; + } + if (nr_devs > 1 && idx == 0) { + ti->error = "First device must be a regular device"; + goto err; + } + dev = &dmz->dev[idx]; } - - dev->bdev = dmz->ddev->bdev; + dev->bdev = bdev; (void)bdevname(dev->bdev, dev->name); - if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) { - ti->error = "Not a zoned block device"; - ret = -EINVAL; + dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + if (ti->begin) { + ti->error = "Partial mapping is not supported"; goto err; } - q = bdev_get_queue(dev->bdev); - dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; - aligned_capacity = dev->capacity & - ~((sector_t)blk_queue_zone_sectors(q) - 1); - if (ti->begin || - ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) { - ti->error = "Partial mapping not supported"; - ret = -EINVAL; - goto err; - } - - 
dev->zone_nr_sectors = blk_queue_zone_sectors(q); - - dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk); - - dmz->dev = dev; + dmz->ddev[idx] = ddev; return 0; err: - dm_put_device(ti, dmz->ddev); - kfree(dev); - - return ret; + dm_put_device(ti, ddev); + return -EINVAL; } /* @@ -752,10 +758,56 @@ err: static void dmz_put_zoned_device(struct dm_target *ti) { struct dmz_target *dmz = ti->private; + int i; - dm_put_device(ti, dmz->ddev); - kfree(dmz->dev); - dmz->dev = NULL; + for (i = 0; i < DMZ_MAX_DEVS; i++) { + if (dmz->ddev[i]) { + dm_put_device(ti, dmz->ddev[i]); + dmz->ddev[i] = NULL; + } + } +} + +static int dmz_fixup_devices(struct dm_target *ti) +{ + struct dmz_target *dmz = ti->private; + struct dmz_dev *reg_dev, *zoned_dev; + struct request_queue *q; + + /* + * When we have two devices, the first one must be a regular block + * device and the second a zoned block device. + */ + if (dmz->ddev[0] && dmz->ddev[1]) { + reg_dev = &dmz->dev[0]; + if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) { + ti->error = "Primary disk is not a regular device"; + return -EINVAL; + } + zoned_dev = &dmz->dev[1]; + if (zoned_dev->flags & DMZ_BDEV_REGULAR) { + ti->error = "Secondary disk is not a zoned device"; + return -EINVAL; + } + } else { + reg_dev = NULL; + zoned_dev = &dmz->dev[0]; + if (zoned_dev->flags & DMZ_BDEV_REGULAR) { + ti->error = "Disk is not a zoned device"; + return -EINVAL; + } + } + q = bdev_get_queue(zoned_dev->bdev); + zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q); + zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk); + + if (reg_dev) { + reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors; + reg_dev->nr_zones = DIV_ROUND_UP(reg_dev->capacity, + reg_dev->zone_nr_sectors); + zoned_dev->zone_offset = reg_dev->nr_zones; + } + return 0; } /* @@ -764,11 +816,10 @@ static void dmz_put_zoned_device(struct dm_target *ti) static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct dmz_target *dmz; - struct dmz_dev *dev; 
int ret; /* Check arguments */ - if (argc != 1) { + if (argc < 1 || argc > 2) { ti->error = "Invalid argument count"; return -EINVAL; } @@ -779,18 +830,34 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Unable to allocate the zoned target descriptor"; return -ENOMEM; } + dmz->dev = kcalloc(2, sizeof(struct dmz_dev), GFP_KERNEL); + if (!dmz->dev) { + ti->error = "Unable to allocate the zoned device descriptors"; + kfree(dmz); + return -ENOMEM; + } ti->private = dmz; /* Get the target zoned block device */ - ret = dmz_get_zoned_device(ti, argv[0]); + ret = dmz_get_zoned_device(ti, argv[0], 0, argc); + if (ret) + goto err; + + if (argc == 2) { + ret = dmz_get_zoned_device(ti, argv[1], 1, argc); + if (ret) { + dmz_put_zoned_device(ti); + goto err; + } + } + ret = dmz_fixup_devices(ti); if (ret) { - dmz->ddev = NULL; + dmz_put_zoned_device(ti); goto err; } /* Initialize metadata */ - dev = dmz->dev; - ret = dmz_ctr_metadata(dev, &dmz->metadata, + ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata, dm_table_device_name(ti->table)); if (ret) { ti->error = "Metadata initialization failed"; @@ -867,6 +934,7 @@ err_meta: err_dev: dmz_put_zoned_device(ti); err: + kfree(dmz->dev); kfree(dmz); return ret; @@ -897,6 +965,7 @@ static void dmz_dtr(struct dm_target *ti) mutex_destroy(&dmz->chunk_lock); + kfree(dmz->dev); kfree(dmz); } @@ -971,10 +1040,17 @@ static int dmz_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct dmz_target *dmz = ti->private; - struct dmz_dev *dev = dmz->dev; - sector_t capacity = dev->capacity & ~(dmz_zone_nr_sectors(dmz->metadata) - 1); + unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata); + sector_t capacity; + int r; - return fn(ti, dmz->ddev, 0, capacity, data); + capacity = dmz->dev[0].capacity & ~(zone_nr_sectors - 1); + r = fn(ti, dmz->ddev[0], 0, capacity, data); + if (!r && dmz->ddev[1]) { + capacity = dmz->dev[1].capacity & ~(zone_nr_sectors - 1); 
+ r = fn(ti, dmz->ddev[1], 0, capacity, data); + } + return r; } static void dmz_status(struct dm_target *ti, status_type_t type, @@ -984,6 +1060,7 @@ static void dmz_status(struct dm_target *ti, status_type_t type, struct dmz_target *dmz = ti->private; ssize_t sz = 0; char buf[BDEVNAME_SIZE]; + struct dmz_dev *dev; switch (type) { case STATUSTYPE_INFO: @@ -995,8 +1072,14 @@ static void dmz_status(struct dm_target *ti, status_type_t type, dmz_nr_seq_zones(dmz->metadata)); break; case STATUSTYPE_TABLE: - format_dev_t(buf, dmz->dev->bdev->bd_dev); + dev = &dmz->dev[0]; + format_dev_t(buf, dev->bdev->bd_dev); DMEMIT("%s", buf); + if (dmz->dev[1].bdev) { + dev = &dmz->dev[1]; + format_dev_t(buf, dev->bdev->bd_dev); + DMEMIT(" %s", buf); + } break; } return; @@ -1018,7 +1101,7 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv, static struct target_type dmz_type = { .name = "zoned", - .version = {1, 1, 0}, + .version = {2, 0, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM, .module = THIS_MODULE, .ctr = dmz_ctr, diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 2629bd51fa26..4971a765be55 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -52,10 +52,12 @@ struct dmz_dev { struct block_device *bdev; char name[BDEVNAME_SIZE]; + uuid_t uuid; sector_t capacity; unsigned int nr_zones; + unsigned int zone_offset; unsigned int flags; @@ -69,6 +71,7 @@ struct dmz_dev { /* Device flags. */ #define DMZ_BDEV_DYING (1 << 0) #define DMZ_CHECK_BDEV (2 << 0) +#define DMZ_BDEV_REGULAR (4 << 0) /* * Zone descriptor. 
@@ -163,8 +166,8 @@ struct dmz_reclaim; /* * Functions defined in dm-zoned-metadata.c */ -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd, - const char *devname); +int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, + struct dmz_metadata **zmd, const char *devname); void dmz_dtr_metadata(struct dmz_metadata *zmd); int dmz_resume_metadata(struct dmz_metadata *zmd); From b18ae8dd9d7685233d7be472c043c545f18d015a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:51:58 -0500 Subject: [PATCH 157/427] dm: replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. 
[1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-integrity.c | 2 +- drivers/md/dm-log-writes.c | 2 +- drivers/md/dm-raid.c | 2 +- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-stats.c | 2 +- drivers/md/dm-stripe.c | 2 +- drivers/md/dm-switch.c | 2 +- drivers/md/persistent-data/dm-btree-internal.h | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 91787cde369b..71c651465bdd 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -214,7 +214,7 @@ struct crypt_config { struct mutex bio_alloc_lock; u8 *authenc_key; /* space for keys in authenc() format (if used) */ - u8 key[0]; + u8 key[]; }; #define MIN_IOS 64 diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 3726b987151e..f794dca22032 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -92,7 +92,7 @@ struct journal_entry { } s; __u64 sector; } u; - commit_id_t last_bytes[0]; + commit_id_t last_bytes[]; /* __u8 tag[0]; */ }; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 8ea20b56b4d6..e3d35c6c9f71 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -127,7 +127,7 @@ struct pending_block { char *data; u32 datalen; struct list_head list; - struct bio_vec vecs[0]; + struct bio_vec vecs[]; }; struct per_bio_data { diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9a18bef0a5ff..10e8b2fe787b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -254,7 +254,7 @@ struct raid_set { int mode; } journal_dev; - struct raid_dev dev[0]; + struct raid_dev dev[]; }; static void rs_config_backup(struct raid_set *rs, struct rs_layout *l) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 
089aed57e083..2f655d9f4200 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -83,7 +83,7 @@ struct mirror_set { struct work_struct trigger_event; unsigned nr_mirrors; - struct mirror mirror[0]; + struct mirror mirror[]; }; DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle, diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 71417048256a..35d368c418d0 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -56,7 +56,7 @@ struct dm_stat { size_t percpu_alloc_size; size_t histogram_alloc_size; struct dm_stat_percpu *stat_percpu[NR_CPUS]; - struct dm_stat_shared stat_shared[0]; + struct dm_stat_shared stat_shared[]; }; #define STAT_PRECISE_TIMESTAMPS 1 diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index fa813c0f993d..151d022b032d 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -41,7 +41,7 @@ struct stripe_c { /* Work struct used for triggering events*/ struct work_struct trigger_event; - struct stripe stripe[0]; + struct stripe stripe[]; }; /* diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 8a0f057b8122..bff4c7fa1cd2 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -53,7 +53,7 @@ struct switch_ctx { /* * Array of dm devices to switch between. 
+ */ - struct switch_path path_list[0]; + struct switch_path path_list[]; }; static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths, diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index 55a4096f1334..564896659dd4 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -38,7 +38,7 @@ struct node_header { struct btree_node { struct node_header header; - __le64 keys[0]; + __le64 keys[]; } __packed; From 74244b59a82358b9f51c80981a99c5951ea3028f Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 14 May 2020 08:09:28 +0200 Subject: [PATCH 158/427] dm: use dynamic debug instead of compile-time config option Switch to use dynamic debug to avoid having to recompile the kernel just to enable debugging messages. Signed-off-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- include/linux/device-mapper.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 934037d938b9..8750f2dc5613 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -559,13 +559,8 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); #define DMINFO(fmt, ...) pr_info(DM_FMT(fmt), ##__VA_ARGS__) #define DMINFO_LIMIT(fmt, ...) pr_info_ratelimited(DM_FMT(fmt), ##__VA_ARGS__) -#ifdef CONFIG_DM_DEBUG -#define DMDEBUG(fmt, ...) printk(KERN_DEBUG DM_FMT(fmt), ##__VA_ARGS__) +#define DMDEBUG(fmt, ...) pr_debug(DM_FMT(fmt), ##__VA_ARGS__) #define DMDEBUG_LIMIT(fmt, ...) pr_debug_ratelimited(DM_FMT(fmt), ##__VA_ARGS__) -#else -#define DMDEBUG(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#define DMDEBUG_LIMIT(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#endif #define DMEMIT(x...) sz += ((sz >= maxlen) ? 
\ 0 : scnprintf(result + sz, maxlen - sz, x)) From 49de3b7d21ef12e03358aa77ad6bff4aaf5ac3f5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 14 May 2020 08:09:29 +0200 Subject: [PATCH 159/427] dm zoned: remove spurious newlines from debugging messages DMDEBUG will already add a newline to the logging messages, so we shouldn't be adding it to the message itself. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-reclaim.c | 4 ++-- drivers/md/dm-zoned-target.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 3c8847d49e5a..7e9b11ee064f 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -403,7 +403,7 @@ out: ret = dmz_flush_metadata(zrc->metadata); if (ret) { - DMDEBUG("(%s): Metadata flush for zone %u failed, err %d\n", + DMDEBUG("(%s): Metadata flush for zone %u failed, err %d", dmz_metadata_label(zmd), rzone->id, ret); return ret; } @@ -491,7 +491,7 @@ static void dmz_reclaim_work(struct work_struct *work) ret = dmz_do_reclaim(zrc); if (ret) { - DMDEBUG("(%s): Reclaim error %d\n", + DMDEBUG("(%s): Reclaim error %d", dmz_metadata_label(zmd), ret); if (!dmz_check_dev(zmd)) return; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index ea43f6892ced..a3d572da70ad 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -515,7 +515,7 @@ static void dmz_flush_work(struct work_struct *work) /* Flush dirty metadata blocks */ ret = dmz_flush_metadata(dmz->metadata); if (ret) - DMDEBUG("(%s): Metadata flush failed, rc=%d\n", + DMDEBUG("(%s): Metadata flush failed, rc=%d", dmz_metadata_label(dmz->metadata), ret); /* Process queued flush requests */ @@ -679,7 +679,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) /* Now ready to handle this BIO */ ret = dmz_queue_chunk_work(dmz, bio); if (ret) { - DMDEBUG("(%s): BIO op %d, can't 
+ process chunk %llu, err %i\n", + DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i", dmz_metadata_label(zmd), bio_op(bio), (u64)dmz_bio_chunk(zmd, bio), ret); From ac75b09fc62df441eee90fecfe9b2a6ca24976f2 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 14 May 2020 12:55:39 -0400 Subject: [PATCH 160/427] dm: use DMDEBUG macros now that they use pr_debug variants Now that DMDEBUG uses pr_debug and DMDEBUG_LIMIT uses pr_debug_ratelimited, clean up DM's 2 direct pr_debug callers to use them to get the benefit of consistent DM_FMT formatting of debugging messages. While doing so, dm-mpath.c:dm_report_EIO() was switched over to using DMDEBUG_LIMIT due to the potential for error handling floods in the IO completion path. Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 12 ++++++------ drivers/md/dm.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 74246d7c7d68..95f16d816585 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -439,7 +439,7 @@ failed: } /* - * dm_report_EIO() is a macro instead of a function to make pr_debug() + * dm_report_EIO() is a macro instead of a function to make pr_debug_ratelimited() * report the function name and line number of the function from which * it has been invoked. 
*/ @@ -447,11 +447,11 @@ failed: do { \ struct mapped_device *md = dm_table_get_md((m)->ti->table); \ \ - pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \ - dm_device_name(md), \ - test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ - test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ - dm_noflush_suspending((m)->ti)); \ + DMDEBUG_LIMIT("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d", \ + dm_device_name(md), \ + test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ + dm_noflush_suspending((m)->ti)); \ } while (0) /* diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2fcb932eb4bd..1fae647ef108 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2609,7 +2609,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); else - pr_debug("%s: suspending with flush\n", dm_device_name(md)); + DMDEBUG("%s: suspending with flush", dm_device_name(md)); /* * This gets reverted if there's an error later and the targets From 42c689f671233371f5c8c1685ab77bd66c274932 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 13 May 2020 01:45:22 -0700 Subject: [PATCH 161/427] dm zoned: Avoid 64-bit division error in dmz_fixup_devices When building arm32 allyesconfig: ld.lld: error: undefined symbol: __aeabi_uldivmod >>> referenced by dm-zoned-target.c >>> md/dm-zoned-target.o:(dmz_ctr) in archive drivers/built-in.a dmz_fixup_devices uses DIV_ROUND_UP with variables of type sector_t. As such, it should be using DIV_ROUND_UP_SECTOR_T, which handles this automatically. 
Fixes: 70978208ec91 ("dm zoned: metadata version 2") Signed-off-by: Nathan Chancellor Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-target.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index a3d572da70ad..b586fc67d931 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -803,8 +803,9 @@ static int dmz_fixup_devices(struct dm_target *ti) if (reg_dev) { reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors; - reg_dev->nr_zones = DIV_ROUND_UP(reg_dev->capacity, - reg_dev->zone_nr_sectors); + reg_dev->nr_zones = + DIV_ROUND_UP_SECTOR_T(reg_dev->capacity, + reg_dev->zone_nr_sectors); zoned_dev->zone_offset = reg_dev->nr_zones; } return 0; From 489dc0f06a5837f87482c0ce61d830d24e17082e Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 19 May 2020 10:14:19 +0200 Subject: [PATCH 162/427] dm zoned: return NULL if dmz_get_zone_for_reclaim() fails to find a zone The only case where dmz_get_zone_for_reclaim() cannot return a zone is if the respective lists are empty. So we should just return a simple NULL value here as we really don't have an error code which would make sense. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 4 ++-- drivers/md/dm-zoned-reclaim.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index fba690dd37f5..fa7bcb28e952 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1845,7 +1845,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) return dzone; } - return ERR_PTR(-EBUSY); + return NULL; } /* @@ -1865,7 +1865,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) return zone; } - return ERR_PTR(-EBUSY); + return NULL; } /* diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 7e9b11ee064f..201177ad1f17 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -352,8 +352,8 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) /* Get a data zone */ dzone = dmz_get_zone_for_reclaim(zmd); - if (IS_ERR(dzone)) - return PTR_ERR(dzone); + if (!dzone) + return -EBUSY; start = jiffies; if (dmz_is_rnd(dzone)) { From 34f5affd04c4a16d9df19c369bcec6e873e57ffe Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 19 May 2020 10:14:20 +0200 Subject: [PATCH 163/427] dm zoned: separate random and cache zones Instead of lumping emulated zones together with random zones we should be handling them as separate 'cache' zones. This improves code readability and allows an easier implementation of different cache policies. Also add additional allocation flags, to separate the type (cache, random, or sequential) from the purpose (eg reclaim). Also switch the allocation policy to not use random zones as buffer zones if cache zones are present. This avoids a performance drop when all cache zones are used. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 123 ++++++++++++++++++++++++--------- drivers/md/dm-zoned-reclaim.c | 76 ++++++++++++-------- drivers/md/dm-zoned-target.c | 19 +++-- drivers/md/dm-zoned.h | 8 ++- 4 files changed, 159 insertions(+), 67 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index fa7bcb28e952..6c009a8b36a4 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -166,6 +166,7 @@ struct dmz_metadata { unsigned int nr_meta_blocks; unsigned int nr_meta_zones; unsigned int nr_data_zones; + unsigned int nr_cache_zones; unsigned int nr_rnd_zones; unsigned int nr_reserved_seq; unsigned int nr_chunks; @@ -196,6 +197,11 @@ struct dmz_metadata { struct list_head unmap_rnd_list; struct list_head map_rnd_list; + unsigned int nr_cache; + atomic_t unmap_nr_cache; + struct list_head unmap_cache_list; + struct list_head map_cache_list; + unsigned int nr_seq; atomic_t unmap_nr_seq; struct list_head unmap_seq_list; @@ -301,6 +307,16 @@ unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd) return atomic_read(&zmd->unmap_nr_rnd); } +unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd) +{ + return zmd->nr_cache; +} + +unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd) +{ + return atomic_read(&zmd->unmap_nr_cache); +} + unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd) { return zmd->nr_seq; @@ -1390,9 +1406,9 @@ static void dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) atomic_set(&zone->refcount, 0); zone->id = idx; zone->chunk = DMZ_MAP_UNMAPPED; - set_bit(DMZ_RND, &zone->flags); + set_bit(DMZ_CACHE, &zone->flags); zone->wp_block = 0; - zmd->nr_rnd_zones++; + zmd->nr_cache_zones++; zmd->nr_useable_zones++; if (dev->capacity - zone_offset < zmd->zone_nr_sectors) { /* Disable runt zone */ @@ -1647,7 +1663,9 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) 
dzone->chunk = chunk; dmz_get_zone_weight(zmd, dzone); - if (dmz_is_rnd(dzone)) + if (dmz_is_cache(dzone)) + list_add_tail(&dzone->link, &zmd->map_cache_list); + else if (dmz_is_rnd(dzone)) list_add_tail(&dzone->link, &zmd->map_rnd_list); else list_add_tail(&dzone->link, &zmd->map_seq_list); @@ -1664,7 +1682,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) } bzone = dmz_get(zmd, bzone_id); - if (!dmz_is_rnd(bzone)) { + if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) { dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u", chunk, bzone_id); return -EIO; @@ -1676,7 +1694,10 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) bzone->bzone = dzone; dzone->bzone = bzone; dmz_get_zone_weight(zmd, bzone); - list_add_tail(&bzone->link, &zmd->map_rnd_list); + if (dmz_is_cache(bzone)) + list_add_tail(&bzone->link, &zmd->map_cache_list); + else + list_add_tail(&bzone->link, &zmd->map_rnd_list); next: chunk++; e++; @@ -1693,8 +1714,12 @@ next: dzone = dmz_get(zmd, i); if (dmz_is_meta(dzone)) continue; + if (dmz_is_offline(dzone)) + continue; - if (dmz_is_rnd(dzone)) + if (dmz_is_cache(dzone)) + zmd->nr_cache++; + else if (dmz_is_rnd(dzone)) zmd->nr_rnd++; else zmd->nr_seq++; @@ -1707,7 +1732,10 @@ next: /* Unmapped data zone */ set_bit(DMZ_DATA, &dzone->flags); dzone->chunk = DMZ_MAP_UNMAPPED; - if (dmz_is_rnd(dzone)) { + if (dmz_is_cache(dzone)) { + list_add_tail(&dzone->link, &zmd->unmap_cache_list); + atomic_inc(&zmd->unmap_nr_cache); + } else if (dmz_is_rnd(dzone)) { list_add_tail(&dzone->link, &zmd->unmap_rnd_list); atomic_inc(&zmd->unmap_nr_rnd); } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) { @@ -1751,6 +1779,9 @@ static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) if (dmz_is_seq(zone)) { /* LRU rotate sequential zone */ list_add_tail(&zone->link, &zmd->map_seq_list); + } else if (dmz_is_cache(zone)) { + /* LRU rotate cache zone */ + list_add_tail(&zone->link, &zmd->map_cache_list); } else { /* 
LRU rotate random zone */ list_add_tail(&zone->link, &zmd->map_rnd_list); @@ -1826,17 +1857,19 @@ static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) } /* - * Select a random write zone for reclaim. + * Select a cache or random write zone for reclaim. */ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) { struct dm_zone *dzone = NULL; struct dm_zone *zone; + struct list_head *zone_list = &zmd->map_rnd_list; - if (list_empty(&zmd->map_rnd_list)) - return ERR_PTR(-EBUSY); + /* If we have cache zones select from the cache zone list */ + if (zmd->nr_cache) + zone_list = &zmd->map_cache_list; - list_for_each_entry(zone, &zmd->map_rnd_list, link) { + list_for_each_entry(zone, zone_list, link) { if (dmz_is_buf(zone)) dzone = zone->bzone; else @@ -1855,9 +1888,6 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) { struct dm_zone *zone; - if (list_empty(&zmd->map_seq_list)) - return ERR_PTR(-EBUSY); - list_for_each_entry(zone, &zmd->map_seq_list, link) { if (!zone->bzone) continue; @@ -1907,6 +1937,7 @@ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chu unsigned int dzone_id; struct dm_zone *dzone = NULL; int ret = 0; + int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND; dmz_lock_map(zmd); again: @@ -1921,7 +1952,7 @@ again: goto out; /* Allocate a random zone */ - dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); + dzone = dmz_alloc_zone(zmd, alloc_flags); if (!dzone) { if (dmz_dev_is_dying(zmd)) { dzone = ERR_PTR(-EIO); @@ -2014,6 +2045,7 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd, struct dm_zone *dzone) { struct dm_zone *bzone; + int alloc_flags = zmd->nr_cache ? 
+ DMZ_ALLOC_CACHE : DMZ_ALLOC_RND; dmz_lock_map(zmd); again: @@ -2022,7 +2054,7 @@ again: goto out; /* Allocate a random zone */ - bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); + bzone = dmz_alloc_zone(zmd, alloc_flags); if (!bzone) { if (dmz_dev_is_dying(zmd)) { bzone = ERR_PTR(-EIO); @@ -2039,7 +2071,10 @@ again: bzone->chunk = dzone->chunk; bzone->bzone = dzone; dzone->bzone = bzone; - list_add_tail(&bzone->link, &zmd->map_rnd_list); + if (dmz_is_cache(bzone)) + list_add_tail(&bzone->link, &zmd->map_cache_list); + else + list_add_tail(&bzone->link, &zmd->map_rnd_list); out: dmz_unlock_map(zmd); @@ -2055,31 +2090,46 @@ struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags) struct list_head *list; struct dm_zone *zone; - if (flags & DMZ_ALLOC_RND) + if (flags & DMZ_ALLOC_CACHE) + list = &zmd->unmap_cache_list; + else if (flags & DMZ_ALLOC_RND) list = &zmd->unmap_rnd_list; else list = &zmd->unmap_seq_list; + again: if (list_empty(list)) { /* - * No free zone: if this is for reclaim, allow using the - * reserved sequential zones. + * No free zone: return NULL if this is not for reclaim. 
*/ - if (!(flags & DMZ_ALLOC_RECLAIM) || - list_empty(&zmd->reserved_seq_zones_list)) + if (!(flags & DMZ_ALLOC_RECLAIM)) return NULL; - - zone = list_first_entry(&zmd->reserved_seq_zones_list, - struct dm_zone, link); - list_del_init(&zone->link); - atomic_dec(&zmd->nr_reserved_seq_zones); + /* + * Use sequential write zones if we started off with random + * zones and the list is empty + */ + if (list == &zmd->unmap_rnd_list) { + list = &zmd->unmap_seq_list; + goto again; + } + /* + * Fallback to the reserved sequential zones + */ + zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list, + struct dm_zone, link); + if (zone) { + list_del_init(&zone->link); + atomic_dec(&zmd->nr_reserved_seq_zones); + } return zone; } zone = list_first_entry(list, struct dm_zone, link); list_del_init(&zone->link); - if (dmz_is_rnd(zone)) + if (dmz_is_cache(zone)) + atomic_dec(&zmd->unmap_nr_cache); + else if (dmz_is_rnd(zone)) atomic_dec(&zmd->unmap_nr_rnd); else atomic_dec(&zmd->unmap_nr_seq); @@ -2110,7 +2160,10 @@ void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone) dmz_reset_zone(zmd, zone); /* Return the zone to its type unmap list */ - if (dmz_is_rnd(zone)) { + if (dmz_is_cache(zone)) { + list_add_tail(&zone->link, &zmd->unmap_cache_list); + atomic_inc(&zmd->unmap_nr_cache); + } else if (dmz_is_rnd(zone)) { list_add_tail(&zone->link, &zmd->unmap_rnd_list); atomic_inc(&zmd->unmap_nr_rnd); } else if (atomic_read(&zmd->nr_reserved_seq_zones) < @@ -2136,7 +2189,9 @@ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone, dmz_set_chunk_mapping(zmd, chunk, dzone->id, DMZ_MAP_UNMAPPED); dzone->chunk = chunk; - if (dmz_is_rnd(dzone)) + if (dmz_is_cache(dzone)) + list_add_tail(&dzone->link, &zmd->map_cache_list); + else if (dmz_is_rnd(dzone)) list_add_tail(&dzone->link, &zmd->map_rnd_list); else list_add_tail(&dzone->link, &zmd->map_seq_list); @@ -2711,6 +2766,10 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, 
INIT_LIST_HEAD(&zmd->unmap_rnd_list); INIT_LIST_HEAD(&zmd->map_rnd_list); + atomic_set(&zmd->unmap_nr_cache, 0); + INIT_LIST_HEAD(&zmd->unmap_cache_list); + INIT_LIST_HEAD(&zmd->map_cache_list); + atomic_set(&zmd->unmap_nr_seq, 0); INIT_LIST_HEAD(&zmd->unmap_seq_list); INIT_LIST_HEAD(&zmd->map_seq_list); @@ -2733,7 +2792,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, /* Set metadata zones starting from sb_zone */ for (i = 0; i < zmd->nr_meta_zones << 1; i++) { zone = dmz_get(zmd, zmd->sb[0].zone->id + i); - if (!dmz_is_rnd(zone)) { + if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) { dmz_zmd_err(zmd, "metadata zone %d is not random", i); ret = -ENXIO; @@ -2785,6 +2844,8 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, zmd->nr_meta_zones * 2); dmz_zmd_debug(zmd, " %u data zones for %u chunks", zmd->nr_data_zones, zmd->nr_chunks); + dmz_zmd_debug(zmd, " %u cache zones (%u unmapped)", + zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache)); dmz_zmd_debug(zmd, " %u random zones (%u unmapped)", zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)", diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 201177ad1f17..d566dedcd8b8 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -43,13 +43,13 @@ enum { * Percentage of unmapped (free) random zones below which reclaim starts * even if the target is busy. */ -#define DMZ_RECLAIM_LOW_UNMAP_RND 30 +#define DMZ_RECLAIM_LOW_UNMAP_ZONES 30 /* * Percentage of unmapped (free) random zones above which reclaim will * stop if the target is busy. */ -#define DMZ_RECLAIM_HIGH_UNMAP_RND 50 +#define DMZ_RECLAIM_HIGH_UNMAP_ZONES 50 /* * Align a sequential zone write pointer to chunk_block. @@ -281,17 +281,21 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dm_zone *szone = NULL; struct dmz_metadata *zmd = zrc->metadata; int ret; + int alloc_flags = dmz_nr_cache_zones(zmd) ? 
+ DMZ_ALLOC_RND : DMZ_ALLOC_SEQ; /* Get a free sequential zone */ dmz_lock_map(zmd); - szone = dmz_alloc_zone(zmd, DMZ_ALLOC_RECLAIM); + szone = dmz_alloc_zone(zmd, alloc_flags | DMZ_ALLOC_RECLAIM); dmz_unlock_map(zmd); if (!szone) return -ENOSPC; - DMDEBUG("(%s): Chunk %u, move rnd zone %u (weight %u) to seq zone %u", - dmz_metadata_label(zmd), - chunk, dzone->id, dmz_weight(dzone), szone->id); + DMDEBUG("(%s): Chunk %u, move %s zone %u (weight %u) to %s zone %u", + dmz_metadata_label(zmd), chunk, + dmz_is_cache(dzone) ? "cache" : "rnd", + dzone->id, dmz_weight(dzone), + dmz_is_rnd(szone) ? "rnd" : "seq", szone->id); /* Flush the random data zone into the sequential zone */ ret = dmz_reclaim_copy(zrc, dzone, szone); @@ -356,7 +360,7 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) return -EBUSY; start = jiffies; - if (dmz_is_rnd(dzone)) { + if (dmz_is_cache(dzone) || dmz_is_rnd(dzone)) { if (!dmz_weight(dzone)) { /* Empty zone */ dmz_reclaim_empty(zrc, dzone); @@ -422,29 +426,41 @@ static inline int dmz_target_idle(struct dmz_reclaim *zrc) return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD); } +static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) +{ + struct dmz_metadata *zmd = zrc->metadata; + unsigned int nr_cache = dmz_nr_cache_zones(zmd); + unsigned int nr_rnd = dmz_nr_rnd_zones(zmd); + unsigned int nr_unmap, nr_zones; + + if (nr_cache) { + nr_zones = nr_cache; + nr_unmap = dmz_nr_unmap_cache_zones(zmd); + } else { + nr_zones = nr_rnd; + nr_unmap = dmz_nr_unmap_rnd_zones(zmd); + } + return nr_unmap * 100 / nr_zones; +} + /* * Test if reclaim is necessary. 
*/ -static bool dmz_should_reclaim(struct dmz_reclaim *zrc) +static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap) { - struct dmz_metadata *zmd = zrc->metadata; - unsigned int nr_rnd = dmz_nr_rnd_zones(zmd); - unsigned int nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd); - unsigned int p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd; - /* Reclaim when idle */ - if (dmz_target_idle(zrc) && nr_unmap_rnd < nr_rnd) + if (dmz_target_idle(zrc) && p_unmap < 100) return true; - /* If there are still plenty of random zones, do not reclaim */ - if (p_unmap_rnd >= DMZ_RECLAIM_HIGH_UNMAP_RND) + /* If there are still plenty of cache zones, do not reclaim */ + if (p_unmap >= DMZ_RECLAIM_HIGH_UNMAP_ZONES) return false; /* - * If the percentage of unmapped random zones is low, + * If the percentage of unmapped cache zones is low, * reclaim even if the target is busy. */ - return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND; + return p_unmap <= DMZ_RECLAIM_LOW_UNMAP_ZONES; } /* @@ -454,14 +470,14 @@ static void dmz_reclaim_work(struct work_struct *work) { struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work); struct dmz_metadata *zmd = zrc->metadata; - unsigned int nr_rnd, nr_unmap_rnd; - unsigned int p_unmap_rnd; + unsigned int p_unmap; int ret; if (dmz_dev_is_dying(zmd)) return; - if (!dmz_should_reclaim(zrc)) { + p_unmap = dmz_reclaim_percentage(zrc); + if (!dmz_should_reclaim(zrc, p_unmap)) { mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD); return; } @@ -472,22 +488,22 @@ static void dmz_reclaim_work(struct work_struct *work) * and slower if there are still some free random zones to avoid * as much as possible to negatively impact the user workload. 
*/ - nr_rnd = dmz_nr_rnd_zones(zmd); - nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd); - p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd; - if (dmz_target_idle(zrc) || p_unmap_rnd < DMZ_RECLAIM_LOW_UNMAP_RND / 2) { + if (dmz_target_idle(zrc) || p_unmap < DMZ_RECLAIM_LOW_UNMAP_ZONES / 2) { /* Idle or very low percentage: go fast */ zrc->kc_throttle.throttle = 100; } else { /* Busy but we still have some random zone: throttle */ - zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_rnd / 2); + zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2); } - DMDEBUG("(%s): Reclaim (%u): %s, %u%% free rnd zones (%u/%u)", + DMDEBUG("(%s): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", dmz_metadata_label(zmd), zrc->kc_throttle.throttle, (dmz_target_idle(zrc) ? "Idle" : "Busy"), - p_unmap_rnd, nr_unmap_rnd, nr_rnd); + p_unmap, dmz_nr_unmap_cache_zones(zmd), + dmz_nr_cache_zones(zmd), + dmz_nr_unmap_rnd_zones(zmd), + dmz_nr_rnd_zones(zmd)); ret = dmz_do_reclaim(zrc); if (ret) { @@ -585,7 +601,9 @@ void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc) */ void dmz_schedule_reclaim(struct dmz_reclaim *zrc) { - if (dmz_should_reclaim(zrc)) + unsigned int p_unmap = dmz_reclaim_percentage(zrc); + + if (dmz_should_reclaim(zrc, p_unmap)) mod_delayed_work(zrc->wq, &zrc->work, 0); } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index b586fc67d931..2770e293a97b 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -190,7 +190,8 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks", dmz_metadata_label(zmd), (unsigned long long)dmz_bio_chunk(zmd, bio), - (dmz_is_rnd(zone) ? "RND" : "SEQ"), + (dmz_is_rnd(zone) ? "RND" : + (dmz_is_cache(zone) ? 
"CACHE" : "SEQ")), zone->id, (unsigned long long)chunk_block, nr_blocks); @@ -198,7 +199,8 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, bzone = zone->bzone; while (chunk_block < end_block) { nr_blocks = 0; - if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) { + if (dmz_is_rnd(zone) || dmz_is_cache(zone) || + chunk_block < zone->wp_block) { /* Test block validity in the data zone */ ret = dmz_block_valid(zmd, zone, chunk_block); if (ret < 0) @@ -331,11 +333,13 @@ static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone, DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks", dmz_metadata_label(zmd), (unsigned long long)dmz_bio_chunk(zmd, bio), - (dmz_is_rnd(zone) ? "RND" : "SEQ"), + (dmz_is_rnd(zone) ? "RND" : + (dmz_is_cache(zone) ? "CACHE" : "SEQ")), zone->id, (unsigned long long)chunk_block, nr_blocks); - if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) { + if (dmz_is_rnd(zone) || dmz_is_cache(zone) || + chunk_block == zone->wp_block) { /* * zone is a random zone or it is a sequential zone * and the BIO is aligned to the zone write pointer: @@ -381,7 +385,8 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, * Invalidate blocks in the data zone and its * buffer zone if one is mapped. 
*/ - if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) + if (dmz_is_rnd(zone) || dmz_is_cache(zone) || + chunk_block < zone->wp_block) ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks); if (ret == 0 && zone->bzone) ret = dmz_invalidate_blocks(zmd, zone->bzone, @@ -1065,8 +1070,10 @@ static void dmz_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - DMEMIT("%u zones %u/%u random %u/%u sequential", + DMEMIT("%u zones %u/%u cache %u/%u random %u/%u sequential", dmz_nr_zones(dmz->metadata), + dmz_nr_unmap_cache_zones(dmz->metadata), + dmz_nr_cache_zones(dmz->metadata), dmz_nr_unmap_rnd_zones(dmz->metadata), dmz_nr_rnd_zones(dmz->metadata), dmz_nr_unmap_seq_zones(dmz->metadata), diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 4971a765be55..29e01a853f84 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -111,6 +111,7 @@ struct dm_zone { */ enum { /* Zone write type */ + DMZ_CACHE, DMZ_RND, DMZ_SEQ, @@ -131,6 +132,7 @@ enum { /* * Zone data accessors. 
*/ +#define dmz_is_cache(z) test_bit(DMZ_CACHE, &(z)->flags) #define dmz_is_rnd(z) test_bit(DMZ_RND, &(z)->flags) #define dmz_is_seq(z) test_bit(DMZ_SEQ, &(z)->flags) #define dmz_is_empty(z) ((z)->wp_block == 0) @@ -189,7 +191,9 @@ bool dmz_check_dev(struct dmz_metadata *zmd); bool dmz_dev_is_dying(struct dmz_metadata *zmd); #define DMZ_ALLOC_RND 0x01 -#define DMZ_ALLOC_RECLAIM 0x02 +#define DMZ_ALLOC_CACHE 0x02 +#define DMZ_ALLOC_SEQ 0x04 +#define DMZ_ALLOC_RECLAIM 0x10 struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags); void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone); @@ -198,6 +202,8 @@ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone, unsigned int chunk); void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd); From 90a9b8693f1b84a695864f2b416cba9bde107268 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 19 May 2020 10:14:21 +0200 Subject: [PATCH 164/427] dm zoned: reclaim random zones when idle When the system is idle we should be starting reclaiming random zones, too. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 13 +++++++++---- drivers/md/dm-zoned-reclaim.c | 30 +++++++++++++++++++----------- drivers/md/dm-zoned.h | 2 +- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 6c009a8b36a4..b5fd67eff046 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1859,15 +1859,20 @@ static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) /* * Select a cache or random write zone for reclaim. */ -static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) +static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, + bool idle) { struct dm_zone *dzone = NULL; struct dm_zone *zone; struct list_head *zone_list = &zmd->map_rnd_list; /* If we have cache zones select from the cache zone list */ - if (zmd->nr_cache) + if (zmd->nr_cache) { zone_list = &zmd->map_cache_list; + /* Try to relaim random zones, too, when idle */ + if (idle && list_empty(zone_list)) + zone_list = &zmd->map_rnd_list; + } list_for_each_entry(zone, zone_list, link) { if (dmz_is_buf(zone)) @@ -1901,7 +1906,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) /* * Select a zone for reclaim. 
*/ -struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd) +struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle) { struct dm_zone *zone; @@ -1917,7 +1922,7 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd) if (list_empty(&zmd->reserved_seq_zones_list)) zone = dmz_get_seq_zone_for_reclaim(zmd); else - zone = dmz_get_rnd_zone_for_reclaim(zmd); + zone = dmz_get_rnd_zone_for_reclaim(zmd, idle); dmz_unlock_map(zmd); return zone; diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index d566dedcd8b8..bd62245d4556 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -284,7 +284,10 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) int alloc_flags = dmz_nr_cache_zones(zmd) ? DMZ_ALLOC_RND : DMZ_ALLOC_SEQ; - /* Get a free sequential zone */ + /* Always use sequential zones to reclaim random zones */ + if (dmz_is_rnd(dzone)) + alloc_flags = DMZ_ALLOC_SEQ; + /* Get a free random or sequential zone */ dmz_lock_map(zmd); szone = dmz_alloc_zone(zmd, alloc_flags | DMZ_ALLOC_RECLAIM); dmz_unlock_map(zmd); @@ -343,6 +346,14 @@ static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_unlock_flush(zmd); } +/* + * Test if the target device is idle. + */ +static inline int dmz_target_idle(struct dmz_reclaim *zrc) +{ + return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD); +} + /* * Find a candidate zone for reclaim and process it. */ @@ -355,7 +366,7 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) int ret; /* Get a data zone */ - dzone = dmz_get_zone_for_reclaim(zmd); + dzone = dmz_get_zone_for_reclaim(zmd, dmz_target_idle(zrc)); if (!dzone) return -EBUSY; @@ -418,14 +429,6 @@ out: return 0; } -/* - * Test if the target device is idle. 
- */ -static inline int dmz_target_idle(struct dmz_reclaim *zrc) -{ - return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD); -} - static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) { struct dmz_metadata *zmd = zrc->metadata; @@ -448,8 +451,13 @@ static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) */ static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap) { + unsigned int nr_reclaim = dmz_nr_rnd_zones(zrc->metadata); + + if (dmz_nr_cache_zones(zrc->metadata)) + nr_reclaim += dmz_nr_cache_zones(zrc->metadata); + /* Reclaim when idle */ - if (dmz_target_idle(zrc) && p_unmap < 100) + if (dmz_target_idle(zrc) && nr_reclaim) return true; /* If there are still plenty of cache zones, do not reclaim */ diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 29e01a853f84..288054dd7cf4 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -240,7 +240,7 @@ static inline bool dmz_is_active(struct dm_zone *zone) int dmz_lock_zone_reclaim(struct dm_zone *zone); void dmz_unlock_zone_reclaim(struct dm_zone *zone); -struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd); +struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle); struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op); From c5c7885952927384837a070793698d996cb0fbf3 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 19 May 2020 10:14:22 +0200 Subject: [PATCH 165/427] dm zoned: start reclaim with sequential zones Sequential zones perform better for reclaim, so start off using them and only use random zones as a fallback when cache zones are present. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-reclaim.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index bd62245d4556..d62f6890b92c 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -281,15 +281,16 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dm_zone *szone = NULL; struct dmz_metadata *zmd = zrc->metadata; int ret; - int alloc_flags = dmz_nr_cache_zones(zmd) ? - DMZ_ALLOC_RND : DMZ_ALLOC_SEQ; + int alloc_flags = DMZ_ALLOC_SEQ; - /* Always use sequential zones to reclaim random zones */ - if (dmz_is_rnd(dzone)) - alloc_flags = DMZ_ALLOC_SEQ; /* Get a free random or sequential zone */ dmz_lock_map(zmd); +again: szone = dmz_alloc_zone(zmd, alloc_flags | DMZ_ALLOC_RECLAIM); + if (!szone && alloc_flags == DMZ_ALLOC_SEQ && dmz_nr_cache_zones(zmd)) { + alloc_flags = DMZ_ALLOC_RND; + goto again; + } dmz_unlock_map(zmd); if (!szone) return -ENOSPC; From a16b7dee302d2040d9e1fedff2161d1aceda0e8c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 19 May 2020 10:14:23 +0200 Subject: [PATCH 166/427] dm zoned: terminate reclaim on congestion When dmz_get_chunk_mapping() selects a zone which is under reclaim we should terminate the reclaim copy process. Since we're changing the zone itself, reclaim needs to run afterwards again anyway. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 2 ++ drivers/md/dm-zoned-reclaim.c | 6 ++++-- drivers/md/dm-zoned.h | 3 +++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index b5fd67eff046..db0dc2b5d44d 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1851,7 +1851,9 @@ static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) { dmz_unlock_map(zmd); dmz_unlock_metadata(zmd); + set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags); wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ); + clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags); dmz_lock_metadata(zmd); dmz_lock_map(zmd); } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index d62f6890b92c..571bc1d41bab 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -143,6 +143,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, if (dst_dev->flags & DMZ_BDEV_DYING) return -EIO; + if (dmz_reclaim_should_terminate(src_zone)) + return -EINTR; + /* Get a valid region from the source zone */ ret = dmz_first_valid_block(zmd, src_zone, &block); if (ret <= 0) @@ -515,7 +518,7 @@ static void dmz_reclaim_work(struct work_struct *work) dmz_nr_rnd_zones(zmd)); ret = dmz_do_reclaim(zrc); - if (ret) { + if (ret && ret != -EINTR) { DMDEBUG("(%s): Reclaim error %d", dmz_metadata_label(zmd), ret); if (!dmz_check_dev(zmd)) @@ -615,4 +618,3 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc) if (dmz_should_reclaim(zrc, p_unmap)) mod_delayed_work(zrc->wq, &zrc->work, 0); } - diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 288054dd7cf4..8083607b9535 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -127,6 +127,7 @@ enum { /* Zone internal state */ DMZ_RECLAIM, DMZ_SEQ_WRITE_ERR, + DMZ_RECLAIM_TERMINATE, }; /* @@ -140,6 +141,8 
@@ enum { #define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags) #define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags) #define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags) +#define dmz_reclaim_should_terminate(z) \ + test_bit(DMZ_RECLAIM_TERMINATE, &(z)->flags) #define dmz_is_meta(z) test_bit(DMZ_META, &(z)->flags) #define dmz_is_buf(z) test_bit(DMZ_BUF, &(z)->flags) From 5dda3ba6fc9c5d784b48687a3f3003023a0d7c74 Mon Sep 17 00:00:00 2001 From: Jay Fang Date: Sat, 16 May 2020 15:00:14 +0800 Subject: [PATCH 167/427] PCI/PME: Fix kernel-doc of pcie_pme_resume() and pcie_pme_remove() Fix kernel-doc of the "srv" parameter to pcie_pme_resume() and pcie_pme_remove(). Building with W=1 produced these warnings: drivers/pci/pcie/pme.c:414: warning: Function parameter or member 'srv' not described in 'pcie_pme_resume' drivers/pci/pcie/pme.c:437: warning: Function parameter or member 'srv' not described in 'pcie_pme_remove' Link: https://lore.kernel.org/r/1589612414-61682-1-git-send-email-f.fangjian@huawei.com Signed-off-by: Jay Fang Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/pme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index f38e6c19dd50..6a32970bb731 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -408,7 +408,7 @@ static int pcie_pme_suspend(struct pcie_device *srv) /** * pcie_pme_resume - Resume PCIe PME service device. - * @srv - PCIe service device to resume. + * @srv: PCIe service device to resume. */ static int pcie_pme_resume(struct pcie_device *srv) { @@ -431,7 +431,7 @@ static int pcie_pme_resume(struct pcie_device *srv) /** * pcie_pme_remove - Prepare PCIe PME service device for removal. - * @srv - PCIe service device to remove. + * @srv: PCIe service device to remove. 
*/ static void pcie_pme_remove(struct pcie_device *srv) { From b8af85492fbf1acfb63b1f83e4faafbaa11c73eb Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Thu, 21 May 2020 20:04:39 +0000 Subject: [PATCH 168/427] PCI/switchtec: Correct bool variable type assignment Use "true" instead of 1 to initialize "bool use_dma_mrpc". This resolves the following Coccinelle warning: drivers/pci/switch/switchtec.c:28:12-24: WARNING: Assignment of 0/1 to bool variable Link: https://lore.kernel.org/r/20200521200439.1076672-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas Reviewed-by: Logan Gunthorpe --- drivers/pci/switch/switchtec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index e69cac84b605..850cfeb74608 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -25,7 +25,7 @@ static int max_devices = 16; module_param(max_devices, int, 0644); MODULE_PARM_DESC(max_devices, "max number of switchtec device instances"); -static bool use_dma_mrpc = 1; +static bool use_dma_mrpc = true; module_param(use_dma_mrpc, bool, 0644); MODULE_PARM_DESC(use_dma_mrpc, "Enable the use of the DMA MRPC feature"); From cfbd83d02da73d984bee314ed3b96bdd3bbe7115 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Thu, 21 May 2020 19:04:57 +0000 Subject: [PATCH 169/427] PCI: shpchp: Make shpchp_unconfigure_device() void shpchp_unconfigure_device() always returned 0, so there's no reason for a return value. In addition, remove_board() checked the return value for possible error which is unnecessary. Convert shpchp_unconfigure_device() to a void function and remove the return value check. This addresses the following Coccinelle warning: drivers/pci/hotplug/shpchp_pci.c:66:5-7: Unneeded variable: "rc". 
Return "0" on line 86 Link: https://lore.kernel.org/r/20200521190457.1066600-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/hotplug/shpchp.h | 2 +- drivers/pci/hotplug/shpchp_ctrl.c | 3 +-- drivers/pci/hotplug/shpchp_pci.c | 5 +---- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/pci/hotplug/shpchp.h b/drivers/pci/hotplug/shpchp.h index f7f13ee5d06e..6e85885b554c 100644 --- a/drivers/pci/hotplug/shpchp.h +++ b/drivers/pci/hotplug/shpchp.h @@ -164,7 +164,7 @@ u8 shpchp_handle_switch_change(u8 hp_slot, struct controller *ctrl); u8 shpchp_handle_presence_change(u8 hp_slot, struct controller *ctrl); u8 shpchp_handle_power_fault(u8 hp_slot, struct controller *ctrl); int shpchp_configure_device(struct slot *p_slot); -int shpchp_unconfigure_device(struct slot *p_slot); +void shpchp_unconfigure_device(struct slot *p_slot); void cleanup_slots(struct controller *ctrl); void shpchp_queue_pushbutton_work(struct work_struct *work); int shpc_init(struct controller *ctrl, struct pci_dev *pdev); diff --git a/drivers/pci/hotplug/shpchp_ctrl.c b/drivers/pci/hotplug/shpchp_ctrl.c index 078003dcde5b..afdc52d1cae7 100644 --- a/drivers/pci/hotplug/shpchp_ctrl.c +++ b/drivers/pci/hotplug/shpchp_ctrl.c @@ -341,8 +341,7 @@ static int remove_board(struct slot *p_slot) u8 hp_slot; int rc; - if (shpchp_unconfigure_device(p_slot)) - return(1); + shpchp_unconfigure_device(p_slot); hp_slot = p_slot->device - ctrl->slot_device_offset; p_slot = shpchp_find_slot(ctrl, hp_slot + ctrl->slot_device_offset); diff --git a/drivers/pci/hotplug/shpchp_pci.c b/drivers/pci/hotplug/shpchp_pci.c index 115701301487..36db0c3c4ea6 100644 --- a/drivers/pci/hotplug/shpchp_pci.c +++ b/drivers/pci/hotplug/shpchp_pci.c @@ -61,9 +61,8 @@ int shpchp_configure_device(struct slot *p_slot) return ret; } -int shpchp_unconfigure_device(struct slot *p_slot) +void shpchp_unconfigure_device(struct slot *p_slot) { - int rc = 0; struct pci_bus *parent = 
p_slot->ctrl->pci_dev->subordinate; struct pci_dev *dev, *temp; struct controller *ctrl = p_slot->ctrl; @@ -83,6 +82,4 @@ int shpchp_unconfigure_device(struct slot *p_slot) } pci_unlock_rescan_remove(); - return rc; } - From 7b38fd9760f51cc83d80eed2cfbde8b5ead9e93a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 21 May 2020 15:40:07 -0500 Subject: [PATCH 170/427] PCI/PTM: Inherit Switch Downstream Port PTM settings from Upstream Port Except for Endpoints, we enable PTM at enumeration-time. Previously we did not account for the fact that Switch Downstream Ports are not permitted to have a PTM capability; their PTM behavior is controlled by the Upstream Port (PCIe r5.0, sec 7.9.16). Since Downstream Ports don't have a PTM capability, we did not mark them as "ptm_enabled", which meant that pci_enable_ptm() on an Endpoint failed because there was no PTM path to it. Mark Downstream Ports as "ptm_enabled" if their Upstream Port has PTM enabled. Fixes: eec097d43100 ("PCI: Add pci_enable_ptm() for drivers to enable PTM on endpoints") Reported-by: Aditya Paluri Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/ptm.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 9361f3aa26ab..357a454cafa0 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -39,10 +39,6 @@ void pci_ptm_init(struct pci_dev *dev) if (!pci_is_pcie(dev)) return; - pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); - if (!pos) - return; - /* * Enable PTM only on interior devices (root ports, switch ports, * etc.) on the assumption that it causes no link traffic until an @@ -52,6 +48,23 @@ void pci_ptm_init(struct pci_dev *dev) pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END)) return; + /* + * Switch Downstream Ports are not permitted to have a PTM + * capability; their PTM behavior is controlled by the Upstream + * Port (PCIe r5.0, sec 7.9.16). 
+ */ + ups = pci_upstream_bridge(dev); + if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM && + ups && ups->ptm_enabled) { + dev->ptm_granularity = ups->ptm_granularity; + dev->ptm_enabled = 1; + return; + } + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); + if (!pos) + return; + pci_read_config_dword(dev, pos + PCI_PTM_CAP, &cap); local_clock = (cap & PCI_PTM_GRANULARITY_MASK) >> 8; @@ -61,7 +74,6 @@ void pci_ptm_init(struct pci_dev *dev) * the spec recommendation (PCIe r3.1, sec 7.32.3), select the * furthest upstream Time Source as the PTM Root. */ - ups = pci_upstream_bridge(dev); if (ups && ups->ptm_enabled) { ctrl = PCI_PTM_CTRL_ENABLE; if (ups->ptm_granularity == 0) From 6e0688dbff625f1e49e3ddb028720ae9fd606f0b Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Wed, 20 May 2020 18:34:10 +0000 Subject: [PATCH 171/427] PCI: Use bridge window names (PCI_BRIDGE_IO_WINDOW etc) Use bridge resource definitions instead of using the PCI_BRIDGE_RESOURCES constant with an integer offset. 
Link: https://lore.kernel.org/r/20200520183411.1534621-2-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/setup-bus.c | 114 ++++++++++++++++++---------------- drivers/pcmcia/yenta_socket.c | 16 +++-- include/linux/pci.h | 14 ++++- 3 files changed, 84 insertions(+), 60 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index bbcef1a053ab..07cd7a3817dc 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -583,7 +583,7 @@ static void pci_setup_bridge_io(struct pci_dev *bridge) io_mask = PCI_IO_1K_RANGE_MASK; /* Set up the top and bottom of the PCI I/O segment for this bus */ - res = &bridge->resource[PCI_BRIDGE_RESOURCES + 0]; + res = &bridge->resource[PCI_BRIDGE_IO_WINDOW]; pcibios_resource_to_bus(bridge->bus, ®ion, res); if (res->flags & IORESOURCE_IO) { pci_read_config_word(bridge, PCI_IO_BASE, &l); @@ -613,7 +613,7 @@ static void pci_setup_bridge_mmio(struct pci_dev *bridge) u32 l; /* Set up the top and bottom of the PCI Memory segment for this bus */ - res = &bridge->resource[PCI_BRIDGE_RESOURCES + 1]; + res = &bridge->resource[PCI_BRIDGE_MEM_WINDOW]; pcibios_resource_to_bus(bridge->bus, ®ion, res); if (res->flags & IORESOURCE_MEM) { l = (region.start >> 16) & 0xfff0; @@ -640,7 +640,7 @@ static void pci_setup_bridge_mmio_pref(struct pci_dev *bridge) /* Set up PREF base/limit */ bu = lu = 0; - res = &bridge->resource[PCI_BRIDGE_RESOURCES + 2]; + res = &bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; pcibios_resource_to_bus(bridge->bus, ®ion, res); if (res->flags & IORESOURCE_PREFETCH) { l = (region.start >> 16) & 0xfff0; @@ -707,14 +707,14 @@ int pci_claim_bridge_resource(struct pci_dev *bridge, int i) if (!pci_bus_clip_resource(bridge, i)) return -EINVAL; /* Clipping didn't change anything */ - switch (i - PCI_BRIDGE_RESOURCES) { - case 0: + switch (i) { + case PCI_BRIDGE_IO_WINDOW: pci_setup_bridge_io(bridge); break; - case 1: + case PCI_BRIDGE_MEM_WINDOW: 
pci_setup_bridge_mmio(bridge); break; - case 2: + case PCI_BRIDGE_PREF_MEM_WINDOW: pci_setup_bridge_mmio_pref(bridge); break; default: @@ -735,18 +735,22 @@ int pci_claim_bridge_resource(struct pci_dev *bridge, int i) static void pci_bridge_check_ranges(struct pci_bus *bus) { struct pci_dev *bridge = bus->self; - struct resource *b_res = &bridge->resource[PCI_BRIDGE_RESOURCES]; + struct resource *b_res; - b_res[1].flags |= IORESOURCE_MEM; + b_res = &bridge->resource[PCI_BRIDGE_MEM_WINDOW]; + b_res->flags |= IORESOURCE_MEM; - if (bridge->io_window) - b_res[0].flags |= IORESOURCE_IO; + if (bridge->io_window) { + b_res = &bridge->resource[PCI_BRIDGE_IO_WINDOW]; + b_res->flags |= IORESOURCE_IO; + } if (bridge->pref_window) { - b_res[2].flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH; + b_res = &bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; + b_res->flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH; if (bridge->pref_64_window) { - b_res[2].flags |= IORESOURCE_MEM_64; - b_res[2].flags |= PCI_PREF_RANGE_TYPE_64; + b_res->flags |= IORESOURCE_MEM_64 | + PCI_PREF_RANGE_TYPE_64; } } } @@ -1105,35 +1109,37 @@ static void pci_bus_size_cardbus(struct pci_bus *bus, struct list_head *realloc_head) { struct pci_dev *bridge = bus->self; - struct resource *b_res = &bridge->resource[PCI_BRIDGE_RESOURCES]; + struct resource *b_res; resource_size_t b_res_3_size = pci_cardbus_mem_size * 2; u16 ctrl; - if (b_res[0].parent) + b_res = &bridge->resource[PCI_CB_BRIDGE_IO_0_WINDOW]; + if (b_res->parent) goto handle_b_res_1; /* * Reserve some resources for CardBus. We reserve a fixed amount * of bus space for CardBus bridges. 
*/ - b_res[0].start = pci_cardbus_io_size; - b_res[0].end = b_res[0].start + pci_cardbus_io_size - 1; - b_res[0].flags |= IORESOURCE_IO | IORESOURCE_STARTALIGN; + b_res->start = pci_cardbus_io_size; + b_res->end = b_res->start + pci_cardbus_io_size - 1; + b_res->flags |= IORESOURCE_IO | IORESOURCE_STARTALIGN; if (realloc_head) { - b_res[0].end -= pci_cardbus_io_size; + b_res->end -= pci_cardbus_io_size; add_to_list(realloc_head, bridge, b_res, pci_cardbus_io_size, - pci_cardbus_io_size); + pci_cardbus_io_size); } handle_b_res_1: - if (b_res[1].parent) + b_res = &bridge->resource[PCI_CB_BRIDGE_IO_1_WINDOW]; + if (b_res->parent) goto handle_b_res_2; - b_res[1].start = pci_cardbus_io_size; - b_res[1].end = b_res[1].start + pci_cardbus_io_size - 1; - b_res[1].flags |= IORESOURCE_IO | IORESOURCE_STARTALIGN; + b_res->start = pci_cardbus_io_size; + b_res->end = b_res->start + pci_cardbus_io_size - 1; + b_res->flags |= IORESOURCE_IO | IORESOURCE_STARTALIGN; if (realloc_head) { - b_res[1].end -= pci_cardbus_io_size; - add_to_list(realloc_head, bridge, b_res+1, pci_cardbus_io_size, - pci_cardbus_io_size); + b_res->end -= pci_cardbus_io_size; + add_to_list(realloc_head, bridge, b_res, pci_cardbus_io_size, + pci_cardbus_io_size); } handle_b_res_2: @@ -1153,21 +1159,22 @@ handle_b_res_2: pci_read_config_word(bridge, PCI_CB_BRIDGE_CONTROL, &ctrl); } - if (b_res[2].parent) + b_res = &bridge->resource[PCI_CB_BRIDGE_MEM_0_WINDOW]; + if (b_res->parent) goto handle_b_res_3; /* * If we have prefetchable memory support, allocate two regions. * Otherwise, allocate one region of twice the size. 
*/ if (ctrl & PCI_CB_BRIDGE_CTL_PREFETCH_MEM0) { - b_res[2].start = pci_cardbus_mem_size; - b_res[2].end = b_res[2].start + pci_cardbus_mem_size - 1; - b_res[2].flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH | - IORESOURCE_STARTALIGN; + b_res->start = pci_cardbus_mem_size; + b_res->end = b_res->start + pci_cardbus_mem_size - 1; + b_res->flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH | + IORESOURCE_STARTALIGN; if (realloc_head) { - b_res[2].end -= pci_cardbus_mem_size; - add_to_list(realloc_head, bridge, b_res+2, - pci_cardbus_mem_size, pci_cardbus_mem_size); + b_res->end -= pci_cardbus_mem_size; + add_to_list(realloc_head, bridge, b_res, + pci_cardbus_mem_size, pci_cardbus_mem_size); } /* Reduce that to half */ @@ -1175,15 +1182,16 @@ handle_b_res_2: } handle_b_res_3: - if (b_res[3].parent) + b_res = &bridge->resource[PCI_CB_BRIDGE_MEM_1_WINDOW]; + if (b_res->parent) goto handle_done; - b_res[3].start = pci_cardbus_mem_size; - b_res[3].end = b_res[3].start + b_res_3_size - 1; - b_res[3].flags |= IORESOURCE_MEM | IORESOURCE_STARTALIGN; + b_res->start = pci_cardbus_mem_size; + b_res->end = b_res->start + b_res_3_size - 1; + b_res->flags |= IORESOURCE_MEM | IORESOURCE_STARTALIGN; if (realloc_head) { - b_res[3].end -= b_res_3_size; - add_to_list(realloc_head, bridge, b_res+3, b_res_3_size, - pci_cardbus_mem_size); + b_res->end -= b_res_3_size; + add_to_list(realloc_head, bridge, b_res, b_res_3_size, + pci_cardbus_mem_size); } handle_done: @@ -1227,7 +1235,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) break; hdr_type = -1; /* Intentionally invalid - not a PCI device. 
*/ } else { - pref = &bus->self->resource[PCI_BRIDGE_RESOURCES + 2]; + pref = &bus->self->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; hdr_type = bus->self->hdr_type; } @@ -1885,9 +1893,9 @@ static void pci_bus_distribute_available_resources(struct pci_bus *bus, struct pci_dev *dev, *bridge = bus->self; resource_size_t io_per_hp, mmio_per_hp, mmio_pref_per_hp, align; - io_res = &bridge->resource[PCI_BRIDGE_RESOURCES + 0]; - mmio_res = &bridge->resource[PCI_BRIDGE_RESOURCES + 1]; - mmio_pref_res = &bridge->resource[PCI_BRIDGE_RESOURCES + 2]; + io_res = &bridge->resource[PCI_BRIDGE_IO_WINDOW]; + mmio_res = &bridge->resource[PCI_BRIDGE_MEM_WINDOW]; + mmio_pref_res = &bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; /* * The alignment of this bridge is yet to be considered, hence it must @@ -1960,21 +1968,21 @@ static void pci_bus_distribute_available_resources(struct pci_bus *bus, * Reduce the available resource space by what the * bridge and devices below it occupy. */ - res = &dev->resource[PCI_BRIDGE_RESOURCES + 0]; + res = &dev->resource[PCI_BRIDGE_IO_WINDOW]; align = pci_resource_alignment(dev, res); align = align ? ALIGN(io.start, align) - io.start : 0; used_size = align + resource_size(res); if (!res->parent) io.start = min(io.start + used_size, io.end + 1); - res = &dev->resource[PCI_BRIDGE_RESOURCES + 1]; + res = &dev->resource[PCI_BRIDGE_MEM_WINDOW]; align = pci_resource_alignment(dev, res); align = align ? ALIGN(mmio.start, align) - mmio.start : 0; used_size = align + resource_size(res); if (!res->parent) mmio.start = min(mmio.start + used_size, mmio.end + 1); - res = &dev->resource[PCI_BRIDGE_RESOURCES + 2]; + res = &dev->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; align = pci_resource_alignment(dev, res); align = align ? 
ALIGN(mmio_pref.start, align) - mmio_pref.start : 0; @@ -2027,9 +2035,9 @@ static void pci_bridge_distribute_available_resources(struct pci_dev *bridge, return; /* Take the initial extra resources from the hotplug port */ - available_io = bridge->resource[PCI_BRIDGE_RESOURCES + 0]; - available_mmio = bridge->resource[PCI_BRIDGE_RESOURCES + 1]; - available_mmio_pref = bridge->resource[PCI_BRIDGE_RESOURCES + 2]; + available_io = bridge->resource[PCI_BRIDGE_IO_WINDOW]; + available_mmio = bridge->resource[PCI_BRIDGE_MEM_WINDOW]; + available_mmio_pref = bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; pci_bus_distribute_available_resources(bridge->subordinate, add_list, available_io, diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c index bf6529b0b5b0..5fe58dac0d1d 100644 --- a/drivers/pcmcia/yenta_socket.c +++ b/drivers/pcmcia/yenta_socket.c @@ -694,7 +694,7 @@ static int yenta_allocate_res(struct yenta_socket *socket, int nr, unsigned type struct pci_bus_region region; unsigned mask; - res = dev->resource + PCI_BRIDGE_RESOURCES + nr; + res = &dev->resource[nr]; /* Already allocated? 
*/ if (res->parent) return 0; @@ -711,7 +711,7 @@ static int yenta_allocate_res(struct yenta_socket *socket, int nr, unsigned type region.end = config_readl(socket, addr_end) | ~mask; if (region.start && region.end > region.start && !override_bios) { pcibios_bus_to_resource(dev->bus, res, &region); - if (pci_claim_resource(dev, PCI_BRIDGE_RESOURCES + nr) == 0) + if (pci_claim_resource(dev, nr) == 0) return 0; dev_info(&dev->dev, "Preassigned resource %d busy or not available, reconfiguring...\n", @@ -751,13 +751,17 @@ static int yenta_allocate_res(struct yenta_socket *socket, int nr, unsigned type static void yenta_allocate_resources(struct yenta_socket *socket) { int program = 0; - program += yenta_allocate_res(socket, 0, IORESOURCE_IO, + program += yenta_allocate_res(socket, PCI_CB_BRIDGE_IO_0_WINDOW, + IORESOURCE_IO, PCI_CB_IO_BASE_0, PCI_CB_IO_LIMIT_0); - program += yenta_allocate_res(socket, 1, IORESOURCE_IO, + program += yenta_allocate_res(socket, PCI_CB_BRIDGE_IO_1_WINDOW, + IORESOURCE_IO, PCI_CB_IO_BASE_1, PCI_CB_IO_LIMIT_1); - program += yenta_allocate_res(socket, 2, IORESOURCE_MEM|IORESOURCE_PREFETCH, + program += yenta_allocate_res(socket, PCI_CB_BRIDGE_MEM_0_WINDOW, + IORESOURCE_MEM | IORESOURCE_PREFETCH, PCI_CB_MEMORY_BASE_0, PCI_CB_MEMORY_LIMIT_0); - program += yenta_allocate_res(socket, 3, IORESOURCE_MEM, + program += yenta_allocate_res(socket, PCI_CB_BRIDGE_MEM_1_WINDOW, + IORESOURCE_MEM, PCI_CB_MEMORY_BASE_1, PCI_CB_MEMORY_LIMIT_1); if (program) pci_setup_cardbus(socket->dev->subordinate); diff --git a/include/linux/pci.h b/include/linux/pci.h index 83ce1cdf5676..cdfb07bfdf7d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -100,9 +100,21 @@ enum { PCI_IOV_RESOURCE_END = PCI_IOV_RESOURCES + PCI_SRIOV_NUM_BARS - 1, #endif - /* Resources assigned to buses behind the bridge */ +/* PCI-to-PCI (P2P) bridge windows */ +#define PCI_BRIDGE_IO_WINDOW (PCI_BRIDGE_RESOURCES + 0) +#define PCI_BRIDGE_MEM_WINDOW (PCI_BRIDGE_RESOURCES + 1) +#define
PCI_BRIDGE_PREF_MEM_WINDOW (PCI_BRIDGE_RESOURCES + 2) + +/* CardBus bridge windows */ +#define PCI_CB_BRIDGE_IO_0_WINDOW (PCI_BRIDGE_RESOURCES + 0) +#define PCI_CB_BRIDGE_IO_1_WINDOW (PCI_BRIDGE_RESOURCES + 1) +#define PCI_CB_BRIDGE_MEM_0_WINDOW (PCI_BRIDGE_RESOURCES + 2) +#define PCI_CB_BRIDGE_MEM_1_WINDOW (PCI_BRIDGE_RESOURCES + 3) + +/* Total number of bridge resources for P2P and CardBus */ #define PCI_BRIDGE_RESOURCE_NUM 4 + /* Resources assigned to buses behind the bridge */ PCI_BRIDGE_RESOURCES, PCI_BRIDGE_RESOURCE_END = PCI_BRIDGE_RESOURCES + PCI_BRIDGE_RESOURCE_NUM - 1, From 11fdcf05032812bd23cdc42850d1f650376ec09d Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Wed, 20 May 2020 18:34:11 +0000 Subject: [PATCH 172/427] pcmcia: Use CardBus window names (PCI_CB_BRIDGE_IO_0_WINDOW etc) when freeing Remove the loop used to free CardBus resources and replace it with a yenta_free_res() helper used to release bridge resources explicitly. Link: https://lore.kernel.org/r/20200520183411.1534621-3-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas Acked-by: Dominik Brodowski --- drivers/pcmcia/yenta_socket.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c index 5fe58dac0d1d..84bfc0e85d6b 100644 --- a/drivers/pcmcia/yenta_socket.c +++ b/drivers/pcmcia/yenta_socket.c @@ -745,6 +745,18 @@ static int yenta_allocate_res(struct yenta_socket *socket, int nr, unsigned type return 0; } +static void yenta_free_res(struct yenta_socket *socket, int nr) +{ + struct pci_dev *dev = socket->dev; + struct resource *res; + + res = &dev->resource[nr]; + if (res->start != 0 && res->end != 0) + release_resource(res); + + res->start = res->end = res->flags = 0; +} + /* * Allocate the bridge mappings for the device.. 
*/ @@ -773,14 +785,10 @@ static void yenta_allocate_resources(struct yenta_socket *socket) */ static void yenta_free_resources(struct yenta_socket *socket) { - int i; - for (i = 0; i < 4; i++) { - struct resource *res; - res = socket->dev->resource + PCI_BRIDGE_RESOURCES + i; - if (res->start != 0 && res->end != 0) - release_resource(res); - res->start = res->end = res->flags = 0; - } + yenta_free_res(socket, PCI_CB_BRIDGE_IO_0_WINDOW); + yenta_free_res(socket, PCI_CB_BRIDGE_IO_1_WINDOW); + yenta_free_res(socket, PCI_CB_BRIDGE_MEM_0_WINDOW); + yenta_free_res(socket, PCI_CB_BRIDGE_MEM_1_WINDOW); } From d45e3c1a5979efd40dbbac9a5c3586f4fa41f734 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 7 May 2020 13:33:16 +0100 Subject: [PATCH 173/427] PCI: endpoint: Add support to handle multiple base for mapping outbound memory The R-Car PCIe controller can map multiple memory regions for outbound memory in the local system; the controller also limits each region to a single allocation (that is, once a chunk is used from the region it cannot be used to allocate a new one). This feature motivates adding support for handling multiple memory bases in the endpoint framework. With this patch, pci_epc_mem_init() initializes the address space for endpoint controllers that support a single window, and pci_epc_multi_mem_init() initializes the multiple windows supported by an endpoint controller.
Link: https://lore.kernel.org/r/1588854799-13710-6-git-send-email-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Lad Prabhakar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Yoshihiro Shimoda Acked-by: Kishon Vijay Abraham I --- .../pci/controller/dwc/pcie-designware-ep.c | 16 +- drivers/pci/endpoint/pci-epc-mem.c | 201 ++++++++++++------ include/linux/pci-epc.h | 33 ++- 3 files changed, 171 insertions(+), 79 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 1cdcbd102ce8..a78902cbf2f0 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -412,11 +412,11 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no, reg = ep->msi_cap + PCI_MSI_DATA_32; msg_data = dw_pcie_readw_dbi(pci, reg); } - aligned_offset = msg_addr_lower & (epc->mem->page_size - 1); + aligned_offset = msg_addr_lower & (epc->mem->window.page_size - 1); msg_addr = ((u64)msg_addr_upper) << 32 | (msg_addr_lower & ~aligned_offset); ret = dw_pcie_ep_map_addr(epc, func_no, ep->msi_mem_phys, msg_addr, - epc->mem->page_size); + epc->mem->window.page_size); if (ret) return ret; @@ -459,9 +459,9 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no, return -EPERM; } - aligned_offset = msg_addr & (epc->mem->page_size - 1); + aligned_offset = msg_addr & (epc->mem->window.page_size - 1); ret = dw_pcie_ep_map_addr(epc, func_no, ep->msi_mem_phys, msg_addr, - epc->mem->page_size); + epc->mem->window.page_size); if (ret) return ret; @@ -477,7 +477,7 @@ void dw_pcie_ep_exit(struct dw_pcie_ep *ep) struct pci_epc *epc = ep->epc; pci_epc_mem_free_addr(epc, ep->msi_mem_phys, ep->msi_mem, - epc->mem->page_size); + epc->mem->window.page_size); pci_epc_mem_exit(epc); } @@ -610,15 +610,15 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) if (ret < 0) epc->max_functions = 1; - ret = __pci_epc_mem_init(epc, ep->phys_base, ep->addr_size, - ep->page_size); + ret 
= pci_epc_mem_init(epc, ep->phys_base, ep->addr_size, + ep->page_size); if (ret < 0) { dev_err(dev, "Failed to initialize address space\n"); return ret; } ep->msi_mem = pci_epc_mem_alloc_addr(epc, &ep->msi_mem_phys, - epc->mem->page_size); + epc->mem->window.page_size); if (!ep->msi_mem) { dev_err(dev, "Failed to reserve memory for MSI/MSI-X\n"); return -ENOMEM; diff --git a/drivers/pci/endpoint/pci-epc-mem.c b/drivers/pci/endpoint/pci-epc-mem.c index cdd1d3821249..80c46f3a4590 100644 --- a/drivers/pci/endpoint/pci-epc-mem.c +++ b/drivers/pci/endpoint/pci-epc-mem.c @@ -23,7 +23,7 @@ static int pci_epc_mem_get_order(struct pci_epc_mem *mem, size_t size) { int order; - unsigned int page_shift = ilog2(mem->page_size); + unsigned int page_shift = ilog2(mem->window.page_size); size--; size >>= page_shift; @@ -36,67 +36,95 @@ static int pci_epc_mem_get_order(struct pci_epc_mem *mem, size_t size) } /** - * __pci_epc_mem_init() - initialize the pci_epc_mem structure + * pci_epc_multi_mem_init() - initialize the pci_epc_mem structure * @epc: the EPC device that invoked pci_epc_mem_init - * @phys_base: the physical address of the base - * @size: the size of the address space - * @page_size: size of each page + * @windows: pointer to windows supported by the device + * @num_windows: number of windows device supports * * Invoke to initialize the pci_epc_mem structure used by the * endpoint functions to allocate mapped PCI address. 
*/ -int __pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_base, size_t size, - size_t page_size) +int pci_epc_multi_mem_init(struct pci_epc *epc, + struct pci_epc_mem_window *windows, + unsigned int num_windows) { - int ret; - struct pci_epc_mem *mem; - unsigned long *bitmap; + struct pci_epc_mem *mem = NULL; + unsigned long *bitmap = NULL; unsigned int page_shift; - int pages; + size_t page_size; int bitmap_size; + int pages; + int ret; + int i; - if (page_size < PAGE_SIZE) - page_size = PAGE_SIZE; + epc->num_windows = 0; - page_shift = ilog2(page_size); - pages = size >> page_shift; - bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + if (!windows || !num_windows) + return -EINVAL; - mem = kzalloc(sizeof(*mem), GFP_KERNEL); - if (!mem) { - ret = -ENOMEM; - goto err; + epc->windows = kcalloc(num_windows, sizeof(*epc->windows), GFP_KERNEL); + if (!epc->windows) + return -ENOMEM; + + for (i = 0; i < num_windows; i++) { + page_size = windows[i].page_size; + if (page_size < PAGE_SIZE) + page_size = PAGE_SIZE; + page_shift = ilog2(page_size); + pages = windows[i].size >> page_shift; + bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) { + ret = -ENOMEM; + i--; + goto err_mem; + } + + bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!bitmap) { + ret = -ENOMEM; + kfree(mem); + i--; + goto err_mem; + } + + mem->window.phys_base = windows[i].phys_base; + mem->window.size = windows[i].size; + mem->window.page_size = page_size; + mem->bitmap = bitmap; + mem->pages = pages; + mutex_init(&mem->lock); + epc->windows[i] = mem; } - bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!bitmap) { - ret = -ENOMEM; - goto err_mem; - } - - mem->bitmap = bitmap; - mem->phys_base = phys_base; - mem->page_size = page_size; - mem->pages = pages; - mem->size = size; - mutex_init(&mem->lock); - - epc->mem = mem; + epc->mem = epc->windows[0]; + epc->num_windows = num_windows; return 0; err_mem: - kfree(mem); + for (; i >= 0; i--) 
{ + mem = epc->windows[i]; + kfree(mem->bitmap); + kfree(mem); + } + kfree(epc->windows); -err: -return ret; + return ret; } -EXPORT_SYMBOL_GPL(__pci_epc_mem_init); +EXPORT_SYMBOL_GPL(pci_epc_multi_mem_init); int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t base, size_t size, size_t page_size) { - return __pci_epc_mem_init(epc, base, size, page_size); + struct pci_epc_mem_window mem_window; + + mem_window.phys_base = base; + mem_window.size = size; + mem_window.page_size = page_size; + + return pci_epc_multi_mem_init(epc, &mem_window, 1); } EXPORT_SYMBOL_GPL(pci_epc_mem_init); @@ -109,11 +137,22 @@ EXPORT_SYMBOL_GPL(pci_epc_mem_init); */ void pci_epc_mem_exit(struct pci_epc *epc) { - struct pci_epc_mem *mem = epc->mem; + struct pci_epc_mem *mem; + int i; + if (!epc->num_windows) + return; + + for (i = 0; i < epc->num_windows; i++) { + mem = epc->windows[i]; + kfree(mem->bitmap); + kfree(mem); + } + kfree(epc->windows); + + epc->windows = NULL; epc->mem = NULL; - kfree(mem->bitmap); - kfree(mem); + epc->num_windows = 0; } EXPORT_SYMBOL_GPL(pci_epc_mem_exit); @@ -129,31 +168,60 @@ EXPORT_SYMBOL_GPL(pci_epc_mem_exit); void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, phys_addr_t *phys_addr, size_t size) { - int pageno; void __iomem *virt_addr = NULL; - struct pci_epc_mem *mem = epc->mem; - unsigned int page_shift = ilog2(mem->page_size); + struct pci_epc_mem *mem; + unsigned int page_shift; + size_t align_size; + int pageno; int order; + int i; - size = ALIGN(size, mem->page_size); - order = pci_epc_mem_get_order(mem, size); + for (i = 0; i < epc->num_windows; i++) { + mem = epc->windows[i]; + mutex_lock(&mem->lock); + align_size = ALIGN(size, mem->window.page_size); + order = pci_epc_mem_get_order(mem, align_size); - mutex_lock(&mem->lock); - pageno = bitmap_find_free_region(mem->bitmap, mem->pages, order); - if (pageno < 0) - goto ret; + pageno = bitmap_find_free_region(mem->bitmap, mem->pages, + order); + if (pageno >= 0) { + page_shift = 
ilog2(mem->window.page_size); + *phys_addr = mem->window.phys_base + + ((phys_addr_t)pageno << page_shift); + virt_addr = ioremap(*phys_addr, align_size); + if (!virt_addr) { + bitmap_release_region(mem->bitmap, + pageno, order); + mutex_unlock(&mem->lock); + continue; + } + mutex_unlock(&mem->lock); + return virt_addr; + } + mutex_unlock(&mem->lock); + } - *phys_addr = mem->phys_base + ((phys_addr_t)pageno << page_shift); - virt_addr = ioremap(*phys_addr, size); - if (!virt_addr) - bitmap_release_region(mem->bitmap, pageno, order); - -ret: - mutex_unlock(&mem->lock); return virt_addr; } EXPORT_SYMBOL_GPL(pci_epc_mem_alloc_addr); +static struct pci_epc_mem *pci_epc_get_matching_window(struct pci_epc *epc, + phys_addr_t phys_addr) +{ + struct pci_epc_mem *mem; + int i; + + for (i = 0; i < epc->num_windows; i++) { + mem = epc->windows[i]; + + if (phys_addr >= mem->window.phys_base && + phys_addr < (mem->window.phys_base + mem->window.size)) + return mem; + } + + return NULL; +} + /** * pci_epc_mem_free_addr() - free the allocated memory address * @epc: the EPC device on which memory was allocated @@ -166,14 +234,23 @@ EXPORT_SYMBOL_GPL(pci_epc_mem_alloc_addr); void pci_epc_mem_free_addr(struct pci_epc *epc, phys_addr_t phys_addr, void __iomem *virt_addr, size_t size) { + struct pci_epc_mem *mem; + unsigned int page_shift; + size_t page_size; int pageno; - struct pci_epc_mem *mem = epc->mem; - unsigned int page_shift = ilog2(mem->page_size); int order; + mem = pci_epc_get_matching_window(epc, phys_addr); + if (!mem) { + pr_err("failed to get matching window\n"); + return; + } + + page_size = mem->window.page_size; + page_shift = ilog2(page_size); iounmap(virt_addr); - pageno = (phys_addr - mem->phys_base) >> page_shift; - size = ALIGN(size, mem->page_size); + pageno = (phys_addr - mem->window.phys_base) >> page_shift; + size = ALIGN(size, page_size); order = pci_epc_mem_get_order(mem, size); mutex_lock(&mem->lock); bitmap_release_region(mem->bitmap, pageno, order); 
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 5bc1de65849e..cc66bec8be90 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -65,20 +65,28 @@ struct pci_epc_ops { struct module *owner; }; +/** + * struct pci_epc_mem_window - address window of the endpoint controller + * @phys_base: physical base address of the PCI address window + * @size: the size of the PCI address window + * @page_size: size of each page + */ +struct pci_epc_mem_window { + phys_addr_t phys_base; + size_t size; + size_t page_size; +}; + /** * struct pci_epc_mem - address space of the endpoint controller - * @phys_base: physical base address of the PCI address space - * @size: the size of the PCI address space + * @window: address window of the endpoint controller * @bitmap: bitmap to manage the PCI address space * @pages: number of bits representing the address region - * @page_size: size of each page * @lock: mutex to protect bitmap */ struct pci_epc_mem { - phys_addr_t phys_base; - size_t size; + struct pci_epc_mem_window window; unsigned long *bitmap; - size_t page_size; int pages; /* mutex to protect against concurrent access for memory allocation*/ struct mutex lock; @@ -89,7 +97,11 @@ struct pci_epc_mem { * @dev: PCI EPC device * @pci_epf: list of endpoint functions present in this EPC device * @ops: function pointers for performing endpoint operations - * @mem: address space of the endpoint controller + * @windows: array of address space of the endpoint controller + * @mem: first window of the endpoint controller, which corresponds to + * default address space of the endpoint controller supporting + * single window. 
+ * @num_windows: number of windows supported by device * @max_functions: max number of functions that can be configured in this EPC * @group: configfs group representing the PCI EPC device * @lock: mutex to protect pci_epc ops @@ -100,7 +112,9 @@ struct pci_epc { struct device dev; struct list_head pci_epf; const struct pci_epc_ops *ops; + struct pci_epc_mem **windows; struct pci_epc_mem *mem; + unsigned int num_windows; u8 max_functions; struct config_group *group; /* mutex to protect against concurrent access of EP controller */ @@ -194,8 +208,9 @@ void pci_epc_put(struct pci_epc *epc); int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t base, size_t size, size_t page_size); -int __pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_addr, size_t size, - size_t page_size); +int pci_epc_multi_mem_init(struct pci_epc *epc, + struct pci_epc_mem_window *window, + unsigned int num_windows); void pci_epc_mem_exit(struct pci_epc *epc); void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, phys_addr_t *phys_addr, size_t size); From 4c0f80920923f1033e9fe048f44b6e1ffe18c58d Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 7 May 2020 13:33:17 +0100 Subject: [PATCH 174/427] dt-bindings: PCI: rcar: Add bindings for R-Car PCIe endpoint controller This patch adds the bindings for the R-Car PCIe endpoint driver. 
Link: https://lore.kernel.org/r/1588854799-13710-7-git-send-email-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Lad Prabhakar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Yoshihiro Shimoda --- .../devicetree/bindings/pci/rcar-pci-ep.yaml | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 Documentation/devicetree/bindings/pci/rcar-pci-ep.yaml diff --git a/Documentation/devicetree/bindings/pci/rcar-pci-ep.yaml b/Documentation/devicetree/bindings/pci/rcar-pci-ep.yaml new file mode 100644 index 000000000000..aa483c7f27fd --- /dev/null +++ b/Documentation/devicetree/bindings/pci/rcar-pci-ep.yaml @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2020 Renesas Electronics Europe GmbH - https://www.renesas.com/eu/en/ +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/rcar-pci-ep.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Renesas R-Car PCIe Endpoint + +maintainers: + - Lad Prabhakar + - Yoshihiro Shimoda + +properties: + compatible: + items: + - const: renesas,r8a774c0-pcie-ep + - const: renesas,rcar-gen3-pcie-ep + + reg: + maxItems: 5 + + reg-names: + items: + - const: apb-base + - const: memory0 + - const: memory1 + - const: memory2 + - const: memory3 + + power-domains: + maxItems: 1 + + resets: + maxItems: 1 + + clocks: + maxItems: 1 + + clock-names: + items: + - const: pcie + + max-functions: + minimum: 1 + maximum: 1 + +required: + - compatible + - reg + - reg-names + - resets + - power-domains + - clocks + - clock-names + - max-functions + +examples: + - | + #include + #include + + pcie0_ep: pcie-ep@fe000000 { + compatible = "renesas,r8a774c0-pcie-ep", + "renesas,rcar-gen3-pcie-ep"; + reg = <0xfe000000 0x80000>, + <0xfe100000 0x100000>, + <0xfe200000 0x200000>, + <0x30000000 0x8000000>, + <0x38000000 0x8000000>; + reg-names = "apb-base", "memory0", "memory1", "memory2", "memory3"; + resets = <&cpg 319>; + power-domains = <&sysc 
R8A774C0_PD_ALWAYS_ON>; + clocks = <&cpg CPG_MOD 319>; + clock-names = "pcie"; + max-functions = /bits/ 8 <1>; + }; From 2a6d0d63d99956a66f6605832f11755d74a41951 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 7 May 2020 13:33:18 +0100 Subject: [PATCH 175/427] PCI: rcar: Add endpoint mode support Add support for R-Car PCIe controller to work in endpoint mode. Link: https://lore.kernel.org/r/1588854799-13710-8-git-send-email-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Lad Prabhakar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Yoshihiro Shimoda --- drivers/pci/controller/Kconfig | 8 + drivers/pci/controller/Makefile | 1 + drivers/pci/controller/pcie-rcar-ep.c | 563 ++++++++++++++++++++++++++ drivers/pci/controller/pcie-rcar.h | 9 + 4 files changed, 581 insertions(+) create mode 100644 drivers/pci/controller/pcie-rcar-ep.c diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 32dcab3c103f..0b2c198b865f 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -71,6 +71,14 @@ config PCIE_RCAR_HOST Say Y here if you want PCIe controller support on R-Car SoCs in host mode. +config PCIE_RCAR_EP + bool "Renesas R-Car PCIe endpoint controller" + depends on ARCH_RENESAS || COMPILE_TEST + depends on PCI_ENDPOINT + help + Say Y here if you want PCIe controller support on R-Car SoCs in + endpoint mode. 
+ config PCI_HOST_COMMON bool select PCI_ECAM diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index 39802ee32946..741a5204aa5e 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o obj-$(CONFIG_PCI_RCAR_GEN2) += pci-rcar-gen2.o obj-$(CONFIG_PCIE_RCAR_HOST) += pcie-rcar.o pcie-rcar-host.o +obj-$(CONFIG_PCIE_RCAR_EP) += pcie-rcar.o pcie-rcar-ep.o obj-$(CONFIG_PCI_HOST_COMMON) += pci-host-common.o obj-$(CONFIG_PCI_HOST_GENERIC) += pci-host-generic.o obj-$(CONFIG_PCIE_XILINX) += pcie-xilinx.o diff --git a/drivers/pci/controller/pcie-rcar-ep.c b/drivers/pci/controller/pcie-rcar-ep.c new file mode 100644 index 000000000000..b4a288e24aaf --- /dev/null +++ b/drivers/pci/controller/pcie-rcar-ep.c @@ -0,0 +1,563 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCIe endpoint driver for Renesas R-Car SoCs + * Copyright (c) 2020 Renesas Electronics Europe GmbH + * + * Author: Lad Prabhakar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pcie-rcar.h" + +#define RCAR_EPC_MAX_FUNCTIONS 1 + +/* Structure representing the PCIe interface */ +struct rcar_pcie_endpoint { + struct rcar_pcie pcie; + phys_addr_t *ob_mapped_addr; + struct pci_epc_mem_window *ob_window; + u8 max_functions; + unsigned int bar_to_atu[MAX_NR_INBOUND_MAPS]; + unsigned long *ib_window_map; + u32 num_ib_windows; + u32 num_ob_windows; +}; + +static void rcar_pcie_ep_hw_init(struct rcar_pcie *pcie) +{ + u32 val; + + rcar_pci_write_reg(pcie, 0, PCIETCTLR); + + /* Set endpoint mode */ + rcar_pci_write_reg(pcie, 0, PCIEMSR); + + /* Initialize default capabilities. 
*/ + rcar_rmw32(pcie, REXPCAP(0), 0xff, PCI_CAP_ID_EXP); + rcar_rmw32(pcie, REXPCAP(PCI_EXP_FLAGS), + PCI_EXP_FLAGS_TYPE, PCI_EXP_TYPE_ENDPOINT << 4); + rcar_rmw32(pcie, RCONF(PCI_HEADER_TYPE), 0x7f, + PCI_HEADER_TYPE_NORMAL); + + /* Write out the physical slot number = 0 */ + rcar_rmw32(pcie, REXPCAP(PCI_EXP_SLTCAP), PCI_EXP_SLTCAP_PSN, 0); + + val = rcar_pci_read_reg(pcie, EXPCAP(1)); + /* device supports fixed 128 bytes MPSS */ + val &= ~GENMASK(2, 0); + rcar_pci_write_reg(pcie, val, EXPCAP(1)); + + val = rcar_pci_read_reg(pcie, EXPCAP(2)); + /* read requests size 128 bytes */ + val &= ~GENMASK(14, 12); + /* payload size 128 bytes */ + val &= ~GENMASK(7, 5); + rcar_pci_write_reg(pcie, val, EXPCAP(2)); + + /* Set target link speed to 5.0 GT/s */ + rcar_rmw32(pcie, EXPCAP(12), PCI_EXP_LNKSTA_CLS, + PCI_EXP_LNKSTA_CLS_5_0GB); + + /* Set the completion timer timeout to the maximum 50ms. */ + rcar_rmw32(pcie, TLCTLR + 1, 0x3f, 50); + + /* Terminate list of capabilities (Next Capability Offset=0) */ + rcar_rmw32(pcie, RVCCAP(0), 0xfff00000, 0); + + /* flush modifications */ + wmb(); +} + +static int rcar_pcie_ep_get_window(struct rcar_pcie_endpoint *ep, + phys_addr_t addr) +{ + int i; + + for (i = 0; i < ep->num_ob_windows; i++) + if (ep->ob_window[i].phys_base == addr) + return i; + + return -EINVAL; +} + +static int rcar_pcie_parse_outbound_ranges(struct rcar_pcie_endpoint *ep, + struct platform_device *pdev) +{ + struct rcar_pcie *pcie = &ep->pcie; + char outbound_name[10]; + struct resource *res; + unsigned int i = 0; + + ep->num_ob_windows = 0; + for (i = 0; i < RCAR_PCI_MAX_RESOURCES; i++) { + sprintf(outbound_name, "memory%u", i); + res = platform_get_resource_byname(pdev, + IORESOURCE_MEM, + outbound_name); + if (!res) { + dev_err(pcie->dev, "missing outbound window %u\n", i); + return -EINVAL; + } + if (!devm_request_mem_region(&pdev->dev, res->start, + resource_size(res), + outbound_name)) { + dev_err(pcie->dev, "Cannot request memory region %s.\n", + 
outbound_name); + return -EIO; + } + + ep->ob_window[i].phys_base = res->start; + ep->ob_window[i].size = resource_size(res); + /* controller doesn't support multiple allocation + * from same window, so set page_size to window size + */ + ep->ob_window[i].page_size = resource_size(res); + } + ep->num_ob_windows = i; + + return 0; +} + +static int rcar_pcie_ep_get_pdata(struct rcar_pcie_endpoint *ep, + struct platform_device *pdev) +{ + struct rcar_pcie *pcie = &ep->pcie; + struct pci_epc_mem_window *window; + struct device *dev = pcie->dev; + struct resource res; + int err; + + err = of_address_to_resource(dev->of_node, 0, &res); + if (err) + return err; + pcie->base = devm_ioremap_resource(dev, &res); + if (IS_ERR(pcie->base)) + return PTR_ERR(pcie->base); + + ep->ob_window = devm_kcalloc(dev, RCAR_PCI_MAX_RESOURCES, + sizeof(*window), GFP_KERNEL); + if (!ep->ob_window) + return -ENOMEM; + + rcar_pcie_parse_outbound_ranges(ep, pdev); + + err = of_property_read_u8(dev->of_node, "max-functions", + &ep->max_functions); + if (err < 0 || ep->max_functions > RCAR_EPC_MAX_FUNCTIONS) + ep->max_functions = RCAR_EPC_MAX_FUNCTIONS; + + return 0; +} + +static int rcar_pcie_ep_write_header(struct pci_epc *epc, u8 fn, + struct pci_epf_header *hdr) +{ + struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); + struct rcar_pcie *pcie = &ep->pcie; + u32 val; + + if (!fn) + val = hdr->vendorid; + else + val = rcar_pci_read_reg(pcie, IDSETR0); + val |= hdr->deviceid << 16; + rcar_pci_write_reg(pcie, val, IDSETR0); + + val = hdr->revid; + val |= hdr->progif_code << 8; + val |= hdr->subclass_code << 16; + val |= hdr->baseclass_code << 24; + rcar_pci_write_reg(pcie, val, IDSETR1); + + if (!fn) + val = hdr->subsys_vendor_id; + else + val = rcar_pci_read_reg(pcie, SUBIDSETR); + val |= hdr->subsys_id << 16; + rcar_pci_write_reg(pcie, val, SUBIDSETR); + + if (hdr->interrupt_pin > PCI_INTERRUPT_INTA) + return -EINVAL; + val = rcar_pci_read_reg(pcie, PCICONF(15)); + val |= (hdr->interrupt_pin 
<< 8); + rcar_pci_write_reg(pcie, val, PCICONF(15)); + + return 0; +} + +static int rcar_pcie_ep_set_bar(struct pci_epc *epc, u8 func_no, + struct pci_epf_bar *epf_bar) +{ + int flags = epf_bar->flags | LAR_ENABLE | LAM_64BIT; + struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); + u64 size = 1ULL << fls64(epf_bar->size - 1); + dma_addr_t cpu_addr = epf_bar->phys_addr; + enum pci_barno bar = epf_bar->barno; + struct rcar_pcie *pcie = &ep->pcie; + u32 mask; + int idx; + int err; + + idx = find_first_zero_bit(ep->ib_window_map, ep->num_ib_windows); + if (idx >= ep->num_ib_windows) { + dev_err(pcie->dev, "no free inbound window\n"); + return -EINVAL; + } + + if ((flags & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) + flags |= IO_SPACE; + + ep->bar_to_atu[bar] = idx; + /* use 64-bit BARs */ + set_bit(idx, ep->ib_window_map); + set_bit(idx + 1, ep->ib_window_map); + + if (cpu_addr > 0) { + unsigned long nr_zeros = __ffs64(cpu_addr); + u64 alignment = 1ULL << nr_zeros; + + size = min(size, alignment); + } + + size = min(size, 1ULL << 32); + + mask = roundup_pow_of_two(size) - 1; + mask &= ~0xf; + + rcar_pcie_set_inbound(pcie, cpu_addr, + 0x0, mask | flags, idx, false); + + err = rcar_pcie_wait_for_phyrdy(pcie); + if (err) { + dev_err(pcie->dev, "phy not ready\n"); + return -EINVAL; + } + + return 0; +} + +static void rcar_pcie_ep_clear_bar(struct pci_epc *epc, u8 fn, + struct pci_epf_bar *epf_bar) +{ + struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); + enum pci_barno bar = epf_bar->barno; + u32 atu_index = ep->bar_to_atu[bar]; + + rcar_pcie_set_inbound(&ep->pcie, 0x0, 0x0, 0x0, bar, false); + + clear_bit(atu_index, ep->ib_window_map); + clear_bit(atu_index + 1, ep->ib_window_map); +} + +static int rcar_pcie_ep_set_msi(struct pci_epc *epc, u8 fn, u8 interrupts) +{ + struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); + struct rcar_pcie *pcie = &ep->pcie; + u32 flags; + + flags = rcar_pci_read_reg(pcie, MSICAP(fn)); + flags |= interrupts << 
MSICAP0_MMESCAP_OFFSET; + rcar_pci_write_reg(pcie, flags, MSICAP(fn)); + + return 0; +} + +static int rcar_pcie_ep_get_msi(struct pci_epc *epc, u8 fn) +{ + struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); + struct rcar_pcie *pcie = &ep->pcie; + u32 flags; + + flags = rcar_pci_read_reg(pcie, MSICAP(fn)); + if (!(flags & MSICAP0_MSIE)) + return -EINVAL; + + return ((flags & MSICAP0_MMESE_MASK) >> MSICAP0_MMESE_OFFSET); +} + +static int rcar_pcie_ep_map_addr(struct pci_epc *epc, u8 fn, + phys_addr_t addr, u64 pci_addr, size_t size) +{ + struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); + struct rcar_pcie *pcie = &ep->pcie; + struct resource_entry win; + struct resource res; + int window; + int err; + + /* check if we have a link. */ + err = rcar_pcie_wait_for_dl(pcie); + if (err) { + dev_err(pcie->dev, "link not up\n"); + return err; + } + + window = rcar_pcie_ep_get_window(ep, addr); + if (window < 0) { + dev_err(pcie->dev, "failed to get corresponding window\n"); + return -EINVAL; + } + + memset(&win, 0x0, sizeof(win)); + memset(&res, 0x0, sizeof(res)); + res.start = pci_addr; + res.end = pci_addr + size - 1; + res.flags = IORESOURCE_MEM; + win.res = &res; + + rcar_pcie_set_outbound(pcie, window, &win); + + ep->ob_mapped_addr[window] = addr; + + return 0; +} + +static void rcar_pcie_ep_unmap_addr(struct pci_epc *epc, u8 fn, + phys_addr_t addr) +{ + struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); + struct resource_entry win; + struct resource res; + int idx; + + for (idx = 0; idx < ep->num_ob_windows; idx++) + if (ep->ob_mapped_addr[idx] == addr) + break; + + if (idx >= ep->num_ob_windows) + return; + + memset(&win, 0x0, sizeof(win)); + memset(&res, 0x0, sizeof(res)); + win.res = &res; + rcar_pcie_set_outbound(&ep->pcie, idx, &win); + + ep->ob_mapped_addr[idx] = 0; +} + +static int rcar_pcie_ep_assert_intx(struct rcar_pcie_endpoint *ep, + u8 fn, u8 intx) +{ + struct rcar_pcie *pcie = &ep->pcie; + u32 val; + + val = rcar_pci_read_reg(pcie, PCIEMSITXR); 
+ if ((val & PCI_MSI_FLAGS_ENABLE)) { + dev_err(pcie->dev, "MSI is enabled, cannot assert INTx\n"); + return -EINVAL; + } + + val = rcar_pci_read_reg(pcie, PCICONF(1)); + if ((val & INTDIS)) { + dev_err(pcie->dev, "INTx message transmission is disabled\n"); + return -EINVAL; + } + + val = rcar_pci_read_reg(pcie, PCIEINTXR); + if ((val & ASTINTX)) { + dev_err(pcie->dev, "INTx is already asserted\n"); + return -EINVAL; + } + + val |= ASTINTX; + rcar_pci_write_reg(pcie, val, PCIEINTXR); + usleep_range(1000, 1001); + val = rcar_pci_read_reg(pcie, PCIEINTXR); + val &= ~ASTINTX; + rcar_pci_write_reg(pcie, val, PCIEINTXR); + + return 0; +} + +static int rcar_pcie_ep_assert_msi(struct rcar_pcie *pcie, + u8 fn, u8 interrupt_num) +{ + u16 msi_count; + u32 val; + + /* Check MSI enable bit */ + val = rcar_pci_read_reg(pcie, MSICAP(fn)); + if (!(val & MSICAP0_MSIE)) + return -EINVAL; + + /* Get MSI numbers from MME */ + msi_count = ((val & MSICAP0_MMESE_MASK) >> MSICAP0_MMESE_OFFSET); + msi_count = 1 << msi_count; + + if (!interrupt_num || interrupt_num > msi_count) + return -EINVAL; + + val = rcar_pci_read_reg(pcie, PCIEMSITXR); + rcar_pci_write_reg(pcie, val | (interrupt_num - 1), PCIEMSITXR); + + return 0; +} + +static int rcar_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn, + enum pci_epc_irq_type type, + u16 interrupt_num) +{ + struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); + + switch (type) { + case PCI_EPC_IRQ_LEGACY: + return rcar_pcie_ep_assert_intx(ep, fn, 0); + + case PCI_EPC_IRQ_MSI: + return rcar_pcie_ep_assert_msi(&ep->pcie, fn, interrupt_num); + + default: + return -EINVAL; + } +} + +static int rcar_pcie_ep_start(struct pci_epc *epc) +{ + struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); + + rcar_pci_write_reg(&ep->pcie, MACCTLR_INIT_VAL, MACCTLR); + rcar_pci_write_reg(&ep->pcie, CFINIT, PCIETCTLR); + + return 0; +} + +static void rcar_pcie_ep_stop(struct pci_epc *epc) +{ + struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); + + 
rcar_pci_write_reg(&ep->pcie, 0, PCIETCTLR); +} + +static const struct pci_epc_features rcar_pcie_epc_features = { + .linkup_notifier = false, + .msi_capable = true, + .msix_capable = false, + /* use 64-bit BARs so mark BAR[1,3,5] as reserved */ + .reserved_bar = 1 << BAR_1 | 1 << BAR_3 | 1 << BAR_5, + .bar_fixed_64bit = 1 << BAR_0 | 1 << BAR_2 | 1 << BAR_4, + .bar_fixed_size[0] = 128, + .bar_fixed_size[2] = 256, + .bar_fixed_size[4] = 256, +}; + +static const struct pci_epc_features* +rcar_pcie_ep_get_features(struct pci_epc *epc, u8 func_no) +{ + return &rcar_pcie_epc_features; +} + +static const struct pci_epc_ops rcar_pcie_epc_ops = { + .write_header = rcar_pcie_ep_write_header, + .set_bar = rcar_pcie_ep_set_bar, + .clear_bar = rcar_pcie_ep_clear_bar, + .set_msi = rcar_pcie_ep_set_msi, + .get_msi = rcar_pcie_ep_get_msi, + .map_addr = rcar_pcie_ep_map_addr, + .unmap_addr = rcar_pcie_ep_unmap_addr, + .raise_irq = rcar_pcie_ep_raise_irq, + .start = rcar_pcie_ep_start, + .stop = rcar_pcie_ep_stop, + .get_features = rcar_pcie_ep_get_features, +}; + +static const struct of_device_id rcar_pcie_ep_of_match[] = { + { .compatible = "renesas,r8a774c0-pcie-ep", }, + { .compatible = "renesas,rcar-gen3-pcie-ep" }, + { }, +}; + +static int rcar_pcie_ep_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rcar_pcie_endpoint *ep; + struct rcar_pcie *pcie; + struct pci_epc *epc; + int err; + + ep = devm_kzalloc(dev, sizeof(*ep), GFP_KERNEL); + if (!ep) + return -ENOMEM; + + pcie = &ep->pcie; + pcie->dev = dev; + + pm_runtime_enable(dev); + err = pm_runtime_get_sync(dev); + if (err < 0) { + dev_err(dev, "pm_runtime_get_sync failed\n"); + goto err_pm_disable; + } + + err = rcar_pcie_ep_get_pdata(ep, pdev); + if (err < 0) { + dev_err(dev, "failed to request resources: %d\n", err); + goto err_pm_put; + } + + ep->num_ib_windows = MAX_NR_INBOUND_MAPS; + ep->ib_window_map = + devm_kcalloc(dev, BITS_TO_LONGS(ep->num_ib_windows), + sizeof(long), GFP_KERNEL); 
+ if (!ep->ib_window_map) { + err = -ENOMEM; + dev_err(dev, "failed to allocate memory for inbound map\n"); + goto err_pm_put; + } + + ep->ob_mapped_addr = devm_kcalloc(dev, ep->num_ob_windows, + sizeof(*ep->ob_mapped_addr), + GFP_KERNEL); + if (!ep->ob_mapped_addr) { + err = -ENOMEM; + dev_err(dev, "failed to allocate memory for outbound memory pointers\n"); + goto err_pm_put; + } + + epc = devm_pci_epc_create(dev, &rcar_pcie_epc_ops); + if (IS_ERR(epc)) { + dev_err(dev, "failed to create epc device\n"); + err = PTR_ERR(epc); + goto err_pm_put; + } + + epc->max_functions = ep->max_functions; + epc_set_drvdata(epc, ep); + + rcar_pcie_ep_hw_init(pcie); + + err = pci_epc_multi_mem_init(epc, ep->ob_window, ep->num_ob_windows); + if (err < 0) { + dev_err(dev, "failed to initialize the epc memory space\n"); + goto err_pm_put; + } + + return 0; + +err_pm_put: + pm_runtime_put(dev); + +err_pm_disable: + pm_runtime_disable(dev); + + return err; +} + +static struct platform_driver rcar_pcie_ep_driver = { + .driver = { + .name = "rcar-pcie-ep", + .of_match_table = rcar_pcie_ep_of_match, + .suppress_bind_attrs = true, + }, + .probe = rcar_pcie_ep_probe, +}; +builtin_platform_driver(rcar_pcie_ep_driver); diff --git a/drivers/pci/controller/pcie-rcar.h b/drivers/pci/controller/pcie-rcar.h index 97640e16af58..d4c698b5f821 100644 --- a/drivers/pci/controller/pcie-rcar.h +++ b/drivers/pci/controller/pcie-rcar.h @@ -17,6 +17,7 @@ #define PCIECDR 0x000020 #define PCIEMSR 0x000028 #define PCIEINTXR 0x000400 +#define ASTINTX BIT(16) #define PCIEPHYSR 0x0007f0 #define PHYRDY BIT(0) #define PCIEMSITXR 0x000840 @@ -55,12 +56,20 @@ /* Configuration */ #define PCICONF(x) (0x010000 + ((x) * 0x4)) +#define INTDIS BIT(10) #define PMCAP(x) (0x010040 + ((x) * 0x4)) +#define MSICAP(x) (0x010050 + ((x) * 0x4)) +#define MSICAP0_MSIE BIT(16) +#define MSICAP0_MMESCAP_OFFSET 17 +#define MSICAP0_MMESE_OFFSET 20 +#define MSICAP0_MMESE_MASK GENMASK(22, 20) #define EXPCAP(x) (0x010070 + ((x) * 0x4)) 
#define VCCAP(x) (0x010100 + ((x) * 0x4)) /* link layer */ +#define IDSETR0 0x011000 #define IDSETR1 0x011004 +#define SUBIDSETR 0x011024 #define TLCTLR 0x011048 #define MACSR 0x011054 #define SPCHGFIN BIT(4) From 56ad4a1b368a56d7b8f4613fabd90f63099b9bb2 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 7 May 2020 13:33:19 +0100 Subject: [PATCH 176/427] MAINTAINERS: Add file patterns for rcar PCI device tree bindings Add file pattern entry for rcar PCI devicetree binding, so that when people run ./scripts/get_maintainer.pl the rcar PCI maintainers could also be listed. Link: https://lore.kernel.org/r/1588854799-13710-9-git-send-email-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Lad Prabhakar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Yoshihiro Shimoda --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e64e5db31497..8cec0ecd36e2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12949,6 +12949,7 @@ M: Yoshihiro Shimoda L: linux-pci@vger.kernel.org L: linux-renesas-soc@vger.kernel.org S: Maintained +F: Documentation/devicetree/bindings/pci/*rcar* F: drivers/pci/controller/*rcar* PCI DRIVER FOR SAMSUNG EXYNOS From c88d19181771bd189147681ef38fc1533ebeff4c Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 11 May 2020 12:21:14 -0400 Subject: [PATCH 177/427] PCI: pci-bridge-emul: Fix PCIe bit conflicts This patch fixes two bit conflicts in the pci-bridge-emul driver: 1. Bit 3 of Device Status (19 of Device Control) is marked as both Write-1-to-Clear and Read-Only. It should be Write-1-to-Clear. The Read-Only and Reserved bitmasks are shifted by 1 bit due to this error. 2. Bit 12 of Slot Control is marked as both Read-Write and Reserved. It should be Read-Write. 
Link: https://lore.kernel.org/r/20200511162117.6674-2-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring --- drivers/pci/pci-bridge-emul.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index 4f4f54bc732e..faa414655f33 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -185,8 +185,8 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { * RO, the rest is reserved */ .w1c = GENMASK(19, 16), - .ro = GENMASK(20, 19), - .rsvd = GENMASK(31, 21), + .ro = GENMASK(21, 20), + .rsvd = GENMASK(31, 22), }, [PCI_EXP_LNKCAP / 4] = { @@ -226,7 +226,7 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { PCI_EXP_SLTSTA_CC | PCI_EXP_SLTSTA_DLLSC) << 16, .ro = (PCI_EXP_SLTSTA_MRLSS | PCI_EXP_SLTSTA_PDS | PCI_EXP_SLTSTA_EIS) << 16, - .rsvd = GENMASK(15, 12) | (GENMASK(15, 9) << 16), + .rsvd = GENMASK(15, 13) | (GENMASK(15, 9) << 16), }, [PCI_EXP_RTCTL / 4] = { From f61959b6e240640d46b65b4dd93b3144d3895ef6 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 11 May 2020 12:21:15 -0400 Subject: [PATCH 178/427] PCI: pci-bridge-emul: Fix Root Cap/Status comment The upper 16-bits of Root Control contain the Root Capabilities register. The code instead describes the Root Status register in the upper 16-bits, although it uses the correct bit definition for Root Capabilities, and for Root Status in the next definition. Fix this comment and add a comment describing the Root Status register. 
Link: https://lore.kernel.org/r/20200511162117.6674-3-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring --- drivers/pci/pci-bridge-emul.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index faa414655f33..c00c30ffb198 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -234,7 +234,7 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { * Root control has bits [4:0] RW, the rest is * reserved. * - * Root status has bit 0 RO, the rest is reserved. + * Root capabilities has bit 0 RO, the rest is reserved. */ .rw = (PCI_EXP_RTCTL_SECEE | PCI_EXP_RTCTL_SENFEE | PCI_EXP_RTCTL_SEFEE | PCI_EXP_RTCTL_PMEIE | @@ -244,6 +244,10 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { }, [PCI_EXP_RTSTA / 4] = { + /* + * Root status has bits 17 and [15:0] RO, bit 16 W1C, the rest + * is reserved. + */ .ro = GENMASK(15, 0) | PCI_EXP_RTSTA_PENDING, .w1c = PCI_EXP_RTSTA_PME, .rsvd = GENMASK(31, 18), From 2960865127d77bce085d349c94d49faf51517df3 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 11 May 2020 12:21:16 -0400 Subject: [PATCH 179/427] PCI: pci-bridge-emul: Update for PCIe 5.0 r1.0 Add missing bits from PCIe 4.0 and updates for PCIe 5.0 r1.0. 
PCIe 4.0: Device Status bit 6 - W1C - Emergency Power Reduction Detected Link Control bits 15:14 - RW - DRS Signaling Control Slot Control bit 13 - RW - Auto Slot Power Limit Disable PCIe 5.0: Slot Control bit 14 - RW - In-Band PD Disable Link: https://lore.kernel.org/r/20200511162117.6674-4-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring --- drivers/pci/pci-bridge-emul.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index c00c30ffb198..6b1949995dee 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -181,12 +181,12 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { .rw = GENMASK(15, 0), /* - * Device status register has 4 bits W1C, then 2 bits - * RO, the rest is reserved + * Device status register has bits 6 and [3:0] W1C, [5:4] RO, + * the rest is reserved */ - .w1c = GENMASK(19, 16), - .ro = GENMASK(21, 20), - .rsvd = GENMASK(31, 22), + .w1c = (BIT(6) | GENMASK(3, 0)) << 16, + .ro = GENMASK(5, 4) << 16, + .rsvd = GENMASK(15, 7) << 16, }, [PCI_EXP_LNKCAP / 4] = { @@ -197,15 +197,16 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { [PCI_EXP_LNKCTL / 4] = { /* - * Link control has bits [1:0] and [11:3] RW, the - * other bits are reserved. - * Link status has bits [13:0] RO, and bits [14:15] + * Link control has bits [15:14], [11:3] and [1:0] RW, the + * rest is reserved. + * + * Link status has bits [13:0] RO, and bits [15:14] * W1C.
*/ - .rw = GENMASK(11, 3) | GENMASK(1, 0), + .rw = GENMASK(15, 14) | GENMASK(11, 3) | GENMASK(1, 0), .ro = GENMASK(13, 0) << 16, .w1c = GENMASK(15, 14) << 16, - .rsvd = GENMASK(15, 12) | BIT(2), + .rsvd = GENMASK(13, 12) | BIT(2), }, [PCI_EXP_SLTCAP / 4] = { @@ -214,19 +215,19 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { [PCI_EXP_SLTCTL / 4] = { /* - * Slot control has bits [12:0] RW, the rest is + * Slot control has bits [14:0] RW, the rest is * reserved. * - * Slot status has a mix of W1C and RO bits, as well - * as reserved bits. + * Slot status has bits 8 and [4:0] W1C, bits [7:5] RO, the + * rest is reserved. */ - .rw = GENMASK(12, 0), + .rw = GENMASK(14, 0), .w1c = (PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD | PCI_EXP_SLTSTA_MRLSC | PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_CC | PCI_EXP_SLTSTA_DLLSC) << 16, .ro = (PCI_EXP_SLTSTA_MRLSS | PCI_EXP_SLTSTA_PDS | PCI_EXP_SLTSTA_EIS) << 16, - .rsvd = GENMASK(15, 13) | (GENMASK(15, 9) << 16), + .rsvd = GENMASK(15) | (GENMASK(15, 9) << 16), }, [PCI_EXP_RTCTL / 4] = { From 1446978d560fd708af4dfc10863109bc098e5b26 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 11 May 2020 12:21:17 -0400 Subject: [PATCH 180/427] PCI: pci-bridge-emul: Eliminate the 'reserved' member Per PCIe 5.0 r1.0, Terms and Acronyms, Page 80: Reserved register fields must be read only and must return 0 (all 0's for multi-bit fields) when read. Reserved encodings for register and packet fields must not be used. Any implementation dependence on a Reserved field value or encoding will result in an implementation that is not PCI Express-compliant. This patch ensures reads will return 0 for any bit not in the Read-Only, Read-Write, or Write-1-to-Clear bitmasks. 
Link: https://lore.kernel.org/r/20200511162117.6674-5-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring --- drivers/pci/pci-bridge-emul.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index 6b1949995dee..ccf26d12ec61 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -24,6 +24,17 @@ #define PCI_CAP_PCIE_START PCI_BRIDGE_CONF_END #define PCI_CAP_PCIE_END (PCI_CAP_PCIE_START + PCI_EXP_SLTSTA2 + 2) +/** + * struct pci_bridge_reg_behavior - register bits behaviors + * @ro: Read-Only bits + * @rw: Read-Write bits + * @w1c: Write-1-to-Clear bits + * + * Reads and Writes will be filtered by specified behavior. All other bits not + * declared are assumed 'Reserved' and will return 0 on reads, per PCIe 5.0: + * "Reserved register fields must be read only and must return 0 (all 0's for + * multi-bit fields) when read". 
+ */ struct pci_bridge_reg_behavior { /* Read-only bits */ u32 ro; @@ -33,9 +44,6 @@ struct pci_bridge_reg_behavior { /* Write-1-to-clear bits */ u32 w1c; - - /* Reserved bits (hardwired to 0) */ - u32 rsvd; }; static const struct pci_bridge_reg_behavior pci_regs_behavior[] = { @@ -49,7 +57,6 @@ static const struct pci_bridge_reg_behavior pci_regs_behavior[] = { PCI_COMMAND_FAST_BACK) | (PCI_STATUS_CAP_LIST | PCI_STATUS_66MHZ | PCI_STATUS_FAST_BACK | PCI_STATUS_DEVSEL_MASK) << 16), - .rsvd = GENMASK(15, 10) | ((BIT(6) | GENMASK(3, 0)) << 16), .w1c = PCI_STATUS_ERROR_BITS << 16, }, [PCI_CLASS_REVISION / 4] = { .ro = ~0 }, @@ -96,8 +103,6 @@ static const struct pci_bridge_reg_behavior pci_regs_behavior[] = { GENMASK(11, 8) | GENMASK(3, 0)), .w1c = PCI_STATUS_ERROR_BITS << 16, - - .rsvd = ((BIT(6) | GENMASK(4, 0)) << 16), }, [PCI_MEMORY_BASE / 4] = { @@ -130,12 +135,10 @@ static const struct pci_bridge_reg_behavior pci_regs_behavior[] = { [PCI_CAPABILITY_LIST / 4] = { .ro = GENMASK(7, 0), - .rsvd = GENMASK(31, 8), }, [PCI_ROM_ADDRESS1 / 4] = { .rw = GENMASK(31, 11) | BIT(0), - .rsvd = GENMASK(10, 1), }, /* @@ -158,8 +161,6 @@ static const struct pci_bridge_reg_behavior pci_regs_behavior[] = { .ro = (GENMASK(15, 8) | ((PCI_BRIDGE_CTL_FAST_BACK) << 16)), .w1c = BIT(10) << 16, - - .rsvd = (GENMASK(15, 12) | BIT(4)) << 16, }, }; @@ -186,13 +187,11 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { */ .w1c = (BIT(6) | GENMASK(3, 0)) << 16, .ro = GENMASK(5, 4) << 16, - .rsvd = GENMASK(15, 7) << 16, }, [PCI_EXP_LNKCAP / 4] = { /* All bits are RO, except bit 23 which is reserved */ .ro = lower_32_bits(~BIT(23)), - .rsvd = BIT(23), }, [PCI_EXP_LNKCTL / 4] = { @@ -206,7 +205,6 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { .rw = GENMASK(15, 14) | GENMASK(11, 3) | GENMASK(1, 0), .ro = GENMASK(13, 0) << 16, .w1c = GENMASK(15, 14) << 16, - .rsvd = GENMASK(13, 12) | BIT(2), }, [PCI_EXP_SLTCAP / 4] = { @@ -227,7 +225,6 @@ static 
const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { PCI_EXP_SLTSTA_CC | PCI_EXP_SLTSTA_DLLSC) << 16, .ro = (PCI_EXP_SLTSTA_MRLSS | PCI_EXP_SLTSTA_PDS | PCI_EXP_SLTSTA_EIS) << 16, - .rsvd = GENMASK(15) | (GENMASK(15, 9) << 16), }, [PCI_EXP_RTCTL / 4] = { @@ -241,7 +238,6 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { PCI_EXP_RTCTL_SEFEE | PCI_EXP_RTCTL_PMEIE | PCI_EXP_RTCTL_CRSSVE), .ro = PCI_EXP_RTCAP_CRSVIS << 16, - .rsvd = GENMASK(15, 5) | (GENMASK(15, 1) << 16), }, [PCI_EXP_RTSTA / 4] = { @@ -251,7 +247,6 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { */ .ro = GENMASK(15, 0) | PCI_EXP_RTSTA_PENDING, .w1c = PCI_EXP_RTSTA_PME, - .rsvd = GENMASK(31, 18), }, }; @@ -359,7 +354,8 @@ int pci_bridge_emul_conf_read(struct pci_bridge_emul *bridge, int where, * Make sure we never return any reserved bit with a value * different from 0. */ - *value &= ~behavior[reg / 4].rsvd; + *value &= behavior[reg / 4].ro | behavior[reg / 4].rw | + behavior[reg / 4].w1c; if (size == 1) *value = (*value >> (8 * (where & 3))) & 0xff; From 668b4490a3a56f062172096692ebe9d12e26be5c Mon Sep 17 00:00:00 2001 From: Alan Mikhak Date: Wed, 1 Apr 2020 16:58:13 -0700 Subject: [PATCH 181/427] PCI: dwc: Program outbound ATU upper limit register Function dw_pcie_prog_outbound_atu_unroll() does not program the upper 32-bit ATU limit register. Since ATU programming functions limit the size of the translated region to 4GB by using a u32 size parameter, these issues may combine into undefined behavior for resource sizes with non-zero upper 32-bits. 
For example, a 128GB address space starting at physical CPU address of 0x2000000000 with size of 0x2000000000 needs the following values programmed into the lower and upper 32-bit limit registers: 0x0000003f in the upper 32-bit limit register 0xffffffff in the lower 32-bit limit register Currently, only the lower 32-bit limit register is programmed with a value of 0xffffffff but the upper 32-bit limit register is not being programmed. As a result, the upper 32-bit limit register remains at its default value after reset of 0x0. These issues may combine to produce undefined behavior since the ATU limit address may be lower than the ATU base address. Programming the upper ATU limit address register prevents such undefined behavior despite the region size getting truncated due to the 32-bit size limit. Link: https://lore.kernel.org/r/1585785493-23210-1-git-send-email-alan.mikhak@sifive.com Signed-off-by: Alan Mikhak Signed-off-by: Lorenzo Pieralisi Acked-by: Gustavo Pimentel --- drivers/pci/controller/dwc/pcie-designware.c | 7 +++++-- drivers/pci/controller/dwc/pcie-designware.h | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 681548c88282..c92496e36fd5 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -244,13 +244,16 @@ static void dw_pcie_prog_outbound_atu_unroll(struct dw_pcie *pci, int index, u64 pci_addr, u32 size) { u32 retries, val; + u64 limit_addr = cpu_addr + size - 1; dw_pcie_writel_ob_unroll(pci, index, PCIE_ATU_UNR_LOWER_BASE, lower_32_bits(cpu_addr)); dw_pcie_writel_ob_unroll(pci, index, PCIE_ATU_UNR_UPPER_BASE, upper_32_bits(cpu_addr)); - dw_pcie_writel_ob_unroll(pci, index, PCIE_ATU_UNR_LIMIT, - lower_32_bits(cpu_addr + size - 1)); + dw_pcie_writel_ob_unroll(pci, index, PCIE_ATU_UNR_LOWER_LIMIT, + lower_32_bits(limit_addr)); + dw_pcie_writel_ob_unroll(pci, index,
PCIE_ATU_UNR_UPPER_LIMIT, + upper_32_bits(limit_addr)); dw_pcie_writel_ob_unroll(pci, index, PCIE_ATU_UNR_LOWER_TARGET, lower_32_bits(pci_addr)); dw_pcie_writel_ob_unroll(pci, index, PCIE_ATU_UNR_UPPER_TARGET, diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index d6e1f397e6b0..656e00f8fbeb 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -112,9 +112,10 @@ #define PCIE_ATU_UNR_REGION_CTRL2 0x04 #define PCIE_ATU_UNR_LOWER_BASE 0x08 #define PCIE_ATU_UNR_UPPER_BASE 0x0C -#define PCIE_ATU_UNR_LIMIT 0x10 +#define PCIE_ATU_UNR_LOWER_LIMIT 0x10 #define PCIE_ATU_UNR_LOWER_TARGET 0x14 #define PCIE_ATU_UNR_UPPER_TARGET 0x18 +#define PCIE_ATU_UNR_UPPER_LIMIT 0x20 /* * The default address offset between dbi_base and atu_base. Root controller From 558c1225a2f33a46ee31fba6f2bc4c4838d58752 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 15 Apr 2020 16:49:53 +0800 Subject: [PATCH 182/427] PCI: dwc: intel: Make intel_pcie_cpu_addr() static Fix the following sparse warning: drivers/pci/controller/dwc/pcie-intel-gw.c:456:5: warning: symbol 'intel_pcie_cpu_addr' was not declared. Should it be static? 
Link: https://lore.kernel.org/r/20200415084953.6533-1-yanaijie@huawei.com Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-intel-gw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-intel-gw.c b/drivers/pci/controller/dwc/pcie-intel-gw.c index fc2a12212dec..2d8dbb318087 100644 --- a/drivers/pci/controller/dwc/pcie-intel-gw.c +++ b/drivers/pci/controller/dwc/pcie-intel-gw.c @@ -453,7 +453,7 @@ static int intel_pcie_msi_init(struct pcie_port *pp) return 0; } -u64 intel_pcie_cpu_addr(struct dw_pcie *pcie, u64 cpu_addr) +static u64 intel_pcie_cpu_addr(struct dw_pcie *pcie, u64 cpu_addr) { return cpu_addr + BUS_IATU_OFFSET; } From c8a119779f5609de8dcd98630f71cc7f1b2e4e8c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 29 Apr 2020 01:50:27 +0000 Subject: [PATCH 183/427] PCI: dwc: pci-dra7xx: Use devm_platform_ioremap_resource_byname() platform_get_resource() may fail and return NULL, so we had better check its return value to avoid a NULL pointer dereference a bit later in the code. Fix it to use devm_platform_ioremap_resource_byname() instead of calling platform_get_resource_byname() and devm_ioremap(). 
Link: https://lore.kernel.org/r/20200429015027.134485-1-weiyongjun1@huawei.com Signed-off-by: Wei Yongjun [lorenzo.pieralisi@arm.com: commit log] Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pci-dra7xx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c index 3b0e58f2de58..6184ebc9392d 100644 --- a/drivers/pci/controller/dwc/pci-dra7xx.c +++ b/drivers/pci/controller/dwc/pci-dra7xx.c @@ -840,7 +840,6 @@ static int __init dra7xx_pcie_probe(struct platform_device *pdev) struct phy **phy; struct device_link **link; void __iomem *base; - struct resource *res; struct dw_pcie *pci; struct dra7xx_pcie *dra7xx; struct device *dev = &pdev->dev; @@ -877,10 +876,9 @@ static int __init dra7xx_pcie_probe(struct platform_device *pdev) return irq; } - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ti_conf"); - base = devm_ioremap(dev, res->start, resource_size(res)); - if (!base) - return -ENOMEM; + base = devm_platform_ioremap_resource_byname(pdev, "ti_conf"); + if (IS_ERR(base)) + return PTR_ERR(base); phy_count = of_property_count_strings(np, "phy-names"); if (phy_count < 0) { From 0414b93e78d87ecc24ae1a7e61fe97deb29fa2f4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 1 May 2020 12:39:21 +0100 Subject: [PATCH 184/427] PCI: dwc: Fix inner MSI IRQ domain registration On a system that uses the internal DWC MSI widget, I get this warning from debugfs when CONFIG_GENERIC_IRQ_DEBUGFS is selected: debugfs: File ':soc:pcie@fc000000' in directory 'domains' already present! This is due to the fact that the DWC MSI code tries to register two IRQ domains for the same firmware node, without telling the low level code how to distinguish them (by setting a bus token). This further confuses debugfs which tries to create corresponding files for each domain. 
Fix it by tagging the inner domain as DOMAIN_BUS_NEXUS, which is the closest thing we have to "generic MSI". Link: https://lore.kernel.org/r/20200501113921.366597-1-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Jingoo Han --- drivers/pci/controller/dwc/pcie-designware-host.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index 395feb8ca051..3c43311bb95c 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -264,6 +264,8 @@ int dw_pcie_allocate_domains(struct pcie_port *pp) return -ENOMEM; } + irq_domain_update_bus_token(pp->irq_domain, DOMAIN_BUS_NEXUS); + pp->msi_domain = pci_msi_create_irq_domain(fwnode, &dw_pcie_msi_domain_info, pp->irq_domain); From 87dccf09323fc363bd0d072fcc12b96622ab8c69 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 Apr 2020 17:42:30 +0100 Subject: [PATCH 185/427] PCI: amlogic: meson: Don't use FAST_LINK_MODE to set up link The vim3l board does not work with a standard PCIe switch (ASM1184e), spitting all kinds of errors - hinting at HW misconfiguration (no link, port enumeration issues, etc). According to the Synopsys DWC PCIe Reference Manual, in the section dedicated to the PLCR register, bit 7 is described (FAST_LINK_MODE) as: "Sets all internal timers to fast mode for simulation purposes." It is sound to set this bit from a simulation perspective, but on actual silicon, which expects timers to have a nominal value, it is not. Make sure the FAST_LINK_MODE bit is cleared when configuring the RC to solve this problem.
Link: https://lore.kernel.org/r/20200429164230.309922-1-maz@kernel.org Fixes: 9c0ef6d34fdb ("PCI: amlogic: Add the Amlogic Meson PCIe controller driver") Signed-off-by: Marc Zyngier [lorenzo.pieralisi@arm.com: commit log] Signed-off-by: Lorenzo Pieralisi Reviewed-by: Neil Armstrong Acked-by: Rob Herring --- drivers/pci/controller/dwc/pci-meson.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-meson.c b/drivers/pci/controller/dwc/pci-meson.c index 3715dceca1bf..ca59ba9e0ecd 100644 --- a/drivers/pci/controller/dwc/pci-meson.c +++ b/drivers/pci/controller/dwc/pci-meson.c @@ -289,11 +289,11 @@ static void meson_pcie_init_dw(struct meson_pcie *mp) meson_cfg_writel(mp, val, PCIE_CFG0); val = meson_elb_readl(mp, PCIE_PORT_LINK_CTRL_OFF); - val &= ~LINK_CAPABLE_MASK; + val &= ~(LINK_CAPABLE_MASK | FAST_LINK_MODE); meson_elb_writel(mp, val, PCIE_PORT_LINK_CTRL_OFF); val = meson_elb_readl(mp, PCIE_PORT_LINK_CTRL_OFF); - val |= LINK_CAPABLE_X1 | FAST_LINK_MODE; + val |= LINK_CAPABLE_X1; meson_elb_writel(mp, val, PCIE_PORT_LINK_CTRL_OFF); val = meson_elb_readl(mp, PCIE_GEN2_CTRL_OFF); From 03f8c1b350d001db4e3912095f09a68740a7ff23 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Fri, 20 Dec 2019 15:35:50 +0530 Subject: [PATCH 186/427] PCI: dwc: Use private data pointer of "struct irq_domain" to get pcie_port No functional change. 
Get "struct pcie_port *" from private data pointer of "struct irq_domain" in dw_pcie_irq_domain_free() to make it look similar to how "struct pcie_port *" is obtained in dw_pcie_irq_domain_alloc() Link: https://lore.kernel.org/r/20191220100550.777-1-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi Acked-by: Gustavo Pimentel --- drivers/pci/controller/dwc/pcie-designware-host.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index 3c43311bb95c..0a4a5aa6fe46 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -236,7 +236,7 @@ static void dw_pcie_irq_domain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { struct irq_data *d = irq_domain_get_irq_data(domain, virq); - struct pcie_port *pp = irq_data_get_irq_chip_data(d); + struct pcie_port *pp = domain->host_data; unsigned long flags; raw_spin_lock_irqsave(&pp->lock, flags); From b4756d43a1dd2cfb778eb3cef3ba2efd2dcd5263 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 22 May 2020 10:58:53 +0200 Subject: [PATCH 187/427] dm zoned: remove leftover hunk for switching to sequential zones Remove a leftover hunk to switch from random zones to sequential zones when selecting a reclaim zone; the logic has moved into the caller and this hunk is now pointless. 
Fixes: 34f5affd04c4 ("dm zoned: separate random and cache zones") Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index db0dc2b5d44d..4a2e351365c5 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -2111,14 +2111,6 @@ again: */ if (!(flags & DMZ_ALLOC_RECLAIM)) return NULL; - /* - * Use sequential write zones if we started off with random - * zones and the list is empty - */ - if (list == &zmd->unmap_rnd_list) { - list = &zmd->unmap_seq_list; - goto again; - } /* * Fallback to the reserved sequential zones */ From 8eb613c0b8f19627ba1846dcf78bb2c85edbe8dd Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sun, 3 May 2020 01:00:02 -0400 Subject: [PATCH 188/427] ima: verify mprotect change is consistent with mmap policy Files can be mmap'ed read/write and later changed to execute to circumvent IMA's mmap appraise policy rules. Due to locking issues (mmap semaphore would be taken prior to i_mutex), files can not be measured or appraised at this point. Eliminate this integrity gap, by denying the mprotect PROT_EXECUTE change, if an mmap appraise policy rule exists. On mprotect change success, return 0. On failure, return -EPERM. 
Reviewed-by: Lakshmi Ramasubramanian Signed-off-by: Mimi Zohar --- include/linux/ima.h | 7 +++++ security/integrity/ima/ima_main.c | 51 +++++++++++++++++++++++++++++++ security/security.c | 7 ++++- 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/include/linux/ima.h b/include/linux/ima.h index aefe758f4466..9164e1534ec9 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -18,6 +18,7 @@ extern int ima_file_check(struct file *file, int mask); extern void ima_post_create_tmpfile(struct inode *inode); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); +extern int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot); extern int ima_load_data(enum kernel_load_data_id id); extern int ima_read_file(struct file *file, enum kernel_read_file_id id); extern int ima_post_read_file(struct file *file, void *buf, loff_t size, @@ -70,6 +71,12 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot) return 0; } +static inline int ima_file_mprotect(struct vm_area_struct *vma, + unsigned long prot) +{ + return 0; +} + static inline int ima_load_data(enum kernel_load_data_id id) { return 0; diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index f96f151294e6..800fb3bba418 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -393,6 +393,57 @@ int ima_file_mmap(struct file *file, unsigned long prot) return 0; } +/** + * ima_file_mprotect - based on policy, limit mprotect change + * @prot: contains the protection that will be applied by the kernel. + * + * Files can be mmap'ed read/write and later changed to execute to circumvent + * IMA's mmap appraisal policy rules. Due to locking issues (mmap semaphore + * would be taken before i_mutex), files can not be measured or appraised at + * this point. 
Eliminate this integrity gap by denying the mprotect + * PROT_EXECUTE change, if an mmap appraise policy rule exists. + * + * On mprotect change success, return 0. On failure, return -EACESS. + */ +int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot) +{ + struct ima_template_desc *template; + struct file *file = vma->vm_file; + char filename[NAME_MAX]; + char *pathbuf = NULL; + const char *pathname = NULL; + struct inode *inode; + int result = 0; + int action; + u32 secid; + int pcr; + + /* Is mprotect making an mmap'ed file executable? */ + if (!vma->vm_file || !(prot & PROT_EXEC) || (vma->vm_flags & VM_EXEC)) + return 0; + + security_task_getsecid(current, &secid); + inode = file_inode(vma->vm_file); + action = ima_get_action(inode, current_cred(), secid, MAY_EXEC, + MMAP_CHECK, &pcr, &template, 0); + + /* Is the mmap'ed file in policy? */ + if (!(action & (IMA_MEASURE | IMA_APPRAISE_SUBMASK))) + return 0; + + if (action & IMA_APPRAISE_SUBMASK) + result = -EPERM; + + file = vma->vm_file; + pathname = ima_d_path(&file->f_path, &pathbuf, filename); + integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, pathname, + "collect_data", "failed-mprotect", result, 0); + if (pathbuf) + __putname(pathbuf); + + return result; +} + /** * ima_bprm_check - based on policy, collect/store measurement. 
* @bprm: contains the linux_binprm structure diff --git a/security/security.c b/security/security.c index 7fed24b9d57e..dd0917c5bfe9 100644 --- a/security/security.c +++ b/security/security.c @@ -1512,7 +1512,12 @@ int security_mmap_addr(unsigned long addr) int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { - return call_int_hook(file_mprotect, 0, vma, reqprot, prot); + int ret; + + ret = call_int_hook(file_mprotect, 0, vma, reqprot, prot); + if (ret) + return ret; + return ima_file_mprotect(vma, prot); } int security_file_lock(struct file *file, unsigned int cmd) From bcfefb61cd2bc86329915a4074f7b4c48b00b33a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 30 Apr 2020 15:18:45 +0900 Subject: [PATCH 189/427] kconfig: announce removal of 'kvmconfig' and 'xenconfig' shorthands kvmconfig' is a shorthand for kvm_guest.config to save 7 character typing. xenconfig' is a shorthand for xen.config to save 1 character typing. There is nothing more than that. There are more files in kernel/configs/, so it is not maintainable to wire-up every config fragment to the Kconfig Makefile. Hence, we should not do this at all. These will be removed after Linux 5.10. Meanwhile, the following warning message will be displayed if they are used. WARNING: 'make kvmconfig' will be removed after Linux 5.10 Please use 'make kvm_guest.config' instead. Signed-off-by: Masahiro Yamada --- scripts/kconfig/Makefile | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index f3355bd86aa5..426881ea954f 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -96,11 +96,13 @@ configfiles=$(wildcard $(srctree)/kernel/configs/$@ $(srctree)/arch/$(SRCARCH)/c PHONY += kvmconfig kvmconfig: kvm_guest.config - @: + @echo >&2 "WARNING: 'make $@' will be removed after Linux 5.10" + @echo >&2 " Please use 'make $<' instead." 
PHONY += xenconfig xenconfig: xen.config - @: + @echo >&2 "WARNING: 'make $@' will be removed after Linux 5.10" + @echo >&2 " Please use 'make $<' instead." PHONY += tinyconfig tinyconfig: @@ -139,9 +141,6 @@ help: @echo ' helpnewconfig - List new options and help text' @echo ' olddefconfig - Same as oldconfig but sets new symbols to their' @echo ' default value without prompting' - @echo ' kvmconfig - Enable additional options for kvm guest kernel support' - @echo ' xenconfig - Enable additional options for xen dom0 and guest kernel' - @echo ' support' @echo ' tinyconfig - Configure the tiniest possible kernel' @echo ' testconfig - Run Kconfig unit tests (requires python3 and pytest)' From 3044dd05289d6c768c7c5d00f58208fc1f3927f3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 1 May 2020 15:01:41 +0900 Subject: [PATCH 190/427] kbuild: invoke syncconfig if autoconf.h is missing If include/generated/autoconf.h is accidentally lost somehow, there is no clear way to fix it. Make it self-healing. Signed-off-by: Masahiro Yamada --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index faec37f23c48..e329bd4a0f17 100644 --- a/Makefile +++ b/Makefile @@ -698,7 +698,7 @@ $(KCONFIG_CONFIG): # This exploits the 'multi-target pattern rule' trick. # The syncconfig should be executed only once to make all the targets. 
# (Note: use the grouped target '&:' when we bump to GNU Make 4.3) -%/auto.conf %/auto.conf.cmd: $(KCONFIG_CONFIG) +%/config/auto.conf %/config/auto.conf.cmd %/generated/autoconf.h: $(KCONFIG_CONFIG) $(Q)$(MAKE) -f $(srctree)/Makefile syncconfig else # !may-sync-config # External modules and some install targets need include/generated/autoconf.h @@ -1148,7 +1148,8 @@ scripts: scripts_basic scripts_dtc PHONY += prepare archprepare archprepare: outputmakefile archheaders archscripts scripts include/config/kernel.release \ - asm-generic $(version_h) $(autoksyms_h) include/generated/utsrelease.h + asm-generic $(version_h) $(autoksyms_h) include/generated/utsrelease.h \ + include/generated/autoconf.h prepare0: archprepare $(Q)$(MAKE) $(build)=scripts/mod From 610134b750bb33675864a9041090886cd787bd04 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 4 May 2020 17:08:06 +0900 Subject: [PATCH 191/427] kbuild: remove misleading stale FIXME comment This comment was added by commit ("kbuild: Restore build nr, improve vmlinux link") [1]. It was talking about if_changed_rule at that time. Now, it is unclear what to fix. 
[1]: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/?id=ea52ca1b3e3882b499cc6c043f384958b88b62ff Signed-off-by: Masahiro Yamada --- Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Makefile b/Makefile index e329bd4a0f17..97c819ae2ea7 100644 --- a/Makefile +++ b/Makefile @@ -1835,9 +1835,6 @@ tools/%: FORCE $(Q)mkdir -p $(objtree)/tools $(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(tools_silent) $(filter --j% -j,$(MAKEFLAGS))" O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ $* -# FIXME Should go into a make.lib or something -# =========================================================================== - quiet_cmd_rmdirs = $(if $(wildcard $(rm-dirs)),CLEAN $(wildcard $(rm-dirs))) cmd_rmdirs = rm -rf $(rm-dirs) From 0663c68c4d2d3b74055b4c1cf7d8ae4782774e53 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 4 May 2020 17:08:07 +0900 Subject: [PATCH 192/427] kbuild: remove {CLEAN,MRPROPER,DISTCLEAN}_DIRS Merge {CLEAN,MRPROPER,DISTCLEAN}_DIRS into {CLEAN,MRPROPER,DISTCLEAN}_FILES because the difference is just the -r option passed to the 'rm' command. Do likewise as commit 1634f2bfdb84 ("kbuild: remove clean-dirs syntax"). 
Signed-off-by: Masahiro Yamada --- Makefile | 22 ++++++---------------- arch/um/Makefile | 2 +- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index 97c819ae2ea7..385e0930ca6d 100644 --- a/Makefile +++ b/Makefile @@ -1397,14 +1397,14 @@ endif # CONFIG_MODULES # make distclean Remove editor backup files, patch leftover files and the like # Directories & files removed with 'make clean' -CLEAN_DIRS += include/ksym -CLEAN_FILES += modules.builtin modules.builtin.modinfo modules.nsdeps +CLEAN_FILES += include/ksym \ + modules.builtin modules.builtin.modinfo modules.nsdeps # Directories & files removed with 'make mrproper' -MRPROPER_DIRS += include/config include/generated \ +MRPROPER_FILES += include/config include/generated \ arch/$(SRCARCH)/include/generated .tmp_objdiff \ - debian/ snap/ tar-install/ -MRPROPER_FILES += .config .config.old .version \ + debian snap tar-install \ + .config .config.old .version \ Module.symvers \ signing_key.pem signing_key.priv signing_key.x509 \ x509.genkey extra_certificates signing_key.x509.keyid \ @@ -1412,12 +1412,10 @@ MRPROPER_FILES += .config .config.old .version \ *.spec # Directories & files removed with 'make distclean' -DISTCLEAN_DIRS += DISTCLEAN_FILES += tags TAGS cscope* GPATH GTAGS GRTAGS GSYMS # clean - Delete most, but leave enough to build external modules # -clean: rm-dirs := $(CLEAN_DIRS) clean: rm-files := $(CLEAN_FILES) PHONY += archclean vmlinuxclean @@ -1430,7 +1428,6 @@ clean: archclean vmlinuxclean # mrproper - Delete all generated files, including .config # -mrproper: rm-dirs := $(wildcard $(MRPROPER_DIRS)) mrproper: rm-files := $(wildcard $(MRPROPER_FILES)) mrproper-dirs := $(addprefix _mrproper_,scripts) @@ -1439,18 +1436,15 @@ $(mrproper-dirs): $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@) mrproper: clean $(mrproper-dirs) - $(call cmd,rmdirs) $(call cmd,rmfiles) # distclean # -distclean: rm-dirs := $(wildcard $(DISTCLEAN_DIRS)) distclean: rm-files := $(wildcard 
$(DISTCLEAN_FILES)) PHONY += distclean distclean: mrproper - $(call cmd,rmdirs) $(call cmd,rmfiles) @find $(srctree) $(RCS_FIND_IGNORE) \ \( -name '*.orig' -o -name '*.rej' -o -name '*~' \ @@ -1740,7 +1734,6 @@ $(clean-dirs): $(Q)$(MAKE) $(clean)=$(patsubst _clean_%,%,$@) clean: $(clean-dirs) - $(call cmd,rmdirs) $(call cmd,rmfiles) @find $(if $(KBUILD_EXTMOD), $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \ \( -name '*.[aios]' -o -name '*.ko' -o -name '.*.cmd' \ @@ -1835,11 +1828,8 @@ tools/%: FORCE $(Q)mkdir -p $(objtree)/tools $(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(tools_silent) $(filter --j% -j,$(MAKEFLAGS))" O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ $* -quiet_cmd_rmdirs = $(if $(wildcard $(rm-dirs)),CLEAN $(wildcard $(rm-dirs))) - cmd_rmdirs = rm -rf $(rm-dirs) - quiet_cmd_rmfiles = $(if $(wildcard $(rm-files)),CLEAN $(wildcard $(rm-files))) - cmd_rmfiles = rm -f $(rm-files) + cmd_rmfiles = rm -rf $(rm-files) # Run depmod only if we have System.map and depmod is executable quiet_cmd_depmod = DEPMOD $(KERNELRELEASE) diff --git a/arch/um/Makefile b/arch/um/Makefile index 275f5ffdf6f0..3f27aa3ec0a6 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -140,7 +140,7 @@ export CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE) # When cleaning we don't include .config, so we don't include # TT or skas makefiles and don't clean skas_ptregs.h. CLEAN_FILES += linux x.i gmon.out -MRPROPER_DIRS += arch/$(SUBARCH)/include/generated +MRPROPER_FILES += arch/$(SUBARCH)/include/generated archclean: @find . \( -name '*.bb' -o -name '*.bbg' -o -name '*.da' \ From 5967577231f9b19acd5a59485e9075964065bbe3 Mon Sep 17 00:00:00 2001 From: Siddharth Gupta Date: Tue, 5 May 2020 18:52:37 -0700 Subject: [PATCH 193/427] scripts: headers_install: Exit with error on config leak Misuse of CONFIG_* in UAPI headers should result in an error. 
These config options can be set in userspace by the user application which includes these headers to control the APIs and structures being used in a kernel which supports multiple targets. Signed-off-by: Siddharth Gupta Signed-off-by: Masahiro Yamada --- scripts/headers_install.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh index a07668a5c36b..94a833597a88 100755 --- a/scripts/headers_install.sh +++ b/scripts/headers_install.sh @@ -64,7 +64,7 @@ configs=$(sed -e ' d ' $OUTFILE) -# The entries in the following list are not warned. +# The entries in the following list do not result in an error. # Please do not add a new entry. This list is only for existing ones. # The list will be reduced gradually, and deleted eventually. (hopefully) # @@ -98,18 +98,19 @@ include/uapi/linux/raw.h:CONFIG_MAX_RAW_DEVS for c in $configs do - warn=1 + leak_error=1 for ignore in $config_leak_ignores do if echo "$INFILE:$c" | grep -q "$ignore$"; then - warn= + leak_error= break fi done - if [ "$warn" = 1 ]; then - echo "warning: $INFILE: leak $c to user-space" >&2 + if [ "$leak_error" = 1 ]; then + echo "error: $INFILE: leak $c to user-space" >&2 + exit 1 fi done From 859c81750130844590a83eff847c6c55e2340ab1 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Thu, 7 May 2020 13:56:01 -0500 Subject: [PATCH 194/427] modpost,fixdep: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Masahiro Yamada --- scripts/basic/fixdep.c | 2 +- scripts/mod/modpost.c | 2 +- scripts/mod/modpost.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/basic/fixdep.c b/scripts/basic/fixdep.c index 877ca2c88246..d98540552941 100644 --- a/scripts/basic/fixdep.c +++ b/scripts/basic/fixdep.c @@ -160,7 +160,7 @@ struct item { struct item *next; unsigned int len; unsigned int hash; - char name[0]; + char name[]; }; #define HASHSZ 256 diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 5c3c50c5ec52..4d4b979d76be 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -166,7 +166,7 @@ struct symbol { * (only for external modules) **/ unsigned int is_static:1; /* 1 if symbol is not global */ enum export export; /* Type of export */ - char name[0]; + char name[]; }; static struct symbol *symbolhash[SYMBOL_HASH_SIZE]; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 60dca9b7106b..39f6c29fb568 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -111,7 +111,7 @@ buf_write(struct buffer *buf, const char *s, int len); struct namespace_list { struct namespace_list *next; - char namespace[0]; + char namespace[]; }; struct module { From 677f1410e05813fde62d724d9210fce04c505fc7 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Fri, 8 May 2020 16:33:14 +0530 Subject: [PATCH 195/427] scripts/checkstack.pl: don't display $dre as different entity currently script prints stack usage for functions in two ways:($re and $dre) dre breaks sorting mechanism. 0xffffa00011f26f88 sunxi_mux_clk_setup.isra.0 [vmlinux]:Dynamic (0x140) .. 0xffffa00011f27210 sunxi_divs_clk_setup [vmlinux]: Dynamic (0x1d0) so we can print it in decimal only. Also address before function name is changed to function start address rather than stack consumption address. Because in next patch, arm has two ways to use stack which can be clubbed and printed in one function only. 
All symbols whose stack by adding(re and dre) is greater than 100, will be printed. 0xffffa00011f2720c0 sunxi_divs_clk_setup [vmlinux]: 464 ... 0xffffa00011f26f840 sunxi_mux_clk_setup.isra.0 [vmlinux]:320 Co-developed-by: Vaneet Narang Signed-off-by: Vaneet Narang Signed-off-by: Maninder Singh Signed-off-by: Masahiro Yamada --- scripts/checkstack.pl | 54 +++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index 371bd17a4983..695710895560 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -109,11 +109,30 @@ my (@stack, $re, $dre, $x, $xs, $funcre); # # main() # -my ($func, $file, $lastslash); +my ($func, $file, $lastslash, $total_size, $addr, $intro); + +$total_size = 0; while (my $line = ) { if ($line =~ m/$funcre/) { $func = $1; + next if $line !~ m/^($xs*)/; + if ($total_size > 100) { + push @stack, "$intro$total_size\n"; + } + + $addr = $1; + $addr =~ s/ /0/g; + $addr = "0x$addr"; + + $intro = "$addr $func [$file]:"; + my $padlen = 56 - length($intro); + while ($padlen > 0) { + $intro .= ' '; + $padlen -= 8; + } + + $total_size = 0; } elsif ($line =~ m/(.*):\s*file format/) { $file = $1; @@ -134,37 +153,18 @@ while (my $line = ) { } next if ($size > 0x10000000); - next if $line !~ m/^($xs*)/; - my $addr = $1; - $addr =~ s/ /0/g; - $addr = "0x$addr"; - - my $intro = "$addr $func [$file]:"; - my $padlen = 56 - length($intro); - while ($padlen > 0) { - $intro .= ' '; - $padlen -= 8; - } - next if ($size < 100); - push @stack, "$intro$size\n"; + $total_size += $size; } elsif (defined $dre && $line =~ m/$dre/) { - my $size = "Dynamic ($1)"; + my $size = $1; - next if $line !~ m/^($xs*)/; - my $addr = $1; - $addr =~ s/ /0/g; - $addr = "0x$addr"; - - my $intro = "$addr $func [$file]:"; - my $padlen = 56 - length($intro); - while ($padlen > 0) { - $intro .= ' '; - $padlen -= 8; - } - push @stack, "$intro$size\n"; + $size = hex($size) if ($size =~ /^0x/); + 
$total_size += $size; } } +if ($total_size > 100) { + push @stack, "$intro$total_size\n"; +} # Sort output by size (last field) print sort { ($b =~ /:\t*(\d+)$/)[0] <=> ($a =~ /:\t*(\d+)$/)[0] } @stack; From 572220aad525bd3650f796d7e29cc06d41df4235 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Fri, 8 May 2020 16:33:15 +0530 Subject: [PATCH 196/427] scripts/checkstack.pl: Add argument to print stacks greater than value. Add arguments support to print stacks which are greater than argument value only. Co-developed-by: Vaneet Narang Signed-off-by: Vaneet Narang Signed-off-by: Maninder Singh Signed-off-by: Masahiro Yamada --- scripts/checkstack.pl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index 695710895560..bc23cc7edcaa 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -35,7 +35,7 @@ use strict; # $1 (first bracket) matches the dynamic amount of the stack growth # # use anything else and feel the pain ;) -my (@stack, $re, $dre, $x, $xs, $funcre); +my (@stack, $re, $dre, $x, $xs, $funcre, $min_stack); { my $arch = shift; if ($arch eq "") { @@ -43,6 +43,11 @@ my (@stack, $re, $dre, $x, $xs, $funcre); chomp($arch); } + $min_stack = shift; + if ($min_stack eq "" || $min_stack !~ /^\d+$/) { + $min_stack = 100; + } + $x = "[0-9a-f]"; # hex character $xs = "[0-9a-f ]"; # hex character or space $funcre = qr/^$x* <(.*)>:$/; @@ -117,7 +122,7 @@ while (my $line = ) { if ($line =~ m/$funcre/) { $func = $1; next if $line !~ m/^($xs*)/; - if ($total_size > 100) { + if ($total_size > $min_stack) { push @stack, "$intro$total_size\n"; } @@ -162,7 +167,7 @@ while (my $line = ) { $total_size += $size; } } -if ($total_size > 100) { +if ($total_size > $min_stack) { push @stack, "$intro$total_size\n"; } From 3311eeebae94b37a21b37af4410bb5e2fe3dc0c0 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Fri, 8 May 2020 16:33:16 +0530 Subject: [PATCH 197/427] scripts/checkstack.pl: add arm 
push handling for stack usage To count stack usage of push {*, fp, ip, lr, pc} instruction in ARM, if FRAME POINTER is enabled. e.g. c01f0d48: e92ddff0 push {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr, pc} c01f0d50 : c01f0d44: e1a0c00d mov ip, sp c01f0d48: e92ddff0 push {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr, pc} c01f0d4c: e24cb004 sub fp, ip, #4 c01f0d50: e24dd094 sub sp, sp, #448 ; 0x1C0 $ cat dump | scripts/checkstack.pl arm 0xc01f0d50 Y []: 448 added subroutine frame work for this. After change: 0xc01f0d500 Y []: 492 Co-developed-by: Vaneet Narang Signed-off-by: Vaneet Narang Signed-off-by: Maninder Singh Signed-off-by: Masahiro Yamada --- scripts/checkstack.pl | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index bc23cc7edcaa..bc07e19f2786 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -34,8 +34,10 @@ use strict; # $& (whole re) matches the complete objdump line with the stack growth # $1 (first bracket) matches the dynamic amount of the stack growth # +# $sub: subroutine for special handling to check stack usage. +# # use anything else and feel the pain ;) -my (@stack, $re, $dre, $x, $xs, $funcre, $min_stack); +my (@stack, $re, $dre, $sub, $x, $xs, $funcre, $min_stack); { my $arch = shift; if ($arch eq "") { @@ -59,6 +61,7 @@ my (@stack, $re, $dre, $x, $xs, $funcre, $min_stack); } elsif ($arch eq 'arm') { #c0008ffc: e24dd064 sub sp, sp, #100 ; 0x64 $re = qr/.*sub.*sp, sp, #(([0-9]{2}|[3-9])[0-9]{2})/o; + $sub = \&arm_push_handling; } elsif ($arch =~ /^x86(_64)?$/ || $arch =~ /^i[3456]86$/) { #c0105234: 81 ec ac 05 00 00 sub $0x5ac,%esp # or @@ -111,6 +114,24 @@ my (@stack, $re, $dre, $x, $xs, $funcre, $min_stack); } } +# +# To count stack usage of push {*, fp, ip, lr, pc} instruction in ARM, +# if FRAME POINTER is enabled. +# e.g. 
c01f0d48: e92ddff0 push {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr, pc} +# +sub arm_push_handling { + my $regex = qr/.*push.*fp, ip, lr, pc}/o; + my $size = 0; + my $line_arg = shift; + + if ($line_arg =~ m/$regex/) { + $size = $line_arg =~ tr/,//; + $size = ($size + 1) * 4; + } + + return $size; +} + # # main() # @@ -166,6 +187,11 @@ while (my $line = ) { $size = hex($size) if ($size =~ /^0x/); $total_size += $size; } + elsif (defined $sub) { + my $size = &$sub($line); + + $total_size += $size; + } } if ($total_size > $min_stack) { push @stack, "$intro$total_size\n"; From 6ce16f2bc879fb8943d2165f81862c6f89ec1b77 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Fri, 8 May 2020 16:33:17 +0530 Subject: [PATCH 198/427] scripts/checkstack.pl: fix arm sp regex if objdump has below entries; c01ed608 : c01ed614: e24ddff7 sub sp, sp, #120 ; 0x78 c01f0d50 : c01f0d50: e24dd094 sub sp, sp, #140 ; 0x8c scripts fails to read stack usage. so making regex $re for ARM similar to aarch64 Co-developed-by: Vaneet Narang Signed-off-by: Vaneet Narang Signed-off-by: Maninder Singh Signed-off-by: Masahiro Yamada --- scripts/checkstack.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index bc07e19f2786..d2c38584ece6 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -60,7 +60,7 @@ my (@stack, $re, $dre, $sub, $x, $xs, $funcre, $min_stack); $dre = qr/^.*sub.*sp, sp, #(0x$x{1,8})/o; } elsif ($arch eq 'arm') { #c0008ffc: e24dd064 sub sp, sp, #100 ; 0x64 - $re = qr/.*sub.*sp, sp, #(([0-9]{2}|[3-9])[0-9]{2})/o; + $re = qr/.*sub.*sp, sp, #([0-9]{1,4})/o; $sub = \&arm_push_handling; } elsif ($arch =~ /^x86(_64)?$/ || $arch =~ /^i[3456]86$/) { #c0105234: 81 ec ac 05 00 00 sub $0x5ac,%esp From 9f64fbdb774838db1445268fa8c46041fb1c28ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Kl=20=C4=93=20tnieks?= Date: Sat, 9 May 2020 00:47:19 -0400 Subject: [PATCH 199/427] bpfilter: document build requirements for 
bpfilter_umh It's not intuitively obvious that bpfilter_umh is a statically linked binary. Mention the toolchain requirement in the Kconfig help, so people have an easier time figuring out what's needed. Signed-off-by: Valdis Kletnieks Signed-off-by: Masahiro Yamada --- net/bpfilter/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig index 045144d4a42c..84015ef3ee27 100644 --- a/net/bpfilter/Kconfig +++ b/net/bpfilter/Kconfig @@ -13,4 +13,8 @@ config BPFILTER_UMH default m help This builds bpfilter kernel module with embedded user mode helper + + Note: your toolchain must support building static binaries, since + rootfs isn't mounted at the time when __init functions are called + and do_execv won't be able to find the elf interpreter. endif From 827365ffdaa9aa9c0b423800c4d0e72b1fbb938e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 10 May 2020 11:00:44 +0900 Subject: [PATCH 200/427] gcc-plugins: remove always-false $(if ...) in Makefile This is the remnant of commit c17d6179ad5a ("gcc-plugins: remove unused GCC_PLUGIN_SUBDIR"). The conditional $(if $(findstring /,$(p)),...) is always false because none of plugins contains '/' in the file name. Clean up the code. 
Signed-off-by: Masahiro Yamada Reviewed-by: Kees Cook --- scripts/gcc-plugins/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile index 80f354289eeb..4014ba7e2fbd 100644 --- a/scripts/gcc-plugins/Makefile +++ b/scripts/gcc-plugins/Makefile @@ -14,7 +14,7 @@ $(objtree)/$(obj)/randomize_layout_seed.h: FORCE $(call if_changed,create_randomize_layout_seed) targets = randomize_layout_seed.h randomize_layout_hash.h -hostcxxlibs-y := $(foreach p,$(GCC_PLUGIN),$(if $(findstring /,$(p)),,$(p))) +hostcxxlibs-y := $(GCC_PLUGIN) always-y := $(hostcxxlibs-y) $(foreach p,$(hostcxxlibs-y:%.so=%),$(eval $(p)-objs := $(p).o)) From 93fdddfefc831981c8cffc3db90275e9b1d8a0e5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 11 May 2020 12:50:12 +0900 Subject: [PATCH 201/427] kbuild: add this-makefile as a shorthand for $(lastword $(MAKEFILE_LIST)) Make it clearer, and self-documenting. Signed-off-by: Masahiro Yamada --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 385e0930ca6d..fcf31edc7522 100644 --- a/Makefile +++ b/Makefile @@ -157,12 +157,14 @@ MAKEFLAGS += --include-dir=$(abs_srctree) need-sub-make := 1 endif +this-makefile := $(lastword $(MAKEFILE_LIST)) + ifneq ($(filter 3.%,$(MAKE_VERSION)),) # 'MAKEFLAGS += -rR' does not immediately become effective for GNU Make 3.x # We need to invoke sub-make to avoid implicit rules in the top Makefile. need-sub-make := 1 # Cancel implicit rules for this Makefile. 
-$(lastword $(MAKEFILE_LIST)): ; +$(this-makefile): ; endif export abs_srctree abs_objtree @@ -172,7 +174,7 @@ ifeq ($(need-sub-make),1) PHONY += $(MAKECMDGOALS) sub-make -$(filter-out _all sub-make $(lastword $(MAKEFILE_LIST)), $(MAKECMDGOALS)) _all: sub-make +$(filter-out _all sub-make $(this-makefile), $(MAKECMDGOALS)) _all: sub-make @: # Invoke a second make in the output directory, passing relevant variables From 121c2a137767df5f9f301795e7d3757d1af04b21 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 11 May 2020 12:50:13 +0900 Subject: [PATCH 202/427] kbuild: error out if targets prefixed with '__' are directly run Some targets are internal-use only. It is tedious to care about "what if __build_one_by_one is contained in $(MAKECMDGOALS)?" etc. Prefix internal targets with double underscores. Stop parsing Makefile if they are directly run. Signed-off-by: Masahiro Yamada --- Makefile | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index fcf31edc7522..1915630cc24b 100644 --- a/Makefile +++ b/Makefile @@ -11,9 +11,12 @@ NAME = Kleptomaniac Octopus # Comments in this file are targeted only to the developer, do not # expect to learn how to build the kernel reading this file. +$(if $(filter __%, $(MAKECMDGOALS)), \ + $(error targets prefixed with '__' are only for internal use)) + # That's our default target when none is given on the command line -PHONY := _all -_all: +PHONY := __all +__all: # We are using a recursive build, so we need to do a little thinking # to get the ordering right. 
@@ -172,13 +175,13 @@ export sub_make_done := 1 ifeq ($(need-sub-make),1) -PHONY += $(MAKECMDGOALS) sub-make +PHONY += $(MAKECMDGOALS) __sub-make -$(filter-out _all sub-make $(this-makefile), $(MAKECMDGOALS)) _all: sub-make +$(filter-out $(this-makefile), $(MAKECMDGOALS)) __all: __sub-make @: # Invoke a second make in the output directory, passing relevant variables -sub-make: +__sub-make: $(Q)$(MAKE) -C $(abs_objtree) -f $(abs_srctree)/Makefile $(MAKECMDGOALS) endif # need-sub-make @@ -323,7 +326,7 @@ ifdef mixed-build PHONY += $(MAKECMDGOALS) __build_one_by_one -$(filter-out __build_one_by_one, $(MAKECMDGOALS)): __build_one_by_one +$(MAKECMDGOALS): __build_one_by_one @: __build_one_by_one: @@ -598,12 +601,12 @@ else #!config-build # targets and others. In general all targets except *config targets. # If building an external module we do not care about the all: rule -# but instead _all depend on modules +# but instead __all depend on modules PHONY += all ifeq ($(KBUILD_EXTMOD),) -_all: all +__all: all else -_all: modules +__all: modules endif # Decide whether to build built-in, modular, or both. @@ -625,7 +628,7 @@ endif # in addition to whatever we do anyway. # Just "make" or "make all" shall build modules as well -ifneq ($(filter all _all modules nsdeps,$(MAKECMDGOALS)),) +ifneq ($(filter all modules nsdeps,$(MAKECMDGOALS)),) KBUILD_MODULES := 1 endif From 8451791d1ff0fd229e3f5ef267a32423f5b5540f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 11 May 2020 13:21:49 +0900 Subject: [PATCH 203/427] kbuild: make module name conflict fatal error I think all the warnings have been fixed by now. Make it a fatal error. Check it before modpost because we need to stop building *.ko files. Also, pass modules.order via a script parameter. 
Signed-off-by: Masahiro Yamada --- Makefile | 7 +++++-- scripts/modules-check.sh | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 1915630cc24b..1f5bbfb31103 100644 --- a/Makefile +++ b/Makefile @@ -1335,9 +1335,12 @@ all: modules # using awk while concatenating to the final file. PHONY += modules -modules: $(if $(KBUILD_BUILTIN),vmlinux) modules.order +modules: $(if $(KBUILD_BUILTIN),vmlinux) modules_check $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/modules-check.sh + +PHONY += modules_check +modules_check: modules.order + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/modules-check.sh $< modules.order: descend $(Q)$(AWK) '!x[$$0]++' $(addsuffix /$@, $(build-dirs)) > $@ diff --git a/scripts/modules-check.sh b/scripts/modules-check.sh index f51f446707b8..43de226071ae 100755 --- a/scripts/modules-check.sh +++ b/scripts/modules-check.sh @@ -3,14 +3,24 @@ set -e +if [ $# != 1 ]; then + echo "Usage: $0 <modules.order>" >& 2 + exit 1 +fi + +exit_code=0 + # Check uniqueness of module names check_same_name_modules() { - for m in $(sed 's:.*/::' modules.order | sort | uniq -d) + for m in $(sed 's:.*/::' $1 | sort | uniq -d) do - echo "warning: same module names found:" >&2 + echo "error: the following would cause module name conflict:" >&2 sed -n "/\/$m/s:^: :p" modules.order >&2 + exit_code=1 done } -check_same_name_modules +check_same_name_modules "$1" + +exit $exit_code From 9504bbe91efc163e4c46496ae790da60353b23b4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 21 May 2020 13:31:17 +0900 Subject: [PATCH 204/427] kbuild: doc: remove documentation about copying Module.symvers around This is a left-over of commit 39808e451fdf ("kbuild: do not read $(KBUILD_EXTMOD)/Module.symvers"). Kbuild no longer supports this way.
Signed-off-by: Masahiro Yamada --- Documentation/kbuild/modules.rst | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Documentation/kbuild/modules.rst b/Documentation/kbuild/modules.rst index e0b45a257f21..a45cccff467d 100644 --- a/Documentation/kbuild/modules.rst +++ b/Documentation/kbuild/modules.rst @@ -528,18 +528,6 @@ build. will then do the expected and compile both modules with full knowledge of symbols from either module. - Use an extra Module.symvers file - When an external module is built, a Module.symvers file - is generated containing all exported symbols which are - not defined in the kernel. To get access to symbols - from bar.ko, copy the Module.symvers file from the - compilation of bar.ko to the directory where foo.ko is - built. During the module build, kbuild will read the - Module.symvers file in the directory of the external - module, and when the build is finished, a new - Module.symvers file is created containing the sum of - all symbols defined and not part of the kernel. - Use "make" variable KBUILD_EXTRA_SYMBOLS If it is impractical to add a top-level kbuild file, you can assign a space separated list From e578edc72276280b8fae57f6bf79cb443ceee7a2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 May 2020 10:59:58 +0900 Subject: [PATCH 205/427] kbuild: remove ifdef builtin-target / lib-target I do not see a good reason to add ifdef here. 
Signed-off-by: Masahiro Yamada --- scripts/Makefile.build | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 3665b1a0bc8e..9af88f4cacb8 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -384,16 +384,14 @@ $(obj)/%/built-in.a: $(obj)/% ; # # Rule to compile a set of .o files into one .a file (without symbol table) # -ifdef builtin-target quiet_cmd_ar_builtin = AR $@ cmd_ar_builtin = rm -f $@; $(AR) cDPrST $@ $(real-prereqs) -$(builtin-target): $(real-obj-y) FORCE +$(obj)/built-in.a: $(real-obj-y) FORCE $(call if_changed,ar_builtin) targets += $(builtin-target) -endif # builtin-target # # Rule to create modules.order file @@ -408,15 +406,11 @@ $(modorder-target): $(subdir-ym) FORCE # # Rule to compile a set of .o files into one .a file (with symbol table) # -ifdef lib-target - -$(lib-target): $(lib-y) FORCE +$(obj)/lib.a: $(lib-y) FORCE $(call if_changed,ar) targets += $(lib-target) -endif - # NOTE: # Do not replace $(filter %.o,^) with $(real-prereqs). When a single object # module is turned into a multi object module, $^ will contain header file From b480fec988b051df792633e99bf622fc63a305f6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 May 2020 10:59:59 +0900 Subject: [PATCH 206/427] kbuild: clear KBUILD_MODULES in top Makefile if CONFIG_MODULES=n Do not try to build any module-related artifacts when CONFIG_MODULES is disabled. Signed-off-by: Masahiro Yamada --- Makefile | 4 ++++ scripts/Makefile.build | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1f5bbfb31103..72eb55a36545 100644 --- a/Makefile +++ b/Makefile @@ -1724,6 +1724,10 @@ build-dirs := $(foreach d, $(build-dirs), \ endif +ifndef CONFIG_MODULES +KBUILD_MODULES := +endif + # Handle descending into subdirectories listed in $(build-dirs) # Preset locale variables to speed up the build process. 
Limit locale # tweaks to this spot to avoid wrong language settings when running diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 9af88f4cacb8..f46d25441804 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -77,7 +77,7 @@ ifdef need-builtin builtin-target := $(obj)/built-in.a endif -ifeq ($(CONFIG_MODULES)$(need-modorder),y1) +ifdef need-modorder modorder-target := $(obj)/modules.order endif From 6ba3bcb01393777d38c8b466249e4a3e6ffc8adb Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 May 2020 11:00:00 +0900 Subject: [PATCH 207/427] kbuild: move subdir-obj-y to scripts/Makefile.build Save $(addprefix ...) for subdir-obj-y. Signed-off-by: Masahiro Yamada --- scripts/Makefile.build | 2 ++ scripts/Makefile.lib | 5 ----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index f46d25441804..ee283efc1b45 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -69,6 +69,8 @@ endif # =========================================================================== +subdir-obj-y := $(filter %/built-in.a, $(real-obj-y)) + ifneq ($(strip $(lib-y) $(lib-m) $(lib-)),) lib-target := $(obj)/lib.a endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 52299d5dba28..a41a4bbd20e2 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -62,10 +62,6 @@ multi-used-y := $(sort $(foreach m,$(obj-y), $(if $(strip $($(m:.o=-objs)) $($(m multi-used-m := $(sort $(foreach m,$(obj-m), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m)) $($(m:.o=-))), $(m)))) multi-used := $(multi-used-y) $(multi-used-m) -# $(subdir-obj-y) is the list of objects in $(obj-y) which uses dir/ to -# tell kbuild to descend -subdir-obj-y := $(filter %/built-in.a, $(obj-y)) - # Replace multi-part objects by their individual parts, # including built-in.a from subdirectories real-obj-y := $(foreach m, $(obj-y), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) 
$($(m:.o=-))),$($(m:.o=-objs)) $($(m:.o=-y)),$(m))) @@ -91,7 +87,6 @@ targets := $(addprefix $(obj)/,$(targets)) modorder := $(addprefix $(obj)/,$(modorder)) obj-m := $(addprefix $(obj)/,$(obj-m)) lib-y := $(addprefix $(obj)/,$(lib-y)) -subdir-obj-y := $(addprefix $(obj)/,$(subdir-obj-y)) real-obj-y := $(addprefix $(obj)/,$(real-obj-y)) real-obj-m := $(addprefix $(obj)/,$(real-obj-m)) multi-used-m := $(addprefix $(obj)/,$(multi-used-m)) From aaa385ba9afe7aca25a1545a609963ee59b6c76b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 May 2020 11:00:01 +0900 Subject: [PATCH 208/427] kbuild: rename subdir-obj-y to subdir-builtin I think subdir-builtin is clearer. While I was here, I made its build rule explicit. Signed-off-by: Masahiro Yamada --- scripts/Makefile.build | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index ee283efc1b45..323264607b9f 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -69,7 +69,8 @@ endif # =========================================================================== -subdir-obj-y := $(filter %/built-in.a, $(real-obj-y)) +# subdir-builtin may contain duplications. Use $(sort ...) 
+subdir-builtin := $(sort $(filter %/built-in.a, $(real-obj-y))) ifneq ($(strip $(lib-y) $(lib-m) $(lib-)),) lib-target := $(obj)/lib.a @@ -356,7 +357,7 @@ endif $(obj)/%.o: $(src)/%.S $(objtool_dep) FORCE $(call if_changed_rule,as_o_S) -targets += $(filter-out $(subdir-obj-y), $(real-obj-y)) $(real-obj-m) $(lib-y) +targets += $(filter-out $(subdir-builtin), $(real-obj-y)) $(real-obj-m) $(lib-y) targets += $(extra-y) $(always-y) $(MAKECMDGOALS) # Linker scripts preprocessor (.lds.S -> .lds) @@ -381,7 +382,7 @@ $(obj)/%.asn1.c $(obj)/%.asn1.h: $(src)/%.asn1 $(objtree)/scripts/asn1_compiler # --------------------------------------------------------------------------- # To build objects in subdirs, we need to descend into the directories -$(obj)/%/built-in.a: $(obj)/% ; +$(subdir-builtin): $(obj)/%/built-in.a: $(obj)/% ; # # Rule to compile a set of .o files into one .a file (without symbol table) @@ -489,7 +490,7 @@ PHONY += $(subdir-ym) $(subdir-ym): $(Q)$(MAKE) $(build)=$@ \ $(if $(filter $@/, $(KBUILD_SINGLE_TARGETS)),single-build=) \ - need-builtin=$(if $(filter $@/built-in.a, $(subdir-obj-y)),1) \ + need-builtin=$(if $(filter $@/built-in.a, $(subdir-builtin)),1) \ need-modorder=$(if $(need-modorder),$(if $(filter $@/modules.order, $(modorder)),1)) # Add FORCE to the prequisites of a target to force it to be always rebuilt. From 454753d9f67ae40b6a2142ddb6b4dbdcc9654aa9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 May 2020 11:00:02 +0900 Subject: [PATCH 209/427] kbuild: make modules.order rule consistent with built-in.a built-in.a contains the built-in object paths from the current and sub directories. modules.order collects the module paths from the current and sub directories. Make their build rules look more symmetrical.
Signed-off-by: Masahiro Yamada --- scripts/Makefile.build | 10 ++++++---- scripts/Makefile.lib | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 323264607b9f..ee9a817e19a3 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -69,8 +69,9 @@ endif # =========================================================================== -# subdir-builtin may contain duplications. Use $(sort ...) +# subdir-builtin and subdir-modorder may contain duplications. Use $(sort ...) subdir-builtin := $(sort $(filter %/built-in.a, $(real-obj-y))) +subdir-modorder := $(sort $(filter %/modules.order, $(modorder))) ifneq ($(strip $(lib-y) $(lib-m) $(lib-)),) lib-target := $(obj)/lib.a @@ -383,6 +384,7 @@ $(obj)/%.asn1.c $(obj)/%.asn1.h: $(src)/%.asn1 $(objtree)/scripts/asn1_compiler # To build objects in subdirs, we need to descend into the directories $(subdir-builtin): $(obj)/%/built-in.a: $(obj)/% ; +$(subdir-modorder): $(obj)/%/modules.order: $(obj)/% ; # # Rule to compile a set of .o files into one .a file (without symbol table) @@ -401,9 +403,9 @@ targets += $(builtin-target) # # Create commands to either record .ko file or cat modules.order from # a subdirectory -$(modorder-target): $(subdir-ym) FORCE +$(obj)/modules.order: $(subdir-modorder) FORCE $(Q){ $(foreach m, $(modorder), \ - $(if $(filter %/modules.order, $m), cat $m, echo $m);) :; } \ + $(if $(filter $^, $m), cat $m, echo $m);) :; } \ | $(AWK) '!x[$$0]++' - > $@ # @@ -491,7 +493,7 @@ $(subdir-ym): $(Q)$(MAKE) $(build)=$@ \ $(if $(filter $@/, $(KBUILD_SINGLE_TARGETS)),single-build=) \ need-builtin=$(if $(filter $@/built-in.a, $(subdir-builtin)),1) \ - need-modorder=$(if $(need-modorder),$(if $(filter $@/modules.order, $(modorder)),1)) + need-modorder=$(if $(filter $@/modules.order, $(subdir-modorder)),1) # Add FORCE to the prequisites of a target to force it to be always rebuilt. 
# --------------------------------------------------------------------------- diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index a41a4bbd20e2..0d931cc0df94 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -35,7 +35,9 @@ lib-y := $(filter-out $(obj-y), $(sort $(lib-y) $(lib-m))) # Determine modorder. # Unfortunately, we don't have information about ordering between -y # and -m subdirs. Just put -y's first. +ifdef need-modorder modorder := $(patsubst %/,%/modules.order, $(filter %/, $(obj-y)) $(obj-m:.o=.ko)) +endif # Handle objects in subdirs # --------------------------------------------------------------------------- From 6b6ebb34744b21467aa01be7c53cc570fc41f70d Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Wed, 13 May 2020 10:13:11 +0800 Subject: [PATCH 210/427] cgroup: Remove stale comments - The default root is where we can create v2 cgroups. - The __DEVEL__sane_behavior mount option has been removed long long ago. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 06b5ea9d899d..7a016749de21 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -153,11 +153,7 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); -/* - * The default hierarchy, reserved for the subsystems that are otherwise - * unattached - it never has more than a single cgroup, and all tasks are - * part of that cgroup. - */ +/* the default hierarchy */ struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); @@ -251,9 +247,6 @@ bool cgroup_ssid_enabled(int ssid) * cases where a subsystem should behave differnetly depending on the * interface version. 
* - * The set of behaviors which change on the default hierarchy are still - * being determined and the mount option is prefixed with __DEVEL__. - * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" From 342ed2400b78072cc01c0130ce41240dec60d56d Mon Sep 17 00:00:00 2001 From: Zhang Qiang Date: Wed, 27 May 2020 15:57:15 +0800 Subject: [PATCH 211/427] workqueue: Remove unnecessary kfree() call in rcu_free_wq() The data structure member "wq->rescuer" was reset to a null pointer in one if branch. It was passed to a call of the function "kfree" in the callback function "rcu_free_wq" (which was eventually executed). The function "kfree" does not perform more meaningful data processing for a passed null pointer (besides immediately returning from such a call). Thus delete this function call which became unnecessary with the referenced software update. Fixes: def98c84b6cd ("workqueue: Fix spurious sanity check failures in destroy_workqueue()") Suggested-by: Markus Elfring Signed-off-by: Zhang Qiang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 10ed8d761e0b..7a1fc9fe6314 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3491,7 +3491,6 @@ static void rcu_free_wq(struct rcu_head *rcu) else free_workqueue_attrs(wq->unbound_attrs); - kfree(wq->rescuer); kfree(wq); } From 60369a4f8d61bacd437adab0518581afb90bea24 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 28 Apr 2020 15:36:40 +0000 Subject: [PATCH 212/427] x86/PCI: Drop unused xen_register_pirq() gsi_override parameter All callers of xen_register_pirq() pass -1 (no override) for the gsi_override parameter. Remove it and related code. 
Link: https://lore.kernel.org/r/20200428153640.76476-1-wei.liu@kernel.org Signed-off-by: Wei Liu Signed-off-by: Bjorn Helgaas Reviewed-by: Boris Ostrovsky --- arch/x86/pci/xen.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 91220cc25854..e3f1ca316068 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -60,8 +60,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) } #ifdef CONFIG_ACPI -static int xen_register_pirq(u32 gsi, int gsi_override, int triggering, - bool set_pirq) +static int xen_register_pirq(u32 gsi, int triggering, bool set_pirq) { int rc, pirq = -1, irq = -1; struct physdev_map_pirq map_irq; @@ -94,9 +93,6 @@ static int xen_register_pirq(u32 gsi, int gsi_override, int triggering, name = "ioapic-level"; } - if (gsi_override >= 0) - gsi = gsi_override; - irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name); if (irq < 0) goto out; @@ -112,12 +108,12 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, if (!xen_hvm_domain()) return -1; - return xen_register_pirq(gsi, -1 /* no GSI override */, trigger, + return xen_register_pirq(gsi, trigger, false /* no mapping of GSI to PIRQ */); } #ifdef CONFIG_XEN_DOM0 -static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity) +static int xen_register_gsi(u32 gsi, int triggering, int polarity) { int rc, irq; struct physdev_setup_gsi setup_gsi; @@ -128,7 +124,7 @@ static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polar printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", gsi, triggering, polarity); - irq = xen_register_pirq(gsi, gsi_override, triggering, true); + irq = xen_register_pirq(gsi, triggering, true); setup_gsi.gsi = gsi; setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 
0 : 1); @@ -148,7 +144,7 @@ static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polar static int acpi_register_gsi_xen(struct device *dev, u32 gsi, int trigger, int polarity) { - return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity); + return xen_register_gsi(gsi, trigger, polarity); } #endif #endif @@ -491,7 +487,7 @@ int __init pci_xen_initial_domain(void) if (acpi_get_override_irq(irq, &trigger, &polarity) == -1) continue; - xen_register_pirq(irq, -1 /* no GSI override */, + xen_register_pirq(irq, trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE, true /* Map GSI to PIRQ */); } From 0d14f06cd6657ba3446a5eb780672da487b068e7 Mon Sep 17 00:00:00 2001 From: Marcos Scriven Date: Wed, 20 May 2020 18:23:30 -0500 Subject: [PATCH 213/427] PCI: Avoid FLR for AMD Matisse HD Audio & USB 3.0 The AMD Matisse HD Audio & USB 3.0 devices advertise Function Level Reset support, but hang when an FLR is triggered. To reproduce the problem, attach the device to a VM, then detach and try to attach again. Rename the existing quirk_intel_no_flr(), which was not Intel-specific, to quirk_no_flr(), and apply it to prevent the use of FLR on these AMD devices. 
Link: https://lore.kernel.org/r/CAAri2DpkcuQZYbT6XsALhx2e6vRqPHwtbjHYeiH7MNp4zmt1RA@mail.gmail.com Signed-off-by: Marcos Scriven Signed-off-by: Bjorn Helgaas --- drivers/pci/quirks.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 28c9a2409c50..ff310f0cac22 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5129,13 +5129,23 @@ static void quirk_intel_qat_vf_cap(struct pci_dev *pdev) } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); -/* FLR may cause some 82579 devices to hang */ -static void quirk_intel_no_flr(struct pci_dev *dev) +/* + * FLR may cause the following to devices to hang: + * + * AMD Starship/Matisse HD Audio Controller 0x1487 + * AMD Matisse USB 3.0 Host Controller 0x149c + * Intel 82579LM Gigabit Ethernet Controller 0x1502 + * Intel 82579V Gigabit Ethernet Controller 0x1503 + * + */ +static void quirk_no_flr(struct pci_dev *dev) { dev->dev_flags |= PCI_DEV_FLAGS_NO_FLR_RESET; } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1502, quirk_intel_no_flr); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1503, quirk_intel_no_flr); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1487, quirk_no_flr); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x149c, quirk_no_flr); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1502, quirk_no_flr); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1503, quirk_no_flr); static void quirk_no_ext_tags(struct pci_dev *pdev) { From 5727043c73fdfe04597971b5f3f4850d879c1f4f Mon Sep 17 00:00:00 2001 From: Kevin Buettner Date: Sun, 24 May 2020 00:35:29 -0700 Subject: [PATCH 214/427] PCI: Avoid FLR for AMD Starship USB 3.0 The AMD Starship USB 3.0 host controller advertises Function Level Reset support, but it apparently doesn't work. Add a quirk to prevent use of FLR on this device. 
Without this quirk, when attempting to assign (pass through) an AMD Starship USB 3.0 host controller to a guest OS, the system becomes increasingly unresponsive over the course of several minutes, eventually requiring a hard reset. Shortly after attempting to start the guest, I see these messages: vfio-pci 0000:05:00.3: not ready 1023ms after FLR; waiting vfio-pci 0000:05:00.3: not ready 2047ms after FLR; waiting vfio-pci 0000:05:00.3: not ready 4095ms after FLR; waiting vfio-pci 0000:05:00.3: not ready 8191ms after FLR; waiting And then eventually: vfio-pci 0000:05:00.3: not ready 65535ms after FLR; giving up INFO: NMI handler (perf_event_nmi_handler) took too long to run: 0.000 msecs perf: interrupt took too long (642744 > 2500), lowering kernel.perf_event_max_sample_rate to 1000 INFO: NMI handler (perf_event_nmi_handler) took too long to run: 82.270 msecs INFO: NMI handler (perf_event_nmi_handler) took too long to run: 680.608 msecs INFO: NMI handler (perf_event_nmi_handler) took too long to run: 100.952 msecs ... watchdog: BUG: soft lockup - CPU#3 stuck for 22s! [qemu-system-x86:7487] Tested on a Micro-Star International Co., Ltd. MS-7C59/Creator TRX40 motherboard with an AMD Ryzen Threadripper 3970X. 
Link: https://lore.kernel.org/r/20200524003529.598434ff@f31-4.lan Signed-off-by: Kevin Buettner Signed-off-by: Bjorn Helgaas --- drivers/pci/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index ff310f0cac22..15341eacc50d 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5133,6 +5133,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); * FLR may cause the following to devices to hang: * * AMD Starship/Matisse HD Audio Controller 0x1487 + * AMD Starship USB 3.0 Host Controller 0x148c * AMD Matisse USB 3.0 Host Controller 0x149c * Intel 82579LM Gigabit Ethernet Controller 0x1502 * Intel 82579V Gigabit Ethernet Controller 0x1503 @@ -5143,6 +5144,7 @@ static void quirk_no_flr(struct pci_dev *dev) dev->dev_flags |= PCI_DEV_FLAGS_NO_FLR_RESET; } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1487, quirk_no_flr); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x148c, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x149c, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1502, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1503, quirk_no_flr); From 3910ebaca8eae0cb9d41a20efe1bcb375ec64dfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Tue, 26 May 2020 21:39:05 +0000 Subject: [PATCH 215/427] PCI: Rename _DSM constants to align with spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename PCI-related _DSM constants to align them with the PCI Firmware Spec, r3.2, sec 4.6. No functional change intended. 
Link: https://lore.kernel.org/r/20200526213905.2479381-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/acpi/pci_root.c | 2 +- drivers/pci/pci-acpi.c | 4 ++-- drivers/pci/pci-label.c | 4 ++-- include/linux/pci-acpi.h | 10 ++++++---- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index ac8ad6cb82aa..191204a4abe9 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -938,7 +938,7 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, * assignments made by firmware for this host bridge. */ obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 1, - IGNORE_PCI_BOOT_CONFIG_DSM, NULL); + DSM_PCI_PRESERVE_BOOT_CONFIG, NULL); if (obj && obj->type == ACPI_TYPE_INTEGER && obj->integer.value == 0) host_bridge->preserve_config = 1; ACPI_FREE(obj); diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index d820a55ae71c..7224b1e5f2a8 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1128,7 +1128,7 @@ void acpi_pci_add_bus(struct pci_bus *bus) return; obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 3, - RESET_DELAY_DSM, NULL); + DSM_PCI_POWER_ON_RESET_DELAY, NULL); if (!obj) return; @@ -1193,7 +1193,7 @@ static void pci_acpi_optimize_delay(struct pci_dev *pdev, pdev->d3cold_delay = 0; obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 3, - FUNCTION_DELAY_DSM, NULL); + DSM_PCI_DEVICE_READINESS_DURATIONS, NULL); if (!obj) return; diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index a5910f942857..707dd9808676 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -178,7 +178,7 @@ static int dsm_get_label(struct device *dev, char *buf, return -1; obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 0x2, - DEVICE_LABEL_DSM, NULL); + DSM_PCI_DEVICE_NAME, NULL); if (!obj) return -1; @@ -218,7 +218,7 @@ static bool device_has_dsm(struct device *dev) return 
false; return !!acpi_check_dsm(handle, &pci_acpi_dsm_guid, 0x2, - 1 << DEVICE_LABEL_DSM); + 1 << DSM_PCI_DEVICE_NAME); } static umode_t acpi_index_string_exist(struct kobject *kobj, diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 2d155bfb8fbf..a3bb8e768778 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -107,10 +107,12 @@ static inline void acpiphp_check_host_bridge(struct acpi_device *adev) { } #endif extern const guid_t pci_acpi_dsm_guid; -#define IGNORE_PCI_BOOT_CONFIG_DSM 0x05 -#define DEVICE_LABEL_DSM 0x07 -#define RESET_DELAY_DSM 0x08 -#define FUNCTION_DELAY_DSM 0x09 + +/* _DSM Definitions for PCI */ +#define DSM_PCI_PRESERVE_BOOT_CONFIG 0x05 +#define DSM_PCI_DEVICE_NAME 0x07 +#define DSM_PCI_POWER_ON_RESET_DELAY 0x08 +#define DSM_PCI_DEVICE_READINESS_DURATIONS 0x09 #ifdef CONFIG_PCIE_EDR void pci_acpi_add_edr_notifier(struct pci_dev *pdev); From 936f2a70f2077f64fab1dcb3eca71879e82ecd3f Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 27 May 2020 14:43:19 -0700 Subject: [PATCH 216/427] cgroup: add cpu.stat file to root cgroup Currently, the root cgroup does not have a cpu.stat file. Add one which is consistent with /proc/stat to capture global cpu statistics that might not fall under cgroup accounting. We haven't done this in the past because the data are already presented in /proc/stat and we didn't want to add overhead from collecting root cgroup stats when cgroups are configured, but no cgroups have been created. By keeping the data consistent with /proc/stat, I think we avoid the first problem, while improving the usability of cgroups stats. We avoid the second problem by computing the contents of cpu.stat from existing data collected for /proc/stat anyway. 
Signed-off-by: Boris Burkov Suggested-by: Tejun Heo Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 6 +-- kernel/cgroup/cgroup.c | 1 - kernel/cgroup/rstat.c | 58 ++++++++++++++++++++++--- 3 files changed, 53 insertions(+), 12 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index bcc80269bb6a..341a6c2340d5 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -714,9 +714,7 @@ Conventions - Settings for a single feature should be contained in a single file. - The root cgroup should be exempt from resource control and thus - shouldn't have resource control interface files. Also, - informational files on the root cgroup which end up showing global - information available elsewhere shouldn't exist. + shouldn't have resource control interface files. - The default time unit is microseconds. If a different unit is ever used, an explicit unit suffix must be present. @@ -985,7 +983,7 @@ CPU Interface Files All time durations are in microseconds. cpu.stat - A read-only flat-keyed file which exists on non-root cgroups. + A read-only flat-keyed file. This file exists whether the controller is enabled or not. 
It always reports the following three stats: diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7a016749de21..51924ebdff51 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4874,7 +4874,6 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, #ifdef CONFIG_PSI diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 41ca996568df..b6397a186ce9 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -389,18 +389,62 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, cgroup_base_stat_cputime_account_end(cgrp, rstatc); } +/* + * compute the cputime for the root cgroup by getting the per cpu data + * at a global level, then categorizing the fields in a manner consistent + * with how it is done by __cgroup_account_cputime_field for each bit of + * cpu time attributed to a cgroup. + */ +static void root_cgroup_cputime(struct task_cputime *cputime) +{ + int i; + + cputime->stime = 0; + cputime->utime = 0; + cputime->sum_exec_runtime = 0; + for_each_possible_cpu(i) { + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + u64 user = 0; + u64 sys = 0; + + kcpustat_cpu_fetch(&kcpustat, i); + + user += cpustat[CPUTIME_USER]; + user += cpustat[CPUTIME_NICE]; + cputime->utime += user; + + sys += cpustat[CPUTIME_SYSTEM]; + sys += cpustat[CPUTIME_IRQ]; + sys += cpustat[CPUTIME_SOFTIRQ]; + cputime->stime += sys; + + cputime->sum_exec_runtime += user; + cputime->sum_exec_runtime += sys; + cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; + cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST]; + cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE]; + } +} + void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; + struct task_cputime cputime; - if (!cgroup_parent(cgrp)) - return; - - cgroup_rstat_flush_hold(cgrp); - usage = 
cgrp->bstat.cputime.sum_exec_runtime; - cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); - cgroup_rstat_flush_release(); + if (cgroup_parent(cgrp)) { + cgroup_rstat_flush_hold(cgrp); + usage = cgrp->bstat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, + &utime, &stime); + cgroup_rstat_flush_release(); + } else { + root_cgroup_cputime(&cputime); + usage = cputime.sum_exec_runtime; + utime = cputime.utime; + stime = cputime.stime; + } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); From d0684fd0bd79395e074dd668feee5d53b134b1a3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 25 May 2020 11:43:19 -0500 Subject: [PATCH 217/427] PCI: hv: Use struct_size() helper One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct hv_dr_state { ... struct hv_pcidev_description func[]; }; struct pci_bus_relations { ... struct pci_function_description func[]; } __packed; Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. So, replace the following forms: offsetof(struct hv_dr_state, func) + (sizeof(struct hv_pcidev_description) * (relations->device_count)) offsetof(struct pci_bus_relations, func) + (sizeof(struct pci_function_description) * (bus_rel->device_count)) with: struct_size(dr, func, relations->device_count) and struct_size(bus_rel, func, bus_rel->device_count) respectively. Link: https://lore.kernel.org/r/20200525164319.GA13596@embeddedor Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Lorenzo Pieralisi Reviewed-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 92092a47d3af..c95e520e62e4 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2201,10 +2201,8 @@ static void hv_pci_devices_present(struct hv_pcibus_device *hbus, struct hv_dr_state *dr; int i; - dr = kzalloc(offsetof(struct hv_dr_state, func) + - (sizeof(struct hv_pcidev_description) * - (relations->device_count)), GFP_NOWAIT); - + dr = kzalloc(struct_size(dr, func, relations->device_count), + GFP_NOWAIT); if (!dr) return; @@ -2238,10 +2236,8 @@ static void hv_pci_devices_present2(struct hv_pcibus_device *hbus, struct hv_dr_state *dr; int i; - dr = kzalloc(offsetof(struct hv_dr_state, func) + - (sizeof(struct hv_pcidev_description) * - (relations->device_count)), GFP_NOWAIT); - + dr = kzalloc(struct_size(dr, func, relations->device_count), + GFP_NOWAIT); if (!dr) return; @@ -2435,9 +2431,8 @@ static void hv_pci_onchannelcallback(void *context) bus_rel = (struct pci_bus_relations *)buffer; if (bytes_recvd < - offsetof(struct pci_bus_relations, func) + - (sizeof(struct pci_function_description) * - (bus_rel->device_count))) { + struct_size(bus_rel, func, + bus_rel->device_count)) { dev_err(&hbus->hdev->device, "bus relations too small\n"); break; @@ -2450,9 +2445,8 @@ static void hv_pci_onchannelcallback(void *context) bus_rel2 = (struct pci_bus_relations2 *)buffer; if (bytes_recvd < - offsetof(struct pci_bus_relations2, func) + - (sizeof(struct pci_function_description2) * - (bus_rel2->device_count))) { + struct_size(bus_rel2, func, + bus_rel2->device_count)) { dev_err(&hbus->hdev->device, "bus relations v2 too small\n"); break; From eeab133e1f144f3e7326019ca3ec11bdce44c210 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 14 May 2020 21:03:20 
+0900 Subject: [PATCH 218/427] dt-bindings: PCI: Add UniPhier PCIe endpoint controller description Add DT bindings for PCIe controller implemented in UniPhier SoCs when configured in endpoint mode. This controller is based on the DesignWare PCIe core. Link: https://lore.kernel.org/r/1589457801-12796-2-git-send-email-hayashi.kunihiko@socionext.com Signed-off-by: Kunihiko Hayashi Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../pci/socionext,uniphier-pcie-ep.yaml | 92 +++++++++++++++++++ MAINTAINERS | 2 +- 2 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/pci/socionext,uniphier-pcie-ep.yaml diff --git a/Documentation/devicetree/bindings/pci/socionext,uniphier-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/socionext,uniphier-pcie-ep.yaml new file mode 100644 index 000000000000..f0558b9cf9e9 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/socionext,uniphier-pcie-ep.yaml @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/socionext,uniphier-pcie-ep.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Socionext UniPhier PCIe endpoint controller + +description: | + UniPhier PCIe endpoint controller is based on the Synopsys DesignWare + PCI core. It shares common features with the PCIe DesignWare core and + inherits common properties defined in + Documentation/devicetree/bindings/pci/designware-pcie.txt. 
+ +maintainers: + - Kunihiko Hayashi + +allOf: + - $ref: "pci-ep.yaml#" + +properties: + compatible: + const: socionext,uniphier-pro5-pcie-ep + + reg: + maxItems: 4 + + reg-names: + items: + - const: dbi + - const: dbi2 + - const: link + - const: addr_space + + clocks: + maxItems: 2 + + clock-names: + items: + - const: gio + - const: link + + resets: + maxItems: 2 + + reset-names: + items: + - const: gio + - const: link + + num-ib-windows: + const: 16 + + num-ob-windows: + const: 16 + + num-lanes: true + + phys: + maxItems: 1 + + phy-names: + const: pcie-phy + +required: + - compatible + - reg + - reg-names + - clocks + - clock-names + - resets + - reset-names + +additionalProperties: false + +examples: + - | + pcie_ep: pcie-ep@66000000 { + compatible = "socionext,uniphier-pro5-pcie-ep"; + reg-names = "dbi", "dbi2", "link", "addr_space"; + reg = <0x66000000 0x1000>, <0x66001000 0x1000>, + <0x66010000 0x10000>, <0x67000000 0x400000>; + clock-names = "gio", "link"; + clocks = <&sys_clk 12>, <&sys_clk 24>; + reset-names = "gio", "link"; + resets = <&sys_rst 12>, <&sys_rst 24>; + num-ib-windows = <16>; + num-ob-windows = <16>; + num-lanes = <4>; + phy-names = "pcie-phy"; + phys = <&pcie_phy>; + }; diff --git a/MAINTAINERS b/MAINTAINERS index e64e5db31497..dc7b42d1d2ce 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13142,7 +13142,7 @@ PCIE DRIVER FOR SOCIONEXT UNIPHIER M: Kunihiko Hayashi L: linux-pci@vger.kernel.org S: Maintained -F: Documentation/devicetree/bindings/pci/uniphier-pcie.txt +F: Documentation/devicetree/bindings/pci/uniphier-pcie* F: drivers/pci/controller/dwc/pcie-uniphier.c PCIE DRIVER FOR ST SPEAR13XX From e9e81b634303b215e83beced03f04f02f7893442 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 25 May 2020 00:42:15 +0900 Subject: [PATCH 219/427] kbuild: disallow multi-word in M= or KBUILD_EXTMOD $(firstword ...) 
in scripts/Makefile.modpost was added by commit 3f3fd3c05585 ("[PATCH] kbuild: allow multi-word $M in Makefile.modpost") to build multiple external module directories. It was a solution to resolve symbol dependencies when an external module depends on another external module. Commit 0d96fb20b7ed ("kbuild: Add new Kbuild variable KBUILD_EXTRA_SYMBOLS") introduced another solution by passing symbol info via KBUILD_EXTRA_SYMBOLS, then broke the multi-word M= support. include $(if $(wildcard $(KBUILD_EXTMOD)/Kbuild), \ $(KBUILD_EXTMOD)/Kbuild, $(KBUILD_EXTMOD)/Makefile) ... does not work if KBUILD_EXTMOD contains multiple words. This feature has been broken for more than a decade. Remove the bitrotten code, and stop parsing if M or KBUILD_EXTMOD contains multiple words. As Documentation/kbuild/modules.rst explains, if your module depends on another one, there are two solutions: - add a common top-level Kbuild file - use KBUILD_EXTRA_SYMBOLS Signed-off-by: Masahiro Yamada --- Makefile | 3 +++ scripts/Makefile.modpost | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 72eb55a36545..48a2dfaf3bf3 100644 --- a/Makefile +++ b/Makefile @@ -218,6 +218,9 @@ ifeq ("$(origin M)", "command line") KBUILD_EXTMOD := $(M) endif +$(if $(word 2, $(KBUILD_EXTMOD)), \ + $(error building multiple external modules is not supported)) + export KBUILD_CHECKSRC KBUILD_EXTMOD extmod-prefix = $(if $(KBUILD_EXTMOD),$(KBUILD_EXTMOD)/) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 957eed6a17a5..b79bf0e30d32 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -44,7 +44,7 @@ include include/config/auto.conf include scripts/Kbuild.include kernelsymfile := $(objtree)/Module.symvers -modulesymfile := $(firstword $(KBUILD_EXTMOD))/Module.symvers +modulesymfile := $(KBUILD_EXTMOD)/Module.symvers MODPOST = scripts/mod/modpost \ $(if $(CONFIG_MODVERSIONS),-m) \ From d2e4d05cf1a1f8bfe168ea29b217355be7a4e9ec Mon Sep 17 
00:00:00 2001 From: Masahiro Yamada Date: Mon, 25 May 2020 14:47:04 +0900 Subject: [PATCH 220/427] modpost: fix potential segmentation fault for addend_i386_rel() This may not be a practical problem, but the second pass of ARCH=i386 modpost causes segmentation fault if the -s option is not passed. MODPOST 12 modules Segmentation fault (core dumped) make[2]: *** [scripts/Makefile.modpost:94: __modpost] Error 139 make[1]: *** [Makefile:1339: modules] Error 2 make[1]: *** Waiting for unfinished jobs.... The segmentation fault occurs when section_rel() is called for vmlinux, which is untested in regular builds. The cause of the problem is reloc_location() returning a wrong pointer for ET_EXEC object type. In this case, you need to subtract sechdr->sh_addr, otherwise it would get access beyond the mmap'ed memory. Add sym_get_data_by_offset() helper to avoid code duplication. Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 4d4b979d76be..8c5f1bd75481 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -300,19 +300,23 @@ static const char *sec_name(struct elf_info *elf, int secindex) return sech_name(elf, &elf->sechdrs[secindex]); } -static void *sym_get_data(const struct elf_info *info, const Elf_Sym *sym) +static void *sym_get_data_by_offset(const struct elf_info *info, + unsigned int secindex, unsigned long offset) { - unsigned int secindex = get_secindex(info, sym); Elf_Shdr *sechdr = &info->sechdrs[secindex]; - unsigned long offset; - offset = sym->st_value; if (info->hdr->e_type != ET_REL) offset -= sechdr->sh_addr; return (void *)info->hdr + sechdr->sh_offset + offset; } +static void *sym_get_data(const struct elf_info *info, const Elf_Sym *sym) +{ + return sym_get_data_by_offset(info, get_secindex(info, sym), + sym->st_value); +} + #define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) 
static enum export export_from_secname(struct elf_info *elf, unsigned int sec) @@ -1752,11 +1756,7 @@ static void check_section_mismatch(const char *modname, struct elf_info *elf, static unsigned int *reloc_location(struct elf_info *elf, Elf_Shdr *sechdr, Elf_Rela *r) { - Elf_Shdr *sechdrs = elf->sechdrs; - int section = sechdr->sh_info; - - return (void *)elf->hdr + sechdrs[section].sh_offset + - r->r_offset; + return sym_get_data_by_offset(elf, sechdr->sh_info, r->r_offset); } static int addend_386_rel(struct elf_info *elf, Elf_Shdr *sechdr, Elf_Rela *r) From 565587d8d5b518234652063820561587fc269c11 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 25 May 2020 14:47:05 +0900 Subject: [PATCH 221/427] modpost: refactor sech_name() Use sym_get_data_by_offset() helper to get access to the .shstrtab section data. No functional change is intended because elf->sechdrs[elf->secindex_strings].sh_addr is 0 for both ET_REL and ET_EXEC object types. Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 8c5f1bd75481..160139508821 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -288,18 +288,6 @@ static enum export export_no(const char *s) return export_unknown; } -static const char *sech_name(struct elf_info *elf, Elf_Shdr *sechdr) -{ - return (void *)elf->hdr + - elf->sechdrs[elf->secindex_strings].sh_offset + - sechdr->sh_name; -} - -static const char *sec_name(struct elf_info *elf, int secindex) -{ - return sech_name(elf, &elf->sechdrs[secindex]); -} - static void *sym_get_data_by_offset(const struct elf_info *info, unsigned int secindex, unsigned long offset) { @@ -317,6 +305,17 @@ static void *sym_get_data(const struct elf_info *info, const Elf_Sym *sym) sym->st_value); } +static const char *sech_name(const struct elf_info *info, Elf_Shdr *sechdr) +{ + return sym_get_data_by_offset(info, 
info->secindex_strings, + sechdr->sh_name); +} + +static const char *sec_name(const struct elf_info *info, int secindex) +{ + return sech_name(info, &info->sechdrs[secindex]); +} + #define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) static enum export export_from_secname(struct elf_info *elf, unsigned int sec) From 10e68b02c861ccf2b3adb59d3f0c10dc6b5e3ace Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 26 May 2020 10:18:29 -0700 Subject: [PATCH 222/427] Makefile: support compressed debug info As debug information gets larger and larger, it helps significantly save the size of vmlinux images to compress the information in the debug information sections. Note: this debug info is typically split off from the final compressed kernel image, which is why vmlinux is what's used in conjunction with GDB. Minimizing the debug info size should have no impact on boot times, or final compressed kernel image size. All of the debug sections will have a `C` flag set. $ readelf -S $ bloaty vmlinux.gcc75.compressed.dwarf4 -- \ vmlinux.gcc75.uncompressed.dwarf4 FILE SIZE VM SIZE -------------- -------------- +0.0% +18 [ = ] 0 [Unmapped] -73.3% -114Ki [ = ] 0 .debug_aranges -76.2% -2.01Mi [ = ] 0 .debug_frame -73.6% -2.89Mi [ = ] 0 .debug_str -80.7% -4.66Mi [ = ] 0 .debug_abbrev -82.9% -4.88Mi [ = ] 0 .debug_ranges -70.5% -9.04Mi [ = ] 0 .debug_line -79.3% -10.9Mi [ = ] 0 .debug_loc -39.5% -88.6Mi [ = ] 0 .debug_info -18.2% -123Mi [ = ] 0 TOTAL $ bloaty vmlinux.clang11.compressed.dwarf4 -- \ vmlinux.clang11.uncompressed.dwarf4 FILE SIZE VM SIZE -------------- -------------- +0.0% +23 [ = ] 0 [Unmapped] -65.6% -871 [ = ] 0 .debug_aranges -77.4% -1.84Mi [ = ] 0 .debug_frame -82.9% -2.33Mi [ = ] 0 .debug_abbrev -73.1% -2.43Mi [ = ] 0 .debug_str -84.8% -3.07Mi [ = ] 0 .debug_ranges -65.9% -8.62Mi [ = ] 0 .debug_line -86.2% -40.0Mi [ = ] 0 .debug_loc -42.0% -64.1Mi [ = ] 0 .debug_info -22.1% -122Mi [ = ] 0 TOTAL For x86_64 defconfig + LLVM=1 (before): 
Elapsed (wall clock) time (h:mm:ss or m:ss): 3:22.03 Maximum resident set size (kbytes): 43856 For x86_64 defconfig + LLVM=1 (after): Elapsed (wall clock) time (h:mm:ss or m:ss): 3:32.52 Maximum resident set size (kbytes): 1566776 Thanks to: Nick Clifton helped us to provide the minimal binutils version. Sedat Dilek found an increase in size of debug .deb package. Cc: Nick Clifton Suggested-by: David Blaikie Reviewed-by: Fangrui Song Tested-by: Sedat Dilek Signed-off-by: Nick Desaulniers Signed-off-by: Masahiro Yamada --- Makefile | 6 ++++++ lib/Kconfig.debug | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/Makefile b/Makefile index 48a2dfaf3bf3..2df903429d31 100644 --- a/Makefile +++ b/Makefile @@ -825,6 +825,12 @@ DEBUG_CFLAGS += $(call cc-option, -femit-struct-debug-baseonly) \ $(call cc-option,-fno-var-tracking) endif +ifdef CONFIG_DEBUG_INFO_COMPRESSED +DEBUG_CFLAGS += -gz=zlib +KBUILD_AFLAGS += -Wa,--compress-debug-sections=zlib +KBUILD_LDFLAGS += --compress-debug-sections=zlib +endif + KBUILD_CFLAGS += $(DEBUG_CFLAGS) export DEBUG_CFLAGS diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 21d9c5f6e7ec..d7da101bccc8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -213,6 +213,23 @@ config DEBUG_INFO_REDUCED DEBUG_INFO build and compile times are reduced too. Only works with newer gcc versions. +config DEBUG_INFO_COMPRESSED + bool "Compressed debugging information" + depends on DEBUG_INFO + depends on $(cc-option,-gz=zlib) + depends on $(as-option,-Wa$(comma)--compress-debug-sections=zlib) + depends on $(ld-option,--compress-debug-sections=zlib) + help + Compress the debug information using zlib. Requires GCC 5.0+ or Clang + 5.0+, binutils 2.26+, and zlib. + + Users of dpkg-deb via scripts/package/builddeb may find an increase in + size of their debug .deb packages with this config set, due to the + debug info being compressed with zlib, then the object files being + recompressed with a different compression scheme. 
But this is still + preferable to setting $KDEB_COMPRESS to "none" which would be even + larger. + config DEBUG_INFO_SPLIT bool "Produce split debuginfo in .dwo files" depends on DEBUG_INFO From 1c1dbb2c02623db18a50c61b175f19aead800b4e Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Thu, 21 May 2020 11:13:49 +0800 Subject: [PATCH 223/427] PCI: tegra194: Fix runtime PM imbalance on error pm_runtime_get_sync() increments the runtime PM usage counter even when it returns an error code. Thus a pairing decrement is needed on the error handling path to keep the counter balanced. Link: https://lore.kernel.org/r/20200521031355.7022-1-dinghao.liu@zju.edu.cn Signed-off-by: Dinghao Liu Signed-off-by: Lorenzo Pieralisi Acked-by: Thierry Reding Acked-by: Vidya Sagar --- drivers/pci/controller/dwc/pcie-tegra194.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index ae30a2fd3716..2c0d2ce16b47 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -1623,7 +1623,7 @@ static int tegra_pcie_config_rp(struct tegra_pcie_dw *pcie) ret = pinctrl_pm_select_default_state(dev); if (ret < 0) { dev_err(dev, "Failed to configure sideband pins: %d\n", ret); - goto fail_pinctrl; + goto fail_pm_get_sync; } tegra_pcie_init_controller(pcie); @@ -1650,9 +1650,8 @@ static int tegra_pcie_config_rp(struct tegra_pcie_dw *pcie) fail_host_init: tegra_pcie_deinit_controller(pcie); -fail_pinctrl: - pm_runtime_put_sync(dev); fail_pm_get_sync: + pm_runtime_put_sync(dev); pm_runtime_disable(dev); return ret; } From d8bb65ab70f702531aaaa11d9710f9450078e295 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 27 May 2020 21:46:32 +0200 Subject: [PATCH 224/427] workqueue: Use rcuwait for wq_manager_wait The workqueue code has its internal spinlock (pool::lock) and also implicit spinlock usage in the wq_manager waitqueue.
These spinlocks are converted to 'sleeping' spinlocks on a RT-kernel. Workqueue functions can be invoked from contexts which are truly atomic even on a PREEMPT_RT enabled kernel. Taking sleeping locks from such contexts is forbidden. pool::lock can be converted to a raw spinlock as the lock held times are short. But the workqueue manager waitqueue is handled inside of pool::lock held regions which again violates the lock nesting rules of raw and regular spinlocks. The manager waitqueue has no special requirements like custom wakeup callbacks or mass wakeups. While it does not use exclusive wait mode explicitly there is no strict requirement to queue the waiters in a particular order as there is only one waiter at a time. This allows to replace the waitqueue with rcuwait which solves the locking problem because rcuwait relies on existing locking. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7a1fc9fe6314..7c3566f8e4ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -301,7 +301,8 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ -static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ +/* wait for manager to go away */ +static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? 
*/ @@ -2140,7 +2141,7 @@ static bool manage_workers(struct worker *worker) pool->manager = NULL; pool->flags &= ~POOL_MANAGER_ACTIVE; - wake_up(&wq_manager_wait); + rcuwait_wake_up(&manager_wait); return true; } @@ -3503,6 +3504,18 @@ static void rcu_free_pool(struct rcu_head *rcu) kfree(pool); } +/* This returns with the lock held on success (pool manager is inactive). */ +static bool wq_manager_inactive(struct worker_pool *pool) +{ + spin_lock_irq(&pool->lock); + + if (pool->flags & POOL_MANAGER_ACTIVE) { + spin_unlock_irq(&pool->lock); + return false; + } + return true; +} + /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put @@ -3538,10 +3551,11 @@ static void put_unbound_pool(struct worker_pool *pool) * Become the manager and destroy all workers. This prevents * @pool's workers from blocking on attach_mutex. We're the last * manager and @pool gets freed with the flag set. + * Because of how wq_manager_inactive() works, we will hold the + * spinlock after a successful wait. */ - spin_lock_irq(&pool->lock); - wait_event_lock_irq(wq_manager_wait, - !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); + rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool), + TASK_UNINTERRUPTIBLE); pool->flags |= POOL_MANAGER_ACTIVE; while ((worker = first_idle_worker(pool))) From a9b8a985294debae00f6c087dfec8c384d30a3b9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 27 May 2020 21:46:33 +0200 Subject: [PATCH 225/427] workqueue: Convert the pool::lock and wq_mayday_lock to raw_spinlock_t The workqueue code has its internal spinlocks (pool::lock), which are acquired on most workqueue operations. These spinlocks are converted to 'sleeping' spinlocks on a RT-kernel. Workqueue functions can be invoked from contexts which are truly atomic even on a PREEMPT_RT enabled kernel. Taking sleeping locks from such contexts is forbidden.
The pool::lock hold times are bound and the code sections are relatively short, which allows to convert pool::lock and as a consequence wq_mayday_lock to raw spinlocks which are truly spinning locks even on a PREEMPT_RT kernel. With the previous conversion of the manager waitqueue to a simple waitqueue workqueues are now fully RT compliant. Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 176 ++++++++++++++++++++++----------------------- 1 file changed, 88 insertions(+), 88 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7c3566f8e4ca..82f85f5d81a8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -145,7 +145,7 @@ enum { /* struct worker is defined in workqueue_internal.h */ struct worker_pool { - spinlock_t lock; /* the pool lock */ + raw_spinlock_t lock; /* the pool lock */ int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ @@ -300,7 +300,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ -static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ +static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ /* wait for manager to go away */ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); @@ -827,7 +827,7 @@ static struct worker *first_idle_worker(struct worker_pool *pool) * Wake up the first idle worker of @pool. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). 
*/ static void wake_up_worker(struct worker_pool *pool) { @@ -882,7 +882,7 @@ void wq_worker_sleeping(struct task_struct *task) return; worker->sleeping = 1; - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, @@ -901,7 +901,7 @@ void wq_worker_sleeping(struct task_struct *task) if (next) wake_up_process(next->task); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } /** @@ -912,7 +912,7 @@ void wq_worker_sleeping(struct task_struct *task) * the scheduler to get a worker's last known identity. * * CONTEXT: - * spin_lock_irq(rq->lock) + * raw_spin_lock_irq(rq->lock) * * This function is called during schedule() when a kworker is going * to sleep. It's used by psi to identify aggregation workers during @@ -943,7 +943,7 @@ work_func_t wq_worker_last_func(struct task_struct *task) * Set @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(pool->lock) + * raw_spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { @@ -968,7 +968,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) * Clear @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(pool->lock) + * raw_spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { @@ -1016,7 +1016,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). * * Return: * Pointer to worker which is executing @work if found, %NULL @@ -1051,7 +1051,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, * nested inside outer list_for_each_entry_safe(). * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). 
*/ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) @@ -1129,9 +1129,9 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); put_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); } } @@ -1164,7 +1164,7 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) * decrement nr_in_flight of its pwq and handle workqueue flushing. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { @@ -1263,7 +1263,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!pool) goto fail; - spin_lock(&pool->lock); + raw_spin_lock(&pool->lock); /* * work->data is guaranteed to point to pwq only while the work * item is queued on pwq->wq, and both updating work->data to point @@ -1292,11 +1292,11 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); - spin_unlock(&pool->lock); + raw_spin_unlock(&pool->lock); rcu_read_unlock(); return 1; } - spin_unlock(&pool->lock); + raw_spin_unlock(&pool->lock); fail: rcu_read_unlock(); local_irq_restore(*flags); @@ -1317,7 +1317,7 @@ fail: * work_struct flags. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). 
*/ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) @@ -1434,7 +1434,7 @@ retry: if (last_pool && last_pool != pwq->pool) { struct worker *worker; - spin_lock(&last_pool->lock); + raw_spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); @@ -1442,11 +1442,11 @@ retry: pwq = worker->current_pwq; } else { /* meh... not running there, queue here */ - spin_unlock(&last_pool->lock); - spin_lock(&pwq->pool->lock); + raw_spin_unlock(&last_pool->lock); + raw_spin_lock(&pwq->pool->lock); } } else { - spin_lock(&pwq->pool->lock); + raw_spin_lock(&pwq->pool->lock); } /* @@ -1459,7 +1459,7 @@ retry: */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { - spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pwq->pool->lock); cpu_relax(); goto retry; } @@ -1491,7 +1491,7 @@ retry: insert_work(pwq, work, worklist, work_flags); out: - spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pwq->pool->lock); rcu_read_unlock(); } @@ -1760,7 +1760,7 @@ EXPORT_SYMBOL(queue_rcu_work); * necessary. * * LOCKING: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void worker_enter_idle(struct worker *worker) { @@ -1800,7 +1800,7 @@ static void worker_enter_idle(struct worker *worker) * @worker is leaving idle state. Update stats. * * LOCKING: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void worker_leave_idle(struct worker *worker) { @@ -1938,11 +1938,11 @@ static struct worker *create_worker(struct worker_pool *pool) worker_attach_to_pool(worker, pool); /* start the newly created worker */ - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); return worker; @@ -1961,7 +1961,7 @@ fail: * be idle. * * CONTEXT: - * spin_lock_irq(pool->lock). 
+ * raw_spin_lock_irq(pool->lock). */ static void destroy_worker(struct worker *worker) { @@ -1987,7 +1987,7 @@ static void idle_worker_timeout(struct timer_list *t) { struct worker_pool *pool = from_timer(pool, t, idle_timer); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); while (too_many_workers(pool)) { struct worker *worker; @@ -2005,7 +2005,7 @@ static void idle_worker_timeout(struct timer_list *t) destroy_worker(worker); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } static void send_mayday(struct work_struct *work) @@ -2036,8 +2036,8 @@ static void pool_mayday_timeout(struct timer_list *t) struct worker_pool *pool = from_timer(pool, t, mayday_timer); struct work_struct *work; - spin_lock_irq(&pool->lock); - spin_lock(&wq_mayday_lock); /* for wq->maydays */ + raw_spin_lock_irq(&pool->lock); + raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* @@ -2050,8 +2050,8 @@ static void pool_mayday_timeout(struct timer_list *t) send_mayday(work); } - spin_unlock(&wq_mayday_lock); - spin_unlock_irq(&pool->lock); + raw_spin_unlock(&wq_mayday_lock); + raw_spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -2070,7 +2070,7 @@ static void pool_mayday_timeout(struct timer_list *t) * may_start_working() %true. * * LOCKING: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. 
*/ @@ -2079,7 +2079,7 @@ __releases(&pool->lock) __acquires(&pool->lock) { restart: - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); @@ -2095,7 +2095,7 @@ restart: } del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully * created as @pool->lock was dropped and the new worker might have @@ -2118,7 +2118,7 @@ restart: * and may_start_working() is true. * * CONTEXT: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * Return: @@ -2157,7 +2157,7 @@ static bool manage_workers(struct worker *worker) * call this function to process a work. * * CONTEXT: - * spin_lock_irq(pool->lock) which is released and regrabbed. + * raw_spin_lock_irq(pool->lock) which is released and regrabbed. */ static void process_one_work(struct worker *worker, struct work_struct *work) __releases(&pool->lock) @@ -2239,7 +2239,7 @@ __acquires(&pool->lock) */ set_work_pool_and_clear_pending(work, pool->id); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); @@ -2294,7 +2294,7 @@ __acquires(&pool->lock) */ cond_resched(); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* clear cpu intensive status */ if (unlikely(cpu_intensive)) @@ -2320,7 +2320,7 @@ __acquires(&pool->lock) * fetches a work from the top and executes it. * * CONTEXT: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. 
*/ static void process_scheduled_works(struct worker *worker) @@ -2362,11 +2362,11 @@ static int worker_thread(void *__worker) /* tell the scheduler that this is a workqueue worker */ set_pf_worker(true); woke_up: - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); set_pf_worker(false); @@ -2432,7 +2432,7 @@ sleep: */ worker_enter_idle(worker); __set_current_state(TASK_IDLE); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } @@ -2486,7 +2486,7 @@ repeat: should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, @@ -2498,11 +2498,11 @@ repeat: __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); worker_attach_to_pool(rescuer, pool); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * Slurp in all works issued via this workqueue and @@ -2531,7 +2531,7 @@ repeat: * incur MAYDAY_INTERVAL delay inbetween. */ if (need_to_create_worker(pool)) { - spin_lock(&wq_mayday_lock); + raw_spin_lock(&wq_mayday_lock); /* * Queue iff we aren't racing destruction * and somebody else hasn't queued it already. 
@@ -2540,7 +2540,7 @@ repeat: get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); } - spin_unlock(&wq_mayday_lock); + raw_spin_unlock(&wq_mayday_lock); } } @@ -2558,14 +2558,14 @@ repeat: if (need_more_worker(pool)) wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); worker_detach_from_pool(rescuer); - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); } - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); if (should_stop) { __set_current_state(TASK_RUNNING); @@ -2645,7 +2645,7 @@ static void wq_barrier_func(struct work_struct *work) * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, @@ -2732,7 +2732,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); @@ -2749,7 +2749,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, pwq->work_color = work_color; } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) @@ -2949,9 +2949,9 @@ reflush: for_each_pwq(pwq, wq) { bool drained; - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); drained = !pwq->nr_active && list_empty(&pwq->delayed_works); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); if (drained) continue; @@ -2987,7 +2987,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, return false; } - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -3003,7 
+3003,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, check_flush_dependency(pwq->wq, work); insert_wq_barrier(pwq, barr, work, worker); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); /* * Force a lock recursion deadlock when using flush_work() inside a @@ -3022,7 +3022,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, rcu_read_unlock(); return true; already_gone: - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); rcu_read_unlock(); return false; } @@ -3415,7 +3415,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, */ static int init_worker_pool(struct worker_pool *pool) { - spin_lock_init(&pool->lock); + raw_spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; pool->node = NUMA_NO_NODE; @@ -3507,10 +3507,10 @@ static void rcu_free_pool(struct rcu_head *rcu) /* This returns with the lock held on success (pool manager is inactive). */ static bool wq_manager_inactive(struct worker_pool *pool) { - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); if (pool->flags & POOL_MANAGER_ACTIVE) { - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); return false; } return true; @@ -3561,7 +3561,7 @@ static void put_unbound_pool(struct worker_pool *pool) while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); mutex_lock(&wq_pool_attach_mutex); if (!list_empty(&pool->workers)) @@ -3717,7 +3717,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) return; /* this function can be called during early boot w/ irq disabled */ - spin_lock_irqsave(&pwq->pool->lock, flags); + raw_spin_lock_irqsave(&pwq->pool->lock, flags); /* * During [un]freezing, the caller is responsible for ensuring that @@ -3740,7 +3740,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = 0; } - 
spin_unlock_irqrestore(&pwq->pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); } /* initialize newly alloced @pwq which is associated with @wq and @pool */ @@ -4142,9 +4142,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, use_dfl_pwq: mutex_lock(&wq->mutex); - spin_lock_irq(&wq->dfl_pwq->pool->lock); + raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); - spin_unlock_irq(&wq->dfl_pwq->pool->lock); + raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); @@ -4373,9 +4373,9 @@ void destroy_workqueue(struct workqueue_struct *wq) struct worker *rescuer = wq->rescuer; /* this prevents new queueing */ - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); wq->rescuer = NULL; - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); /* rescuer will empty maydays list before exiting */ kthread_stop(rescuer->task); @@ -4389,18 +4389,18 @@ void destroy_workqueue(struct workqueue_struct *wq) mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { pr_warn("%s: %s has the following busy pwq\n", __func__, wq->name); show_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); show_workqueue_state(); return; } - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); @@ -4571,10 +4571,10 @@ unsigned int work_busy(struct work_struct *work) rcu_read_lock(); pool = get_work_pool(work); if (pool) { - spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&pool->lock, flags); + 
raw_spin_unlock_irqrestore(&pool->lock, flags); } rcu_read_unlock(); @@ -4781,10 +4781,10 @@ void show_workqueue_state(void) pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); for_each_pwq(pwq, wq) { - spin_lock_irqsave(&pwq->pool->lock, flags); + raw_spin_lock_irqsave(&pwq->pool->lock, flags); if (pwq->nr_active || !list_empty(&pwq->delayed_works)) show_pwq(pwq); - spin_unlock_irqrestore(&pwq->pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_workqueue_state(). Avoid triggering @@ -4798,7 +4798,7 @@ void show_workqueue_state(void) struct worker *worker; bool first = true; - spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; @@ -4817,7 +4817,7 @@ void show_workqueue_state(void) } pr_cont("\n"); next_pool: - spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_workqueue_state(). Avoid triggering @@ -4847,7 +4847,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) struct worker_pool *pool = worker->pool; if (pool) { - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * ->desc tracks information (wq name or * set_worker_desc()) for the latest execution. If @@ -4861,7 +4861,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) scnprintf(buf + off, size - off, "-%s", worker->desc); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } } @@ -4892,7 +4892,7 @@ static void unbind_workers(int cpu) for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&wq_pool_attach_mutex); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * We've blocked all attach/detach operations. 
Make all workers @@ -4906,7 +4906,7 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); mutex_unlock(&wq_pool_attach_mutex); /* @@ -4932,9 +4932,9 @@ static void unbind_workers(int cpu) * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } } @@ -4961,7 +4961,7 @@ static void rebind_workers(struct worker_pool *pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; @@ -5000,7 +5000,7 @@ static void rebind_workers(struct worker_pool *pool) WRITE_ONCE(worker->flags, worker_flags); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } /** From 4f3f4cf388f8fda7ee8ea7c6af0ff0ebb2d05fe4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 29 May 2020 06:58:59 +0000 Subject: [PATCH 226/427] workqueue: avoid unneeded requeuing the pwq in rescuer thread 008847f66c3 ("workqueue: allow rescuer thread to do more work.") made the rescuer worker requeue the pwq immediately if there may be more work items which need rescuing instead of waiting for the next mayday timer expiration. Unfortunately, it checks only whether the pool needs help from rescuers, but it doesn't check whether the pwq has work items in the pool (the real reason that this rescuer can help for the pool). The patch adds the check and avoids unneeded requeuing.
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 82f85f5d81a8..6feefc65d332 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2530,7 +2530,7 @@ repeat: * being used to relieve memory pressure, don't * incur MAYDAY_INTERVAL delay inbetween. */ - if (need_to_create_worker(pool)) { + if (pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); /* * Queue iff we aren't racing destruction From b8f06b0444ec146e3ae98caac8039c77e5308ce2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 29 May 2020 06:59:02 +0000 Subject: [PATCH 227/427] workqueue: remove useless unlock() and lock() in series There is no point in calling unlock() and then lock() on the same mutex back to back. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6feefc65d332..c667ca5aed61 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4403,13 +4403,11 @@ void destroy_workqueue(struct workqueue_struct *wq) raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); - mutex_unlock(&wq_pool_mutex); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - mutex_lock(&wq_pool_mutex); list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); From 3e5095eebe015d5a4d566aa5e03c8621add5f0a7 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 27 May 2020 23:02:39 -0400 Subject: [PATCH 228/427] PCI: vmd: Filter resource type bits from shadow register Versions of VMD with the Host Physical Address shadow register use this register to calculate the bus address offset needed to do guest passthrough of the domain. This register shadows the Host Physical Address registers including the resource type bits.
After calculating the offset, the extra resource type bits lead to the VMD resources being over-provisioned at the front and under-provisioned at the back. Example: pci 10000:80:02.0: reg 0x10: [mem 0xf801fffc-0xf803fffb 64bit] Expected: pci 10000:80:02.0: reg 0x10: [mem 0xf8020000-0xf803ffff 64bit] If other devices are mapped in the over-provisioned front, it could lead to resource conflict issues with VMD or those devices. Link: https://lore.kernel.org/r/20200528030240.16024-3-jonathan.derrick@intel.com Fixes: a1a30170138c9 ("PCI: vmd: Fix shadow offsets to reflect spec changes") Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/vmd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index dac91d60701d..e386d4eac407 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -445,9 +445,11 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) if (!membar2) return -ENOMEM; offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - - readq(membar2 + MB2_SHADOW_OFFSET); + (readq(membar2 + MB2_SHADOW_OFFSET) & + PCI_BASE_ADDRESS_MEM_MASK); offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - - readq(membar2 + MB2_SHADOW_OFFSET + 8); + (readq(membar2 + MB2_SHADOW_OFFSET + 8) & + PCI_BASE_ADDRESS_MEM_MASK); pci_iounmap(vmd->dev, membar2); } } From 22ce85611fd5a793edf84bfc0a101077cbe85e4f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 24 May 2020 23:48:02 +0100 Subject: [PATCH 229/427] orangefs: remove redundant assignment to variable ret The variable ret is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. 
Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-mod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index c010c1fddafc..289b648ae196 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -79,7 +79,7 @@ DECLARE_WAIT_QUEUE_HEAD(orangefs_request_list_waitq); static int __init orangefs_init(void) { - int ret = -1; + int ret; __u32 i = 0; if (op_timeout_secs < 0) From 0df556457748d160013e88202c11712c16a83b0c Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 22 May 2020 20:59:09 -0700 Subject: [PATCH 230/427] orangefs: convert get_user_pages() --> pin_user_pages() This code was using get_user_pages*(), in a "Case 1" scenario (Direct IO), using the categorization from [1]. That means that it's time to convert the get_user_pages*() + put_page() calls to pin_user_pages*() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. 
[1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Cc: Mike Marshall Cc: Martin Brandenburg Cc: devel@lists.orangefs.org Cc: linux-fsdevel@vger.kernel.org Signed-off-by: John Hubbard Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-bufmap.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 2bb916d68576..538e839590ef 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -168,10 +168,7 @@ static DEFINE_SPINLOCK(orangefs_bufmap_lock); static void orangefs_bufmap_unmap(struct orangefs_bufmap *bufmap) { - int i; - - for (i = 0; i < bufmap->page_count; i++) - put_page(bufmap->page_array[i]); + unpin_user_pages(bufmap->page_array, bufmap->page_count); } static void @@ -268,7 +265,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap, int offset = 0, ret, i; /* map the pages */ - ret = get_user_pages_fast((unsigned long)user_desc->ptr, + ret = pin_user_pages_fast((unsigned long)user_desc->ptr, bufmap->page_count, FOLL_WRITE, bufmap->page_array); if (ret < 0) @@ -280,7 +277,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap, for (i = 0; i < ret; i++) { SetPageError(bufmap->page_array[i]); - put_page(bufmap->page_array[i]); + unpin_user_page(bufmap->page_array[i]); } return -ENOMEM; } From 3f19b2ab97a97b413c24b66c67ae16daa4f56c35 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 1 Dec 2017 11:40:16 +0000 Subject: [PATCH 231/427] vfs, afs, ext4: Make the inode hash table RCU searchable Make the inode hash table RCU searchable so that searches that want to access or modify an inode without taking a ref on that inode can do so without taking the inode hash table lock. The main thing this requires is some RCU annotation on the list manipulation operations. Inodes are already freed by RCU in most cases. 
Users of this interface must take care as the inode may be still under construction or may be being torn down around them. There are at least three instances where this can be of use: (1) Testing whether the inode number iunique() is going to return is currently unique (the iunique_lock is still held). (2) Ext4 date stamp updating. (3) AFS callback breaking. Signed-off-by: David Howells Acked-by: Konstantin Khlebnikov cc: linux-ext4@vger.kernel.org cc: linux-afs@lists.infradead.org --- fs/afs/callback.c | 12 +++-- fs/ext4/inode.c | 44 +++++++++--------- fs/inode.c | 112 ++++++++++++++++++++++++++++++++++++++------- include/linux/fs.h | 3 ++ 4 files changed, 130 insertions(+), 41 deletions(-) diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 2dca8df1a18d..0dcbd40732d1 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -252,6 +252,7 @@ static void afs_break_one_callback(struct afs_server *server, struct afs_vnode *vnode; struct inode *inode; + rcu_read_lock(); read_lock(&server->cb_break_lock); hlist_for_each_entry(vi, &server->cb_volumes, srv_link) { if (vi->vid < fid->vid) @@ -287,12 +288,16 @@ static void afs_break_one_callback(struct afs_server *server, } else { data.volume = NULL; data.fid = *fid; - inode = ilookup5_nowait(cbi->sb, fid->vnode, - afs_iget5_test, &data); + + /* See if we can find a matching inode - even an I_NEW + * inode needs to be marked as it can have its callback + * broken before we finish setting up the local inode. 
+ */ + inode = find_inode_rcu(cbi->sb, fid->vnode, + afs_iget5_test, &data); if (inode) { vnode = AFS_FS_I(inode); afs_break_callback(vnode, afs_cb_break_for_callback); - iput(inode); } else { trace_afs_cb_miss(fid, afs_cb_break_for_callback); } @@ -301,6 +306,7 @@ static void afs_break_one_callback(struct afs_server *server, out: read_unlock(&server->cb_break_lock); + rcu_read_unlock(); } /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4aae6acdcb..2bbb55d05bb7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4860,21 +4860,22 @@ static int ext4_inode_blocks_set(handle_t *handle, return 0; } -struct other_inode { - unsigned long orig_ino; - struct ext4_inode *raw_inode; -}; - -static int other_inode_match(struct inode * inode, unsigned long ino, - void *data) +static void __ext4_update_other_inode_time(struct super_block *sb, + unsigned long orig_ino, + unsigned long ino, + struct ext4_inode *raw_inode) { - struct other_inode *oi = (struct other_inode *) data; + struct inode *inode; - if ((inode->i_ino != ino) || - (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | + inode = find_inode_by_ino_rcu(sb, ino); + if (!inode) + return; + + if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | I_DIRTY_INODE)) || ((inode->i_state & I_DIRTY_TIME) == 0)) - return 0; + return; + spin_lock(&inode->i_lock); if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | I_DIRTY_INODE)) == 0) && @@ -4885,16 +4886,15 @@ static int other_inode_match(struct inode * inode, unsigned long ino, spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); - EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode); - ext4_inode_csum_set(inode, oi->raw_inode, ei); + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + ext4_inode_csum_set(inode, raw_inode, ei); 
spin_unlock(&ei->i_raw_lock); - trace_ext4_other_inode_update_time(inode, oi->orig_ino); - return -1; + trace_ext4_other_inode_update_time(inode, orig_ino); + return; } spin_unlock(&inode->i_lock); - return -1; } /* @@ -4904,24 +4904,24 @@ static int other_inode_match(struct inode * inode, unsigned long ino, static void ext4_update_other_inodes_time(struct super_block *sb, unsigned long orig_ino, char *buf) { - struct other_inode oi; unsigned long ino; int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int inode_size = EXT4_INODE_SIZE(sb); - oi.orig_ino = orig_ino; /* * Calculate the first inode in the inode table block. Inode * numbers are one-based. That is, the first inode in a block * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). */ ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; + rcu_read_lock(); for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; - oi.raw_inode = (struct ext4_inode *) buf; - (void) find_inode_nowait(sb, ino, other_inode_match, &oi); + __ext4_update_other_inode_time(sb, orig_ino, ino, + (struct ext4_inode *)buf); } + rcu_read_unlock(); } /* diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..b7bd7162c902 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -497,7 +497,7 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); - hlist_add_head(&inode->i_hash, b); + hlist_add_head_rcu(&inode->i_hash, b); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } @@ -513,7 +513,7 @@ void __remove_inode_hash(struct inode *inode) { spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); - hlist_del_init(&inode->i_hash); + hlist_del_init_rcu(&inode->i_hash); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } @@ -1107,7 +1107,7 @@ again: */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; - hlist_add_head(&inode->i_hash, head); + hlist_add_head_rcu(&inode->i_hash, head); 
spin_unlock(&inode->i_lock); if (!creating) inode_sb_list_add(inode); @@ -1201,7 +1201,7 @@ again: inode->i_ino = ino; spin_lock(&inode->i_lock); inode->i_state = I_NEW; - hlist_add_head(&inode->i_hash, head); + hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); inode_sb_list_add(inode); spin_unlock(&inode_hash_lock); @@ -1244,15 +1244,10 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino) struct hlist_head *b = inode_hashtable + hash(sb, ino); struct inode *inode; - spin_lock(&inode_hash_lock); - hlist_for_each_entry(inode, b, i_hash) { - if (inode->i_ino == ino && inode->i_sb == sb) { - spin_unlock(&inode_hash_lock); + hlist_for_each_entry_rcu(inode, b, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) return 0; - } } - spin_unlock(&inode_hash_lock); - return 1; } @@ -1281,6 +1276,7 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) static unsigned int counter; ino_t res; + rcu_read_lock(); spin_lock(&iunique_lock); do { if (counter <= max_reserved) @@ -1288,6 +1284,7 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) res = counter++; } while (!test_inode_iunique(sb, res)); spin_unlock(&iunique_lock); + rcu_read_unlock(); return res; } @@ -1456,6 +1453,84 @@ out: } EXPORT_SYMBOL(find_inode_nowait); +/** + * find_inode_rcu - find an inode in the inode cache + * @sb: Super block of file system to search + * @hashval: Key to hash + * @test: Function to test match on an inode + * @data: Data for test function + * + * Search for the inode specified by @hashval and @data in the inode cache, + * where the helper function @test will return 0 if the inode does not match + * and 1 if it does. The @test function must be responsible for taking the + * i_lock spin_lock and checking i_state for an inode being freed or being + * initialized. + * + * If successful, this will return the inode for which the @test function + * returned 1 and NULL otherwise. 
+ * + * The @test function is not permitted to take a ref on any inode presented. + * It is also not permitted to sleep. + * + * The caller must hold the RCU read lock. + */ +struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_rcu() usage"); + + hlist_for_each_entry_rcu(inode, head, i_hash) { + if (inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && + test(inode, data)) + return inode; + } + return NULL; +} +EXPORT_SYMBOL(find_inode_rcu); + +/** + * find_inode_by_ino_rcu - Find an inode in the inode cache + * @sb: Super block of file system to search + * @ino: The inode number to match + * + * Search for the inode with inode number @ino on super block @sb in the + * inode cache. Inodes marked I_FREEING or I_WILL_FREE are skipped, but the + * i_lock is not taken here. The caller must be responsible for taking the + * i_lock spin_lock and checking i_state for an inode being freed or being + * initialized. + * + * If successful, this will return the matching inode and NULL otherwise. + * No ref is taken on the inode returned. + * + * The caller is not permitted to sleep while it is using the returned inode + * under the RCU read lock. + * + * The caller must hold the RCU read lock.
+ */ +struct inode *find_inode_by_ino_rcu(struct super_block *sb, + unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_by_ino_rcu() usage"); + + hlist_for_each_entry_rcu(inode, head, i_hash) { + if (inode->i_ino == ino && + inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) + return inode; + } + return NULL; +} +EXPORT_SYMBOL(find_inode_by_ino_rcu); + int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -1480,7 +1555,7 @@ int insert_inode_locked(struct inode *inode) if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW | I_CREATING; - hlist_add_head(&inode->i_hash, head); + hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); return 0; @@ -1540,6 +1615,7 @@ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; + unsigned long state; int drop; WARN_ON(inode->i_state & I_NEW); @@ -1555,16 +1631,20 @@ static void iput_final(struct inode *inode) return; } + state = inode->i_state; if (!drop) { - inode->i_state |= I_WILL_FREE; + WRITE_ONCE(inode->i_state, state | I_WILL_FREE); spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + spin_lock(&inode->i_lock); - WARN_ON(inode->i_state & I_NEW); - inode->i_state &= ~I_WILL_FREE; + state = inode->i_state; + WARN_ON(state & I_NEW); + state &= ~I_WILL_FREE; } - inode->i_state |= I_FREEING; + WRITE_ONCE(inode->i_state, state | I_FREEING); if (!list_empty(&inode->i_lru)) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 45cc10cdf6dd..5f9b2bb4b44f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3070,6 +3070,9 @@ extern struct inode *find_inode_nowait(struct super_block *, int (*match)(struct inode *, unsigned long, 
void *), void *data); +extern struct inode *find_inode_rcu(struct super_block *, unsigned long, + int (*)(struct inode *, void *), void *); +extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC From 23e2db311a10ba66c439ddac7a703991309702ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 2 May 2020 13:31:19 +0100 Subject: [PATCH 232/427] rxrpc: Map the EACCES error produced by some ICMP6 to EHOSTUNREACH Map the EACCES error that is produced by some ICMP6 packets to EHOSTUNREACH when we get them as EACCES has other meanings within a filesystem context. Signed-off-by: David Howells --- net/rxrpc/peer_event.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index b1449d971883..112e490ebbcd 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -271,6 +271,9 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, break; case SO_EE_ORIGIN_ICMP6: + if (err == EACCES) + err = EHOSTUNREACH; + /* Fall through */ default: _proto("Rx Received error report { orig=%u }", ee->ee_origin); break; From 32f71aa497cfb23d37149c2ef16ad71fce2e45e2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 2 May 2020 13:38:23 +0100 Subject: [PATCH 233/427] rxrpc: Adjust /proc/net/rxrpc/calls to display call->debug_id not user_ID The user ID value isn't actually much use - and leaks a kernel pointer or a userspace value - so replace it with the call debug ID, which appears in trace points. 
Signed-off-by: David Howells --- net/rxrpc/proc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 8b179e3c802a..543afd9bd664 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -68,7 +68,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) "Proto Local " " Remote " " SvID ConnID CallID End Use State Abort " - " UserID TxSeq TW RxSeq RW RxSerial RxTimo\n"); + " DebugId TxSeq TW RxSeq RW RxSerial RxTimo\n"); return 0; } @@ -100,7 +100,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) rx_hard_ack = READ_ONCE(call->rx_hard_ack); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" - " %-8.8s %08x %lx %08x %02x %08x %02x %08x %06lx\n", + " %-8.8s %08x %08x %08x %02x %08x %02x %08x %06lx\n", lbuff, rbuff, call->service_id, @@ -110,7 +110,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) atomic_read(&call->usage), rxrpc_call_states[call->state], call->abort_code, - call->user_call_ID, + call->debug_id, tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack, rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack, call->rx_serial, From 13fcc6356a94558a0a4857dc00cd26b3834a1b3e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Apr 2020 14:20:32 +0100 Subject: [PATCH 234/427] afs: Always include dir in bulk status fetch from afs_do_lookup() When a lookup is done in an AFS directory, the filesystem will speculate and fetch up to 49 other statuses for files in the same directory and fetch those as well, turning them into inodes or updating inodes that already exist. However, occasionally, a callback break might go missing due to NAT timing out, but the afs filesystem doesn't then realise that the directory is not up to date. Alleviate this by using one of the status slots to check the directory in which the lookup is being done. 
Reported-by: Dave Botsch Suggested-by: Jeffrey Altman Signed-off-by: David Howells --- fs/afs/dir.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index d1e1caa23c8b..3c486340b220 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -658,7 +658,8 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, cookie->ctx.actor = afs_lookup_filldir; cookie->name = dentry->d_name; - cookie->nr_fids = 1; /* slot 0 is saved for the fid we actually want */ + cookie->nr_fids = 2; /* slot 0 is saved for the fid we actually want + * and slot 1 for the directory */ read_seqlock_excl(&dvnode->cb_lock); dcbi = rcu_dereference_protected(dvnode->cb_interest, @@ -709,7 +710,11 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, if (!cookie->inodes) goto out_s; - for (i = 1; i < cookie->nr_fids; i++) { + cookie->fids[1] = dvnode->fid; + cookie->statuses[1].cb_break = afs_calc_vnode_cb_break(dvnode); + cookie->inodes[1] = igrab(&dvnode->vfs_inode); + + for (i = 2; i < cookie->nr_fids; i++) { scb = &cookie->statuses[i]; /* Find any inodes that already exist and get their From 810068059234551b6973b46ca572e654f0c5e665 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Apr 2020 17:05:28 +0100 Subject: [PATCH 235/427] afs: Use the serverUnique field in the UVLDB record to reduce rpc ops The U-version VLDB volume record retrieved by the VL.GetEntryByNameU rpc op carries a change counter (the serverUnique field) for each fileserver listed in the record as backing that volume. This is incremented whenever the registration details for a fileserver change (such as its address list). Note that the same value will be seen in all UVLDB records that refer to that fileserver. This should be checked before calling the VL server to re-query the address list for a fileserver. If it's the same, there's no point doing the query. 
Reported-by: Jeffrey Altman Signed-off-by: David Howells --- fs/afs/internal.h | 5 +++-- fs/afs/server.c | 26 ++++++++++++++------------ fs/afs/server_list.c | 3 ++- fs/afs/vlclient.c | 1 + 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 80255513e230..ee17c868ad2c 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -471,6 +471,7 @@ struct afs_vldb_entry { #define AFS_VLDB_QUERY_ERROR 4 /* - VL server returned error */ uuid_t fs_server[AFS_NMAXNSERVERS]; + u32 addr_version[AFS_NMAXNSERVERS]; /* Registration change counters */ u8 fs_mask[AFS_NMAXNSERVERS]; #define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ #define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ @@ -498,7 +499,6 @@ struct afs_server { struct hlist_node proc_link; /* Link in net->fs_proc */ struct afs_server *gc_next; /* Next server in manager's list */ time64_t put_time; /* Time at which last put */ - time64_t update_at; /* Time at which to next update the record */ unsigned long flags; #define AFS_SERVER_FL_NOT_READY 1 /* The record is not ready for use */ #define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */ @@ -511,6 +511,7 @@ struct afs_server { #define AFS_SERVER_FL_IS_YFS 9 /* Server is YFS not AFS */ #define AFS_SERVER_FL_NO_RM2 10 /* Fileserver doesn't support YFS.RemoveFile2 */ #define AFS_SERVER_FL_HAVE_EPOCH 11 /* ->epoch is valid */ +#define AFS_SERVER_FL_NEEDS_UPDATE 12 /* Fileserver address list is out of date */ atomic_t usage; u32 addr_version; /* Address list version */ u32 cm_epoch; /* Server RxRPC epoch */ @@ -1241,7 +1242,7 @@ extern spinlock_t afs_server_peer_lock; extern struct afs_server *afs_find_server(struct afs_net *, const struct sockaddr_rxrpc *); extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); -extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t 
*); +extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_manage_servers(struct work_struct *); diff --git a/fs/afs/server.c b/fs/afs/server.c index 11b90ac7ea30..9e50ccde5d37 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -12,7 +12,6 @@ #include "protocol_yfs.h" static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */ -static unsigned afs_server_update_delay = 30; /* Time till VLDB recheck in secs */ static atomic_t afs_server_debug_id; static void afs_inc_servers_outstanding(struct afs_net *net) @@ -218,7 +217,6 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, RCU_INIT_POINTER(server->addresses, alist); server->addr_version = alist->version; server->uuid = *uuid; - server->update_at = ktime_get_real_seconds() + afs_server_update_delay; rwlock_init(&server->fs_lock); INIT_HLIST_HEAD(&server->cb_volumes); rwlock_init(&server->cb_break_lock); @@ -264,7 +262,7 @@ static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, * Get or create a fileserver record. 
*/ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, - const uuid_t *uuid) + const uuid_t *uuid, u32 addr_version) { struct afs_addr_list *alist; struct afs_server *server, *candidate; @@ -272,8 +270,11 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, _enter("%p,%pU", cell->net, uuid); server = afs_find_server_by_uuid(cell->net, uuid); - if (server) + if (server) { + if (server->addr_version != addr_version) + set_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); return server; + } alist = afs_vl_lookup_addrs(cell, key, uuid); if (IS_ERR(alist)) @@ -558,7 +559,6 @@ static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct a write_unlock(&server->fs_lock); } - server->update_at = ktime_get_real_seconds() + afs_server_update_delay; afs_put_addrlist(discard); _leave(" = t"); return true; @@ -569,8 +569,6 @@ static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct a */ bool afs_check_server_record(struct afs_fs_cursor *fc, struct afs_server *server) { - time64_t now = ktime_get_real_seconds(); - long diff; bool success; int ret, retries = 0; @@ -579,13 +577,16 @@ bool afs_check_server_record(struct afs_fs_cursor *fc, struct afs_server *server ASSERT(server); retry: - diff = READ_ONCE(server->update_at) - now; - if (diff > 0) { - _leave(" = t [not now %ld]", diff); - return true; - } + if (test_bit(AFS_SERVER_FL_UPDATING, &server->flags)) + goto wait; + if (test_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags)) + goto update; + _leave(" = t [good]"); + return true; +update: if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) { + clear_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); success = afs_update_server_record(fc, server); clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags); wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING); @@ -593,6 +594,7 @@ retry: return success; } +wait: ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING, (fc->flags 
& AFS_FS_CURSOR_INTR) ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 888d91d195d9..f567732df5cc 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -51,7 +51,8 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, if (!(vldb->fs_mask[i] & type_mask)) continue; - server = afs_lookup_server(cell, key, &vldb->fs_server[i]); + server = afs_lookup_server(cell, key, &vldb->fs_server[i], + vldb->addr_version[i]); if (IS_ERR(server)) { ret = PTR_ERR(server); if (ret == -ENOENT || diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 516e9a3bb5b4..972dc5512f33 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -82,6 +82,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) for (j = 0; j < 6; j++) uuid->node[j] = (u8)ntohl(xdr->node[j]); + entry->addr_version[n] = ntohl(uvldb->serverUnique[i]); entry->nr_servers++; } From 977e5f8ed0ab2786755f8d2a96b78a3c7320f7c4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 17 Apr 2020 17:31:26 +0100 Subject: [PATCH 236/427] afs: Split the usage count on struct afs_server Split the usage count on the afs_server struct to have an active count that registers who's actually using it separately from the reference count on the object. This allows a future patch to dispatch polling probes without advancing the "unuse" time into the future each time we emit a probe, which would otherwise prevent unused server records from expiring. Included in this: (1) The latter part of afs_destroy_server() in which the RCU destruction of afs_server objects is invoked and the outstanding server count is decremented is split out into __afs_put_server(). (2) afs_put_server() now calls __afs_put_server() rather than setting the management timer.
(3) The calls begun by afs_fs_give_up_all_callbacks() and afs_fs_get_capabilities() can now take a ref on the server record, so afs_destroy_server() can just drop its ref and needn't wait for the completion of these calls. They'll put the ref when they're done. (4) Because of (3), afs_fs_probe_done() no longer needs to wake up afs_destroy_server() with server->probe_outstanding. (5) afs_gc_servers can be simplified. It only needs to check if server->active is 0 rather than playing games with the refcount. (6) afs_manage_servers() can propose a server for gc if usage == 0 rather than if ref == 1. The gc is effected by (5). Signed-off-by: David Howells --- fs/afs/cmservice.c | 4 +- fs/afs/fs_probe.c | 1 - fs/afs/fsclient.c | 5 +- fs/afs/internal.h | 8 +- fs/afs/proc.c | 9 ++- fs/afs/rxrpc.c | 2 +- fs/afs/server.c | 151 ++++++++++++++++++++++++------------- fs/afs/server_list.c | 4 +- include/trace/events/afs.h | 18 +++-- 9 files changed, 131 insertions(+), 71 deletions(-) diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 380ad5ace7cf..7dcbca3bf828 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -268,7 +268,9 @@ static void SRXAFSCB_CallBack(struct work_struct *work) * to maintain cache coherency. 
*/ if (call->server) { - trace_afs_server(call->server, atomic_read(&call->server->usage), + trace_afs_server(call->server, + atomic_read(&call->server->ref), + atomic_read(&call->server->active), afs_server_trace_callback); afs_break_callbacks(call->server, call->count, call->request); } diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index 37d1bba57b00..d37d78eb84bd 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -16,7 +16,6 @@ static bool afs_fs_probe_done(struct afs_server *server) if (!atomic_dec_and_test(&server->probe_outstanding)) return false; - wake_up_var(&server->probe_outstanding); clear_bit_unlock(AFS_SERVER_FL_PROBING, &server->flags); wake_up_bit(&server->flags, AFS_SERVER_FL_PROBING); return true; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index d2b3798c1932..3854d16e14b1 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1842,7 +1842,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, bp = call->request; *bp++ = htonl(FSGIVEUPALLCALLBACKS); - /* Can't take a ref on server */ + call->server = afs_use_server(server, afs_server_trace_give_up_cb); afs_make_call(ac, call, GFP_NOFS); return afs_wait_for_call_to_complete(call, ac); } @@ -1924,7 +1924,7 @@ struct afs_call *afs_fs_get_capabilities(struct afs_net *net, return ERR_PTR(-ENOMEM); call->key = key; - call->server = afs_get_server(server, afs_server_trace_get_caps); + call->server = afs_use_server(server, afs_server_trace_get_caps); call->server_index = server_index; call->upgrade = true; call->async = true; @@ -1934,7 +1934,6 @@ struct afs_call *afs_fs_get_capabilities(struct afs_net *net, bp = call->request; *bp++ = htonl(FSGETCAPABILITIES); - /* Can't take a ref on server */ trace_afs_make_fs_call(call, NULL); afs_make_call(ac, call, GFP_NOFS); return call; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index ee17c868ad2c..cb70e1c234cc 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -498,7 +498,7 @@ struct afs_server { struct hlist_node 
addr6_link; /* Link in net->fs_addresses6 */ struct hlist_node proc_link; /* Link in net->fs_proc */ struct afs_server *gc_next; /* Next server in manager's list */ - time64_t put_time; /* Time at which last put */ + time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_NOT_READY 1 /* The record is not ready for use */ #define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */ @@ -512,7 +512,8 @@ struct afs_server { #define AFS_SERVER_FL_NO_RM2 10 /* Fileserver doesn't support YFS.RemoveFile2 */ #define AFS_SERVER_FL_HAVE_EPOCH 11 /* ->epoch is valid */ #define AFS_SERVER_FL_NEEDS_UPDATE 12 /* Fileserver address list is out of date */ - atomic_t usage; + atomic_t ref; /* Object refcount */ + atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ u32 cm_epoch; /* Server RxRPC epoch */ unsigned int debug_id; /* Debugging ID for traces */ @@ -1244,6 +1245,9 @@ extern struct afs_server *afs_find_server(struct afs_net *, extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); +extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace); +extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace); +extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_manage_servers(struct work_struct *); extern void afs_servers_timer(struct timer_list *); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 468e1713bce1..9bce7898cd7d 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -378,19 +378,20 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) int i; if (v == SEQ_START_TOKEN) 
{ - seq_puts(m, "UUID USE ADDR\n"); + seq_puts(m, "UUID REF ACT ADDR\n"); return 0; } server = list_entry(v, struct afs_server, proc_link); alist = rcu_dereference(server->addresses); - seq_printf(m, "%pU %3d %pISpc%s\n", + seq_printf(m, "%pU %3d %3d %pISpc%s\n", &server->uuid, - atomic_read(&server->usage), + atomic_read(&server->ref), + atomic_read(&server->active), &alist->addrs[0].transport, alist->preferred == 0 ? "*" : ""); for (i = 1; i < alist->nr_addrs; i++) - seq_printf(m, " %pISpc%s\n", + seq_printf(m, " %pISpc%s\n", &alist->addrs[i].transport, alist->preferred == i ? "*" : ""); return 0; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 1ecc67da6c1a..ab2962fff1fb 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -183,7 +183,7 @@ void afs_put_call(struct afs_call *call) if (call->type->destructor) call->type->destructor(call); - afs_put_server(call->net, call->server, afs_server_trace_put_call); + afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); afs_put_cb_interest(call->net, call->cbi); afs_put_addrlist(call->alist); kfree(call->request); diff --git a/fs/afs/server.c b/fs/afs/server.c index 9e50ccde5d37..4969a681f8f5 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -25,6 +25,10 @@ static void afs_dec_servers_outstanding(struct afs_net *net) wake_up_var(&net->servers_outstanding); } +static struct afs_server *afs_maybe_use_server(struct afs_server *, + enum afs_server_trace); +static void __afs_put_server(struct afs_net *, struct afs_server *); + /* * Find a server by one of its addresses. 
*/ @@ -40,7 +44,7 @@ struct afs_server *afs_find_server(struct afs_net *net, do { if (server) - afs_put_server(net, server, afs_server_trace_put_find_rsq); + afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq); server = NULL; read_seqbegin_or_lock(&net->fs_addr_lock, &seq); @@ -78,9 +82,9 @@ struct afs_server *afs_find_server(struct afs_net *net, } server = NULL; + continue; found: - if (server && !atomic_inc_not_zero(&server->usage)) - server = NULL; + server = afs_maybe_use_server(server, afs_server_trace_get_by_addr); } while (need_seqretry(&net->fs_addr_lock, seq)); @@ -91,7 +95,7 @@ struct afs_server *afs_find_server(struct afs_net *net, } /* - * Look up a server by its UUID + * Look up a server by its UUID and mark it active. */ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid) { @@ -107,7 +111,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu * changes. */ if (server) - afs_put_server(net, server, afs_server_trace_put_uuid_rsq); + afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq); server = NULL; read_seqbegin_or_lock(&net->fs_lock, &seq); @@ -122,7 +126,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu } else if (diff > 0) { p = p->rb_right; } else { - afs_get_server(server, afs_server_trace_get_by_uuid); + afs_use_server(server, afs_server_trace_get_by_uuid); break; } @@ -198,7 +202,7 @@ exists: } /* - * allocate a new server record + * Allocate a new server record and mark it active. 
*/ static struct afs_server *afs_alloc_server(struct afs_net *net, const uuid_t *uuid, @@ -212,7 +216,8 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, if (!server) goto enomem; - atomic_set(&server->usage, 1); + atomic_set(&server->ref, 1); + atomic_set(&server->active, 1); server->debug_id = atomic_inc_return(&afs_server_debug_id); RCU_INIT_POINTER(server->addresses, alist); server->addr_version = alist->version; @@ -224,7 +229,7 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, spin_lock_init(&server->probe_lock); afs_inc_servers_outstanding(net); - trace_afs_server(server, 1, afs_server_trace_alloc); + trace_afs_server(server, 1, 1, afs_server_trace_alloc); _leave(" = %p", server); return server; @@ -292,7 +297,6 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, kfree(candidate); } - _leave(" = %p{%d}", server, atomic_read(&server->usage)); return server; } @@ -328,9 +332,38 @@ void afs_servers_timer(struct timer_list *timer) struct afs_server *afs_get_server(struct afs_server *server, enum afs_server_trace reason) { - unsigned int u = atomic_inc_return(&server->usage); + unsigned int u = atomic_inc_return(&server->ref); - trace_afs_server(server, u, reason); + trace_afs_server(server, u, atomic_read(&server->active), reason); + return server; +} + +/* + * Try to get a reference on a server object. + */ +static struct afs_server *afs_maybe_use_server(struct afs_server *server, + enum afs_server_trace reason) +{ + unsigned int r = atomic_fetch_add_unless(&server->ref, 1, 0); + unsigned int a; + + if (r == 0) + return NULL; + + a = atomic_inc_return(&server->active); + trace_afs_server(server, r, a, reason); + return server; +} + +/* + * Get an active count on a server object. 
+ */ +struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason) +{ + unsigned int r = atomic_inc_return(&server->ref); + unsigned int a = atomic_inc_return(&server->active); + + trace_afs_server(server, r, a, reason); return server; } @@ -345,28 +378,56 @@ void afs_put_server(struct afs_net *net, struct afs_server *server, if (!server) return; - server->put_time = ktime_get_real_seconds(); + usage = atomic_dec_return(&server->ref); + trace_afs_server(server, usage, atomic_read(&server->active), reason); + if (unlikely(usage == 0)) + __afs_put_server(net, server); +} - usage = atomic_dec_return(&server->usage); +/* + * Drop an active count on a server object without updating the last-unused + * time. + */ +void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason) +{ + if (server) { + unsigned int active = atomic_dec_return(&server->active); - trace_afs_server(server, usage, reason); + if (active == 0) + afs_set_server_timer(net, afs_server_gc_delay); + afs_put_server(net, server, reason); + } +} - if (likely(usage > 0)) - return; - - afs_set_server_timer(net, afs_server_gc_delay); +/* + * Drop an active count on a server object. 
+ */ +void afs_unuse_server(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason) +{ + if (server) { + server->unuse_time = ktime_get_real_seconds(); + afs_unuse_server_notime(net, server, reason); + } } static void afs_server_rcu(struct rcu_head *rcu) { struct afs_server *server = container_of(rcu, struct afs_server, rcu); - trace_afs_server(server, atomic_read(&server->usage), - afs_server_trace_free); + trace_afs_server(server, atomic_read(&server->ref), + atomic_read(&server->active), afs_server_trace_free); afs_put_addrlist(rcu_access_pointer(server->addresses)); kfree(server); } +static void __afs_put_server(struct afs_net *net, struct afs_server *server) +{ + call_rcu(&server->rcu, afs_server_rcu); + afs_dec_servers_outstanding(net); +} + /* * destroy a dead server */ @@ -379,19 +440,10 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) .error = 0, }; - trace_afs_server(server, atomic_read(&server->usage), - afs_server_trace_give_up_cb); - if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) afs_fs_give_up_all_callbacks(net, server, &ac, NULL); - wait_var_event(&server->probe_outstanding, - atomic_read(&server->probe_outstanding) == 0); - - trace_afs_server(server, atomic_read(&server->usage), - afs_server_trace_destroy); - call_rcu(&server->rcu, afs_server_rcu); - afs_dec_servers_outstanding(net); + afs_put_server(net, server, afs_server_trace_destroy); } /* @@ -400,31 +452,28 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) { struct afs_server *server; - bool deleted; - int usage; + int active; while ((server = gc_list)) { gc_list = server->gc_next; write_seqlock(&net->fs_lock); - usage = 1; - deleted = atomic_try_cmpxchg(&server->usage, &usage, 0); - trace_afs_server(server, usage, afs_server_trace_gc); - if (deleted) { + + active = atomic_read(&server->active); + if (active == 0) { + 
trace_afs_server(server, atomic_read(&server->ref), + active, afs_server_trace_gc); rb_erase(&server->uuid_rb, &net->fs_servers); hlist_del_rcu(&server->proc_link); - } - write_sequnlock(&net->fs_lock); - - if (deleted) { - write_seqlock(&net->fs_addr_lock); if (!hlist_unhashed(&server->addr4_link)) hlist_del_rcu(&server->addr4_link); if (!hlist_unhashed(&server->addr6_link)) hlist_del_rcu(&server->addr6_link); - write_sequnlock(&net->fs_addr_lock); - afs_destroy_server(net, server); } + write_sequnlock(&net->fs_lock); + + if (active == 0) + afs_destroy_server(net, server); } } @@ -453,15 +502,14 @@ void afs_manage_servers(struct work_struct *work) for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) { struct afs_server *server = rb_entry(cursor, struct afs_server, uuid_rb); - int usage = atomic_read(&server->usage); + int active = atomic_read(&server->active); - _debug("manage %pU %u", &server->uuid, usage); + _debug("manage %pU %u", &server->uuid, active); - ASSERTCMP(usage, >=, 1); - ASSERTIFCMP(purging, usage, ==, 1); + ASSERTIFCMP(purging, active, ==, 0); - if (usage == 1) { - time64_t expire_at = server->put_time; + if (active == 0) { + time64_t expire_at = server->unuse_time; if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) @@ -532,7 +580,8 @@ static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct a _enter(""); - trace_afs_server(server, atomic_read(&server->usage), afs_server_trace_update); + trace_afs_server(server, atomic_read(&server->ref), atomic_read(&server->active), + afs_server_trace_update); alist = afs_vl_lookup_addrs(fc->vnode->volume->cell, fc->key, &server->uuid); diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index f567732df5cc..b77e50f62459 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -16,8 +16,8 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) if (slist && 
refcount_dec_and_test(&slist->usage)) { for (i = 0; i < slist->nr_servers; i++) { afs_put_cb_interest(net, slist->servers[i].cb_interest); - afs_put_server(net, slist->servers[i].server, - afs_server_trace_put_slist); + afs_unuse_server(net, slist->servers[i].server, + afs_server_trace_put_slist); } kfree(slist); } diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index c612cabbc378..f9691f69b2d6 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -33,6 +33,7 @@ enum afs_server_trace { afs_server_trace_destroy, afs_server_trace_free, afs_server_trace_gc, + afs_server_trace_get_by_addr, afs_server_trace_get_by_uuid, afs_server_trace_get_caps, afs_server_trace_get_install, @@ -241,6 +242,7 @@ enum afs_cb_break_reason { EM(afs_server_trace_destroy, "DESTROY ") \ EM(afs_server_trace_free, "FREE ") \ EM(afs_server_trace_gc, "GC ") \ + EM(afs_server_trace_get_by_addr, "GET addr ") \ EM(afs_server_trace_get_by_uuid, "GET uuid ") \ EM(afs_server_trace_get_caps, "GET caps ") \ EM(afs_server_trace_get_install, "GET inst ") \ @@ -1271,26 +1273,30 @@ TRACE_EVENT(afs_cb_miss, ); TRACE_EVENT(afs_server, - TP_PROTO(struct afs_server *server, int usage, enum afs_server_trace reason), + TP_PROTO(struct afs_server *server, int ref, int active, + enum afs_server_trace reason), - TP_ARGS(server, usage, reason), + TP_ARGS(server, ref, active, reason), TP_STRUCT__entry( __field(unsigned int, server ) - __field(int, usage ) + __field(int, ref ) + __field(int, active ) __field(int, reason ) ), TP_fast_assign( __entry->server = server->debug_id; - __entry->usage = usage; + __entry->ref = ref; + __entry->active = active; __entry->reason = reason; ), - TP_printk("s=%08x %s u=%d", + TP_printk("s=%08x %s u=%d a=%d", __entry->server, __print_symbolic(__entry->reason, afs_server_traces), - __entry->usage) + __entry->ref, + __entry->active) ); #endif /* _TRACE_AFS_H */ From f6cbb368bcb0bc4fa7c11554d5293658bb4b26a2 Mon Sep 17 00:00:00 2001 From: David 
Howells Date: Fri, 24 Apr 2020 15:10:00 +0100 Subject: [PATCH 237/427] afs: Actively poll fileservers to maintain NAT or firewall openings When an AFS client accesses a file, it receives a limited-duration callback promise that the server will notify it if another client changes a file. This callback duration can be a few hours in length. If a client mounts a volume and then an application prevents it from being unmounted, say by chdir'ing into it, but then does nothing for some time, the rxrpc_peer record will expire and rxrpc-level keepalive will cease. If there is NAT or a firewall between the client and the server, the route back for the server may close after a comparatively short duration, meaning that attempts by the server to notify the client may then bounce. The client, however, may (so far as it knows) still have a valid unexpired promise and will then rely on its cached data and will not see changes made on the server by a third party until it incidentally rechecks the status or the promise needs renewal. To deal with this, the client needs to regularly probe the server. This has two effects: firstly, it keeps a route open back for the server, and secondly, it causes the server to disgorge any notifications that got queued up because they couldn't be sent. Fix this by adding a mechanism to emit regular probes. Two levels of probing are made available: Under normal circumstances the 'slow' queue will be used for a fileserver - this just probes the preferred address once every 5 mins or so; however, if the server fails to respond to any probes, the server will shift to the 'fast' queue from which all its interfaces will be probed every 30s. When it finally responds, the record will switch back to the slow queue. Further notes: (1) Probing is now no longer driven from the fileserver rotation algorithm. (2) Probes are dispatched to all interfaces on a fileserver when an afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start to probe it. afs_is_probing_server() returns true if it's not listed - ie. it's undergoing probing. (4) The afs_server object is added back on to the probe queue when the final outstanding probe completes, but the probed_at time is set when we're about to launch a probe so that it's not dependent on the probe duration. (5) The timer and the work item added for this must be handed a count on net->servers_outstanding, which they hand on or release. This makes sure that network namespace cleanup waits for them. Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation") Reported-by: Dave Botsch Signed-off-by: David Howells --- fs/afs/cmservice.c | 2 +- fs/afs/fs_probe.c | 279 ++++++++++++++++++++++++++++--------- fs/afs/fsclient.c | 19 ++- fs/afs/internal.h | 41 ++++-- fs/afs/main.c | 5 +- fs/afs/rotate.c | 7 +- fs/afs/server.c | 19 ++- fs/afs/volume.c | 22 +-- include/trace/events/afs.h | 4 + 9 files changed, 280 insertions(+), 118 deletions(-) diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 7dcbca3bf828..7ae88958051f 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -157,7 +157,7 @@ static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server) _enter(""); if (test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags) && - !test_bit(AFS_SERVER_FL_PROBING, &server->flags)) { + !afs_is_probing_server(server)) { if (server->cm_epoch == call->epoch) return 0; diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index d37d78eb84bd..442b5e7944ff 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* AFS fileserver probing * - * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2018, 2020 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) */ @@ -11,14 +11,83 @@ #include "internal.h" #include "protocol_yfs.h" -static bool afs_fs_probe_done(struct afs_server *server) -{ - if (!atomic_dec_and_test(&server->probe_outstanding)) - return false; +static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ; +static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ; - clear_bit_unlock(AFS_SERVER_FL_PROBING, &server->flags); - wake_up_bit(&server->flags, AFS_SERVER_FL_PROBING); - return true; +/* + * Start the probe polling timer. We have to supply it with an inc on the + * outstanding server count. + */ +static void afs_schedule_fs_probe(struct afs_net *net, + struct afs_server *server, bool fast) +{ + unsigned long atj; + + if (!net->live) + return; + + atj = server->probed_at; + atj += fast ? afs_fs_probe_fast_poll_interval : afs_fs_probe_slow_poll_interval; + + afs_inc_servers_outstanding(net); + if (timer_reduce(&net->fs_probe_timer, atj)) + afs_dec_servers_outstanding(net); +} + +/* + * Handle the completion of a set of probes. + */ +static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server) +{ + bool responded = server->probe.responded; + + write_seqlock(&net->fs_lock); + if (responded) + list_add_tail(&server->probe_link, &net->fs_probe_slow); + else + list_add_tail(&server->probe_link, &net->fs_probe_fast); + write_sequnlock(&net->fs_lock); + + afs_schedule_fs_probe(net, server, !responded); +} + +/* + * Handle the completion of a probe. + */ +static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server) +{ + _enter(""); + + if (atomic_dec_and_test(&server->probe_outstanding)) + afs_finished_fs_probe(net, server); + + wake_up_all(&server->probe_wq); +} + +/* + * Handle inability to send a probe due to ENOMEM when trying to allocate a + * call struct. 
+ */ +static void afs_fs_probe_not_done(struct afs_net *net, + struct afs_server *server, + struct afs_addr_cursor *ac) +{ + struct afs_addr_list *alist = ac->alist; + unsigned int index = ac->index; + + _enter(""); + + trace_afs_io_error(0, -ENOMEM, afs_io_error_fs_probe_fail); + spin_lock(&server->probe_lock); + + server->probe.local_failure = true; + if (server->probe.error == 0) + server->probe.error = -ENOMEM; + + set_bit(index, &alist->failed); + + spin_unlock(&server->probe_lock); + return afs_done_one_fs_probe(net, server); } /* @@ -29,10 +98,8 @@ void afs_fileserver_probe_result(struct afs_call *call) { struct afs_addr_list *alist = call->alist; struct afs_server *server = call->server; - unsigned int server_index = call->server_index; unsigned int index = call->addr_ix; unsigned int rtt_us = 0; - bool have_result = false; int ret = call->error; _enter("%pU,%u", &server->uuid, index); @@ -51,8 +118,9 @@ void afs_fileserver_probe_result(struct afs_call *call) goto responded; case -ENOMEM: case -ENONET: + clear_bit(index, &alist->responded); server->probe.local_failure = true; - afs_io_error(call, afs_io_error_fs_probe_fail); + trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. */ case -ERFKILL: @@ -71,12 +139,11 @@ void afs_fileserver_probe_result(struct afs_call *call) server->probe.error == -ETIMEDOUT || server->probe.error == -ETIME)) server->probe.error = ret; - afs_io_error(call, afs_io_error_fs_probe_fail); + trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail); goto out; } responded: - set_bit(index, &alist->responded); clear_bit(index, &alist->failed); if (call->service_id == YFS_FS_SERVICE) { @@ -95,38 +162,31 @@ responded: if (rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; alist->preferred = index; - have_result = true; } smp_wmb(); /* Set rtt before responded. 
*/ server->probe.responded = true; - set_bit(AFS_SERVER_FL_PROBED, &server->flags); + set_bit(index, &alist->responded); out: spin_unlock(&server->probe_lock); - _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", - server_index, index, &alist->addrs[index].transport, rtt_us, ret); + _debug("probe %pU [%u] %pISpc rtt=%u ret=%d", + &server->uuid, index, &alist->addrs[index].transport, + rtt_us, ret); - have_result |= afs_fs_probe_done(server); - if (have_result) - wake_up_all(&server->probe_wq); + return afs_done_one_fs_probe(call->net, server); } /* - * Probe all of a fileserver's addresses to find out the best route and to - * query its capabilities. + * Probe one or all of a fileserver's addresses to find out the best route and + * to query its capabilities. */ -static int afs_do_probe_fileserver(struct afs_net *net, - struct afs_server *server, - struct key *key, - unsigned int server_index, - struct afs_error *_e) +void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct key *key, bool all) { struct afs_addr_cursor ac = { .index = 0, }; - struct afs_call *call; - bool in_progress = false; _enter("%pU", &server->uuid); @@ -136,50 +196,25 @@ static int afs_do_probe_fileserver(struct afs_net *net, afs_get_addrlist(ac.alist); read_unlock(&server->fs_lock); - atomic_set(&server->probe_outstanding, ac.alist->nr_addrs); + server->probed_at = jiffies; + atomic_set(&server->probe_outstanding, all ? 
ac.alist->nr_addrs : 1); memset(&server->probe, 0, sizeof(server->probe)); server->probe.rtt = UINT_MAX; - for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { - call = afs_fs_get_capabilities(net, server, &ac, key, server_index); - if (!IS_ERR(call)) { - afs_put_call(call); - in_progress = true; - } else { - afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code); - } + ac.index = ac.alist->preferred; + if (ac.index < 0 || ac.index >= ac.alist->nr_addrs) + all = true; + + if (all) { + for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) + if (!afs_fs_get_capabilities(net, server, &ac, key)) + afs_fs_probe_not_done(net, server, &ac); + } else { + if (!afs_fs_get_capabilities(net, server, &ac, key)) + afs_fs_probe_not_done(net, server, &ac); } - if (!in_progress) - afs_fs_probe_done(server); afs_put_addrlist(ac.alist); - return in_progress; -} - -/* - * Send off probes to all unprobed servers. - */ -int afs_probe_fileservers(struct afs_net *net, struct key *key, - struct afs_server_list *list) -{ - struct afs_server *server; - struct afs_error e; - bool in_progress = false; - int i; - - e.error = 0; - e.responded = false; - for (i = 0; i < list->nr_servers; i++) { - server = list->servers[i].server; - if (test_bit(AFS_SERVER_FL_PROBED, &server->flags)) - continue; - - if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags) && - afs_do_probe_fileserver(net, server, key, i, &e)) - in_progress = true; - } - - return in_progress ? 
0 : e.error; } /* @@ -199,7 +234,7 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) for (i = 0; i < slist->nr_servers; i++) { if (test_bit(i, &untried)) { server = slist->servers[i].server; - if (!test_bit(AFS_SERVER_FL_PROBING, &server->flags)) + if (!atomic_read(&server->probe_outstanding)) __clear_bit(i, &untried); if (server->probe.responded) have_responders = true; @@ -229,7 +264,7 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) server = slist->servers[i].server; if (server->probe.responded) goto stop; - if (test_bit(AFS_SERVER_FL_PROBING, &server->flags)) + if (atomic_read(&server->probe_outstanding)) still_probing = true; } } @@ -264,3 +299,109 @@ stop: slist->preferred = pref; return 0; } + +/* + * Probe timer. We have an increment on fs_outstanding that we need to pass + * along to the work item. + */ +void afs_fs_probe_timer(struct timer_list *timer) +{ + struct afs_net *net = container_of(timer, struct afs_net, fs_probe_timer); + + if (!queue_work(afs_wq, &net->fs_prober)) + afs_dec_servers_outstanding(net); +} + +/* + * Dispatch a probe to a server. + */ +static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server, bool all) + __releases(&net->fs_lock) +{ + struct key *key = NULL; + + /* We remove it from the queues here - it will be added back to + * one of the queues on the completion of the probe. + */ + list_del_init(&server->probe_link); + + afs_get_server(server, afs_server_trace_get_probe); + write_sequnlock(&net->fs_lock); + + afs_fs_probe_fileserver(net, server, key, all); + afs_put_server(net, server, afs_server_trace_put_probe); +} + +/* + * Probe dispatcher to regularly dispatch probes to keep NAT alive. 
+ */ +void afs_fs_probe_dispatcher(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, fs_prober); + struct afs_server *fast, *slow, *server; + unsigned long nowj, timer_at, poll_at; + bool first_pass = true, set_timer = false; + + if (!net->live) + return; + + _enter(""); + + if (list_empty(&net->fs_probe_fast) && list_empty(&net->fs_probe_slow)) { + _leave(" [none]"); + return; + } + +again: + write_seqlock(&net->fs_lock); + + fast = slow = server = NULL; + nowj = jiffies; + timer_at = nowj + MAX_JIFFY_OFFSET; + + if (!list_empty(&net->fs_probe_fast)) { + fast = list_first_entry(&net->fs_probe_fast, struct afs_server, probe_link); + poll_at = fast->probed_at + afs_fs_probe_fast_poll_interval; + if (time_before(nowj, poll_at)) { + timer_at = poll_at; + set_timer = true; + fast = NULL; + } + } + + if (!list_empty(&net->fs_probe_slow)) { + slow = list_first_entry(&net->fs_probe_slow, struct afs_server, probe_link); + poll_at = slow->probed_at + afs_fs_probe_slow_poll_interval; + if (time_before(nowj, poll_at)) { + if (time_before(poll_at, timer_at)) + timer_at = poll_at; + set_timer = true; + slow = NULL; + } + } + + server = fast ?: slow; + if (server) + _debug("probe %pU", &server->uuid); + + if (server && (first_pass || !need_resched())) { + afs_dispatch_fs_probe(net, server, server == fast); + first_pass = false; + goto again; + } + + write_sequnlock(&net->fs_lock); + + if (server) { + if (!queue_work(afs_wq, &net->fs_prober)) + afs_dec_servers_outstanding(net); + _leave(" [requeue]"); + } else if (set_timer) { + if (timer_reduce(&net->fs_probe_timer, timer_at)) + afs_dec_servers_outstanding(net); + _leave(" [timer]"); + } else { + afs_dec_servers_outstanding(net); + _leave(" [quiesce]"); + } +} diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 3854d16e14b1..401de063996c 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1905,14 +1905,13 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { }; /* - * 
Probe a fileserver for the capabilities that it supports. This can - * return up to 196 words. + * Probe a fileserver for the capabilities that it supports. This RPC can + * reply with up to 196 words. The operation is asynchronous and if we managed + * to allocate a call, true is returned the result is delivered through the + * ->done() - otherwise we return false to indicate we didn't even try. */ -struct afs_call *afs_fs_get_capabilities(struct afs_net *net, - struct afs_server *server, - struct afs_addr_cursor *ac, - struct key *key, - unsigned int server_index) +bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, + struct afs_addr_cursor *ac, struct key *key) { struct afs_call *call; __be32 *bp; @@ -1921,11 +1920,10 @@ struct afs_call *afs_fs_get_capabilities(struct afs_net *net, call = afs_alloc_flat_call(net, &afs_RXFSGetCapabilities, 1 * 4, 16 * 4); if (!call) - return ERR_PTR(-ENOMEM); + return false; call->key = key; call->server = afs_use_server(server, afs_server_trace_get_caps); - call->server_index = server_index; call->upgrade = true; call->async = true; call->max_lifespan = AFS_PROBE_MAX_LIFESPAN; @@ -1936,7 +1934,8 @@ struct afs_call *afs_fs_get_capabilities(struct afs_net *net, trace_afs_make_fs_call(call, NULL); afs_make_call(ac, call, GFP_NOFS); - return call; + afs_put_call(call); + return true; } /* diff --git a/fs/afs/internal.h b/fs/afs/internal.h index cb70e1c234cc..61320a632e15 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -90,7 +90,6 @@ struct afs_addr_list { unsigned char nr_ipv4; /* Number of IPv4 addresses */ enum dns_record_source source:8; enum dns_lookup_status status:8; - unsigned long probed; /* Mask of servers that have been probed */ unsigned long failed; /* Mask of addrs that failed locally/ICMP */ unsigned long responded; /* Mask of addrs that responded */ struct sockaddr_rxrpc addrs[]; @@ -299,9 +298,10 @@ struct afs_net { * cell, but in practice, people create aliases and subsets and 
there's * no easy way to distinguish them. */ - seqlock_t fs_lock; /* For fs_servers */ + seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */ struct rb_root fs_servers; /* afs_server (by server UUID or address) */ - struct list_head fs_updates; /* afs_server (by update_at) */ + struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */ + struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ struct hlist_head fs_addresses4; /* afs_server (by lowest IPv4 addr) */ @@ -310,6 +310,9 @@ struct afs_net { struct work_struct fs_manager; struct timer_list fs_timer; + + struct work_struct fs_prober; + struct timer_list fs_probe_timer; atomic_t servers_outstanding; /* File locking renewal management */ @@ -493,7 +496,8 @@ struct afs_server { }; struct afs_addr_list __rcu *addresses; - struct rb_node uuid_rb; /* Link in net->servers */ + struct rb_node uuid_rb; /* Link in net->fs_servers */ + struct list_head probe_link; /* Link in net->fs_probe_list */ struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ struct hlist_node proc_link; /* Link in net->fs_proc */ @@ -504,8 +508,6 @@ struct afs_server { #define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */ #define AFS_SERVER_FL_VL_FAIL 3 /* Failed to access VL server */ #define AFS_SERVER_FL_UPDATING 4 -#define AFS_SERVER_FL_PROBED 5 /* The fileserver has been probed */ -#define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */ #define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ #define AFS_SERVER_FL_IS_YFS 9 /* Server is YFS not AFS */ @@ -527,6 +529,7 @@ struct afs_server { rwlock_t cb_break_lock; /* Volume finding lock */ /* Probe state */ + unsigned long probed_at; /* Time last probe was dispatched (jiffies) 
*/ wait_queue_head_t probe_wq; atomic_t probe_outstanding; spinlock_t probe_lock; @@ -956,7 +959,6 @@ extern int afs_flock(struct file *, int, struct file_lock *); */ extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_status_cb *, struct afs_volsync *); -extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *); extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_status_cb *, struct afs_read *); extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *); @@ -978,9 +980,8 @@ extern int afs_fs_extend_lock(struct afs_fs_cursor *, struct afs_status_cb *); extern int afs_fs_release_lock(struct afs_fs_cursor *, struct afs_status_cb *); extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); -extern struct afs_call *afs_fs_get_capabilities(struct afs_net *, struct afs_server *, - struct afs_addr_cursor *, struct key *, - unsigned int); +extern bool afs_fs_get_capabilities(struct afs_net *, struct afs_server *, + struct afs_addr_cursor *, struct key *); extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *, struct afs_fid *, struct afs_status_cb *, unsigned int, struct afs_volsync *); @@ -1001,8 +1002,9 @@ extern int afs_fs_store_acl(struct afs_fs_cursor *, const struct afs_acl *, * fs_probe.c */ extern void afs_fileserver_probe_result(struct afs_call *); -extern int afs_probe_fileservers(struct afs_net *, struct key *, struct afs_server_list *); +extern void afs_fs_probe_fileserver(struct afs_net *, struct afs_server *, struct key *, bool); extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long); +extern void afs_fs_probe_dispatcher(struct work_struct *); /* * inode.c @@ -1251,9 +1253,26 @@ extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum extern void afs_put_server(struct afs_net *, struct afs_server *, enum 
afs_server_trace); extern void afs_manage_servers(struct work_struct *); extern void afs_servers_timer(struct timer_list *); +extern void afs_fs_probe_timer(struct timer_list *); extern void __net_exit afs_purge_servers(struct afs_net *); extern bool afs_check_server_record(struct afs_fs_cursor *, struct afs_server *); +static inline void afs_inc_servers_outstanding(struct afs_net *net) +{ + atomic_inc(&net->servers_outstanding); +} + +static inline void afs_dec_servers_outstanding(struct afs_net *net) +{ + if (atomic_dec_and_test(&net->servers_outstanding)) + wake_up_var(&net->servers_outstanding); +} + +static inline bool afs_is_probing_server(struct afs_server *server) +{ + return list_empty(&server->probe_link); +} + /* * server_list.c */ diff --git a/fs/afs/main.c b/fs/afs/main.c index c9c45d7078bd..56b52f8dbf15 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -87,7 +87,8 @@ static int __net_init afs_net_init(struct net *net_ns) seqlock_init(&net->fs_lock); net->fs_servers = RB_ROOT; - INIT_LIST_HEAD(&net->fs_updates); + INIT_LIST_HEAD(&net->fs_probe_fast); + INIT_LIST_HEAD(&net->fs_probe_slow); INIT_HLIST_HEAD(&net->fs_proc); INIT_HLIST_HEAD(&net->fs_addresses4); @@ -96,6 +97,8 @@ static int __net_init afs_net_init(struct net *net_ns) INIT_WORK(&net->fs_manager, afs_manage_servers); timer_setup(&net->fs_timer, afs_servers_timer, 0); + INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher); + timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0); ret = -ENOMEM; sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 2a3305e42b14..46b68da89faa 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -349,9 +349,6 @@ start: goto failed; _debug("__ VOL %llx __", vnode->volume->vid); - error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list); - if (error < 0) - goto failed_set_error; pick_server: _debug("pick [%lx]", fc->untried); @@ -596,8 +593,8 @@ static void afs_dump_edestaddrreq(const struct 
afs_fs_cursor *fc) a->version, a->nr_ipv4, a->nr_addrs, a->max_addrs, a->preferred); - pr_notice("FC: - pr=%lx R=%lx F=%lx\n", - a->probed, a->responded, a->failed); + pr_notice("FC: - R=%lx F=%lx\n", + a->responded, a->failed); if (a == fc->ac.alist) pr_notice("FC: - current\n"); } diff --git a/fs/afs/server.c b/fs/afs/server.c index 4969a681f8f5..3f707b5ecb62 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -14,17 +14,6 @@ static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */ static atomic_t afs_server_debug_id; -static void afs_inc_servers_outstanding(struct afs_net *net) -{ - atomic_inc(&net->servers_outstanding); -} - -static void afs_dec_servers_outstanding(struct afs_net *net) -{ - if (atomic_dec_and_test(&net->servers_outstanding)) - wake_up_var(&net->servers_outstanding); -} - static struct afs_server *afs_maybe_use_server(struct afs_server *, enum afs_server_trace); static void __afs_put_server(struct afs_net *, struct afs_server *); @@ -226,6 +215,7 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, INIT_HLIST_HEAD(&server->cb_volumes); rwlock_init(&server->cb_break_lock); init_waitqueue_head(&server->probe_wq); + INIT_LIST_HEAD(&server->probe_link); spin_lock_init(&server->probe_lock); afs_inc_servers_outstanding(net); @@ -295,6 +285,12 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, if (server != candidate) { afs_put_addrlist(alist); kfree(candidate); + } else { + /* Immediately dispatch an asynchronous probe to each interface + * on the fileserver. This will make sure the repeat-probing + * service is started. 
+ */ + afs_fs_probe_fileserver(cell->net, server, key, true); } return server; @@ -464,6 +460,7 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) trace_afs_server(server, atomic_read(&server->ref), active, afs_server_trace_gc); rb_erase(&server->uuid_rb, &net->fs_servers); + list_del(&server->probe_link); hlist_del_rcu(&server->proc_link); if (!hlist_unhashed(&server->addr4_link)) hlist_del_rcu(&server->addr4_link); diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 4310336b9bb8..249000195f8a 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -266,7 +266,6 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) } volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; - clear_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); write_unlock(&volume->servers_lock); ret = 0; @@ -283,23 +282,25 @@ error: */ int afs_check_volume_status(struct afs_volume *volume, struct afs_fs_cursor *fc) { - time64_t now = ktime_get_real_seconds(); int ret, retries = 0; _enter(""); - if (volume->update_at <= now) - set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); - retry: - if (!test_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags) && - !test_bit(AFS_VOLUME_WAIT, &volume->flags)) { - _leave(" = 0"); - return 0; - } + if (test_bit(AFS_VOLUME_WAIT, &volume->flags)) + goto wait; + if (volume->update_at <= ktime_get_real_seconds() || + test_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags)) + goto update; + _leave(" = 0"); + return 0; +update: if (!test_and_set_bit_lock(AFS_VOLUME_UPDATING, &volume->flags)) { + clear_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); ret = afs_update_volume_status(volume, fc->key); + if (ret < 0) + set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); clear_bit_unlock(AFS_VOLUME_WAIT, &volume->flags); clear_bit_unlock(AFS_VOLUME_UPDATING, &volume->flags); wake_up_bit(&volume->flags, AFS_VOLUME_WAIT); @@ -307,6 +308,7 @@ retry: return ret; } +wait: if (!test_bit(AFS_VOLUME_WAIT, &volume->flags)) { _leave(" 
= 0 [no wait]"); return 0; diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index f9691f69b2d6..19a07fbf35df 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -38,10 +38,12 @@ enum afs_server_trace { afs_server_trace_get_caps, afs_server_trace_get_install, afs_server_trace_get_new_cbi, + afs_server_trace_get_probe, afs_server_trace_give_up_cb, afs_server_trace_put_call, afs_server_trace_put_cbi, afs_server_trace_put_find_rsq, + afs_server_trace_put_probe, afs_server_trace_put_slist, afs_server_trace_put_slist_isort, afs_server_trace_put_uuid_rsq, @@ -247,10 +249,12 @@ enum afs_cb_break_reason { EM(afs_server_trace_get_caps, "GET caps ") \ EM(afs_server_trace_get_install, "GET inst ") \ EM(afs_server_trace_get_new_cbi, "GET cbi ") \ + EM(afs_server_trace_get_probe, "GET probe") \ EM(afs_server_trace_give_up_cb, "giveup-cb") \ EM(afs_server_trace_put_call, "PUT call ") \ EM(afs_server_trace_put_cbi, "PUT cbi ") \ EM(afs_server_trace_put_find_rsq, "PUT f-rsq") \ + EM(afs_server_trace_put_probe, "PUT probe") \ EM(afs_server_trace_put_slist, "PUT slist") \ EM(afs_server_trace_put_slist_isort, "PUT isort") \ EM(afs_server_trace_put_uuid_rsq, "PUT u-req") \ From 6d043a578265e8c24384648f9c74c8874b429f28 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Apr 2020 22:34:12 +0100 Subject: [PATCH 238/427] afs: Show more information in /proc/net/afs/servers Show more information in /proc/net/afs/servers to make it easier to see what's going on with the server probing. 
Signed-off-by: David Howells --- fs/afs/proc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 9bce7898cd7d..1d21465a4108 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -378,21 +378,22 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) int i; if (v == SEQ_START_TOKEN) { - seq_puts(m, "UUID REF ACT ADDR\n"); + seq_puts(m, "UUID REF ACT\n"); return 0; } server = list_entry(v, struct afs_server, proc_link); alist = rcu_dereference(server->addresses); - seq_printf(m, "%pU %3d %3d %pISpc%s\n", + seq_printf(m, "%pU %3d %3d\n", &server->uuid, atomic_read(&server->ref), - atomic_read(&server->active), - &alist->addrs[0].transport, - alist->preferred == 0 ? "*" : ""); - for (i = 1; i < alist->nr_addrs; i++) - seq_printf(m, " %pISpc%s\n", - &alist->addrs[i].transport, + atomic_read(&server->active)); + seq_printf(m, " - ALIST v=%u osp=%u r=%lx f=%lx\n", + alist->version, atomic_read(&server->probe_outstanding), + alist->responded, alist->failed); + for (i = 0; i < alist->nr_addrs; i++) + seq_printf(m, " [%x] %pISpc%s\n", + i, &alist->addrs[i].transport, alist->preferred == i ? "*" : ""); return 0; } From 8230fd8217b7ea76f838ae88e4a5a8e54f37099f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 27 Mar 2020 15:02:44 +0000 Subject: [PATCH 239/427] afs: Make callback processing more efficient. afs_vol_interest objects represent the volume IDs currently being accessed from a fileserver. These hold lists of afs_cb_interest objects that repesent the superblocks using that volume ID on that server. When a callback notification from the server telling of a modification by another client arrives, the volume ID specified in the notification is looked up in the server's afs_vol_interest list. Through the afs_cb_interest list, the relevant superblocks can be iterated over and the specific inode looked up and marked in each one. 
Make the following efficiency improvements: (1) Hold rcu_read_lock() over the entire processing rather than locking it each time. (2) Do all the callbacks for each vid together rather than individually. Each volume then only needs to be looked up once. (3) afs_vol_interest objects are now stored in an rb_tree rather than a flat list to reduce the lookup step count. (4) afs_vol_interest lookup is now done with RCU, but because it's in an rb_tree which may rotate under us, a seqlock is used so that if it changes during the walk, we repeat the walk with a lock held. With this and the preceding patch which adds RCU-based lookups in the inode cache, target volumes/vnodes can be taken without the need to take any locks, except on the target itself. Signed-off-by: David Howells --- fs/afs/callback.c | 150 +++++++++++++++++++++++++++++----------------- fs/afs/internal.h | 6 +- fs/afs/server.c | 4 +- 3 files changed, 100 insertions(+), 60 deletions(-) diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 0dcbd40732d1..b16781e1683e 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -28,7 +28,7 @@ static struct afs_cb_interest *afs_create_interest(struct afs_server *server, { struct afs_vol_interest *new_vi, *vi; struct afs_cb_interest *new; - struct hlist_node **pp; + struct rb_node *parent, **pp; new_vi = kzalloc(sizeof(struct afs_vol_interest), GFP_KERNEL); if (!new_vi) @@ -42,7 +42,6 @@ static struct afs_cb_interest *afs_create_interest(struct afs_server *server, new_vi->usage = 1; new_vi->vid = vnode->volume->vid; - INIT_HLIST_NODE(&new_vi->srv_link); INIT_HLIST_HEAD(&new_vi->cb_interests); refcount_set(&new->usage, 1); @@ -51,31 +50,31 @@ static struct afs_cb_interest *afs_create_interest(struct afs_server *server, new->server = afs_get_server(server, afs_server_trace_get_new_cbi); INIT_HLIST_NODE(&new->cb_vlink); - write_lock(&server->cb_break_lock); + write_seqlock(&server->cb_break_lock); - for (pp = &server->cb_volumes.first; *pp; pp = &(*pp)->next) { - vi 
= hlist_entry(*pp, struct afs_vol_interest, srv_link); - if (vi->vid < new_vi->vid) - continue; - if (vi->vid > new_vi->vid) - break; - vi->usage++; - goto found_vi; + pp = &server->cb_volumes.rb_node; + while ((parent = *pp)) { + vi = rb_entry(parent, struct afs_vol_interest, srv_node); + if (vi->vid < new_vi->vid) { + pp = &(*pp)->rb_left; + } else if (vi->vid > new_vi->vid) { + pp = &(*pp)->rb_right; + } else { + vi->usage++; + goto found_vi; + } } - new_vi->srv_link.pprev = pp; - new_vi->srv_link.next = *pp; - if (*pp) - (*pp)->pprev = &new_vi->srv_link.next; - *pp = &new_vi->srv_link; vi = new_vi; new_vi = NULL; -found_vi: + rb_link_node_rcu(&vi->srv_node, parent, pp); + rb_insert_color(&vi->srv_node, &server->cb_volumes); +found_vi: new->vol_interest = vi; hlist_add_head(&new->cb_vlink, &vi->cb_interests); - write_unlock(&server->cb_break_lock); + write_sequnlock(&server->cb_break_lock); kfree(new_vi); return new; } @@ -182,17 +181,17 @@ void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi) if (cbi && refcount_dec_and_test(&cbi->usage)) { if (!hlist_unhashed(&cbi->cb_vlink)) { - write_lock(&cbi->server->cb_break_lock); + write_seqlock(&cbi->server->cb_break_lock); hlist_del_init(&cbi->cb_vlink); vi = cbi->vol_interest; cbi->vol_interest = NULL; if (--vi->usage == 0) - hlist_del(&vi->srv_link); + rb_erase(&vi->srv_node, &cbi->server->cb_volumes); else vi = NULL; - write_unlock(&cbi->server->cb_break_lock); + write_sequnlock(&cbi->server->cb_break_lock); if (vi) kfree_rcu(vi, rcu); afs_put_server(net, cbi->server, afs_server_trace_put_cbi); @@ -237,6 +236,45 @@ void afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reason write_sequnlock(&vnode->cb_lock); } +/* + * Look up a volume interest by volume ID under RCU conditions. 
+ */ +static struct afs_vol_interest *afs_lookup_vol_interest_rcu(struct afs_server *server, + afs_volid_t vid) +{ + struct afs_vol_interest *vi = NULL; + struct rb_node *p; + int seq = 0; + + do { + /* Unfortunately, rbtree walking doesn't give reliable results + * under just the RCU read lock, so we have to check for + * changes. + */ + read_seqbegin_or_lock(&server->cb_break_lock, &seq); + + p = rcu_dereference_raw(server->cb_volumes.rb_node); + while (p) { + vi = rb_entry(p, struct afs_vol_interest, srv_node); + + if (vi->vid < vid) + p = rcu_dereference_raw(p->rb_left); + else if (vi->vid > vid) + p = rcu_dereference_raw(p->rb_right); + else + break; + /* We want to repeat the search, this time with the + * lock properly locked. + */ + vi = NULL; + } + + } while (need_seqretry(&server->cb_break_lock, seq)); + + done_seqretry(&server->cb_break_lock, seq); + return vi; +} + /* * allow the fileserver to explicitly break one callback * - happens when @@ -244,37 +282,18 @@ void afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reason * - a lock is released */ static void afs_break_one_callback(struct afs_server *server, - struct afs_fid *fid) + struct afs_fid *fid, + struct afs_vol_interest *vi) { - struct afs_vol_interest *vi; struct afs_cb_interest *cbi; struct afs_iget_data data; struct afs_vnode *vnode; struct inode *inode; - rcu_read_lock(); - read_lock(&server->cb_break_lock); - hlist_for_each_entry(vi, &server->cb_volumes, srv_link) { - if (vi->vid < fid->vid) - continue; - if (vi->vid > fid->vid) { - vi = NULL; - break; - } - //atomic_inc(&vi->usage); - break; - } - - /* TODO: Find all matching volumes if we couldn't match the server and - * break them anyway. - */ - if (!vi) - goto out; - /* Step through all interested superblocks. There may be more than one * because of cell aliasing. 
*/ - hlist_for_each_entry(cbi, &vi->cb_interests, cb_vlink) { + hlist_for_each_entry_rcu(cbi, &vi->cb_interests, cb_vlink) { if (fid->vnode == 0 && fid->unique == 0) { /* The callback break applies to an entire volume. */ struct afs_super_info *as = AFS_FS_S(cbi->sb); @@ -303,10 +322,36 @@ static void afs_break_one_callback(struct afs_server *server, } } } +} -out: - read_unlock(&server->cb_break_lock); - rcu_read_unlock(); +static void afs_break_some_callbacks(struct afs_server *server, + struct afs_callback_break *cbb, + size_t *_count) +{ + struct afs_callback_break *residue = cbb; + struct afs_vol_interest *vi; + afs_volid_t vid = cbb->fid.vid; + size_t i; + + vi = afs_lookup_vol_interest_rcu(server, vid); + + /* TODO: Find all matching volumes if we couldn't match the server and + * break them anyway. + */ + + for (i = *_count; i > 0; cbb++, i--) { + if (cbb->fid.vid == vid) { + _debug("- Fid { vl=%08llx n=%llu u=%u }", + cbb->fid.vid, + cbb->fid.vnode, + cbb->fid.unique); + --*_count; + if (vi) + afs_break_one_callback(server, &cbb->fid, vi); + } else { + *residue++ = *cbb; + } + } } /* @@ -319,17 +364,12 @@ void afs_break_callbacks(struct afs_server *server, size_t count, ASSERT(server != NULL); - /* TODO: Sort the callback break list by volume ID */ + rcu_read_lock(); - for (; count > 0; callbacks++, count--) { - _debug("- Fid { vl=%08llx n=%llu u=%u }", - callbacks->fid.vid, - callbacks->fid.vnode, - callbacks->fid.unique); - afs_break_one_callback(server, &callbacks->fid); - } + while (count > 0) + afs_break_some_callbacks(server, callbacks, &count); - _leave(""); + rcu_read_unlock(); return; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 61320a632e15..b6665fc5d355 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -524,9 +524,9 @@ struct afs_server { rwlock_t fs_lock; /* access lock */ /* callback promise management */ - struct hlist_head cb_volumes; /* List of volume interests on this server */ + struct rb_root cb_volumes; /* List of 
volume interests on this server */ unsigned cb_s_break; /* Break-everything counter. */ - rwlock_t cb_break_lock; /* Volume finding lock */ + seqlock_t cb_break_lock; /* Volume finding lock */ /* Probe state */ unsigned long probed_at; /* Time last probe was dispatched (jiffies) */ @@ -552,7 +552,7 @@ struct afs_server { * Volume collation in the server's callback interest list. */ struct afs_vol_interest { - struct hlist_node srv_link; /* Link in server->cb_volumes */ + struct rb_node srv_node; /* Link in server->cb_volumes */ struct hlist_head cb_interests; /* List of callback interests on the server */ union { struct rcu_head rcu; diff --git a/fs/afs/server.c b/fs/afs/server.c index 3f707b5ecb62..5ed90f419c54 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -212,8 +212,8 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, server->addr_version = alist->version; server->uuid = *uuid; rwlock_init(&server->fs_lock); - INIT_HLIST_HEAD(&server->cb_volumes); - rwlock_init(&server->cb_break_lock); + server->cb_volumes = RB_ROOT; + seqlock_init(&server->cb_break_lock); init_waitqueue_head(&server->probe_wq); INIT_LIST_HEAD(&server->probe_link); spin_lock_init(&server->probe_lock); From 38355eec6a7d2b8f2f313f9174736dc877744e59 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Apr 2020 16:13:20 +0100 Subject: [PATCH 240/427] afs: Set error flag rather than return error from file status decode Set a flag in the call struct to indicate an unmarshalling error rather than return and handle an error from the decoding of file statuses. This flag is checked on a successful return from the delivery function. 
Signed-off-by: David Howells --- fs/afs/fsclient.c | 88 +++++++++++++--------------------------------- fs/afs/internal.h | 1 + fs/afs/rxrpc.c | 4 +++ fs/afs/yfsclient.c | 85 +++++++++++++------------------------------- 4 files changed, 55 insertions(+), 123 deletions(-) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 401de063996c..b1d8d8f780d2 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -56,16 +56,15 @@ static void xdr_dump_bad(const __be32 *bp) /* * decode an AFSFetchStatus block */ -static int xdr_decode_AFSFetchStatus(const __be32 **_bp, - struct afs_call *call, - struct afs_status_cb *scb) +static void xdr_decode_AFSFetchStatus(const __be32 **_bp, + struct afs_call *call, + struct afs_status_cb *scb) { const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp; struct afs_file_status *status = &scb->status; bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus); u64 data_version, size; u32 type, abort_code; - int ret; abort_code = ntohl(xdr->abort_code); @@ -79,7 +78,7 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp, */ status->abort_code = abort_code; scb->have_error = true; - goto good; + goto advance; } pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version)); @@ -89,7 +88,7 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp, if (abort_code != 0 && inline_error) { status->abort_code = abort_code; scb->have_error = true; - goto good; + goto advance; } type = ntohl(xdr->type); @@ -125,15 +124,13 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp, data_version |= (u64)ntohl(xdr->data_version_hi) << 32; status->data_version = data_version; scb->have_status = true; -good: - ret = 0; advance: *_bp = (const void *)*_bp + sizeof(*xdr); - return ret; + return; bad: xdr_dump_bad(*_bp); - ret = afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status); + afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status); goto advance; } @@ -254,9 +251,7 @@ static int 
afs_deliver_fs_fetch_status_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); xdr_decode_AFSCallBack(&bp, call, call->out_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); @@ -419,9 +414,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) return ret; bp = call->buffer; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); xdr_decode_AFSCallBack(&bp, call, call->out_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); @@ -577,12 +570,8 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->out_fid); - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); + xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); xdr_decode_AFSCallBack(&bp, call, call->out_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); @@ -691,9 +680,7 @@ static int afs_deliver_fs_dir_status_and_vol(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); _leave(" = 0 [done]"); @@ -784,12 +771,8 @@ static int afs_deliver_fs_link(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; - ret = xdr_decode_AFSFetchStatus(&bp, call, 
call->out_dir_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); + xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); _leave(" = 0 [done]"); @@ -878,12 +861,8 @@ static int afs_deliver_fs_symlink(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->out_fid); - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); + xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); _leave(" = 0 [done]"); @@ -986,16 +965,12 @@ static int afs_deliver_fs_rename(struct afs_call *call) if (ret < 0) return ret; + bp = call->buffer; /* If the two dirs are the same, we have two copies of the same status * report, so we just decode it twice. 
*/ - bp = call->buffer; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); - if (ret < 0) - return ret; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); + xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); _leave(" = 0 [done]"); @@ -1103,9 +1078,7 @@ static int afs_deliver_fs_store_data(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); _leave(" = 0 [done]"); @@ -1283,9 +1256,7 @@ static int afs_deliver_fs_store_status(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); _leave(" = 0 [done]"); @@ -1952,9 +1923,7 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); xdr_decode_AFSCallBack(&bp, call, call->out_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); @@ -2060,10 +2029,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) bp = call->buffer; scb = &call->out_scb[call->count]; - ret = xdr_decode_AFSFetchStatus(&bp, call, scb); - if (ret < 0) - return ret; - + xdr_decode_AFSFetchStatus(&bp, call, scb); call->count++; if (call->count < call->count2) goto more_counts; @@ -2241,9 +2207,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) return ret; bp = call->buffer; - ret = 
xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); call->unmarshall++; @@ -2324,9 +2288,7 @@ static int afs_deliver_fs_file_status_and_vol(struct afs_call *call) return ret; bp = call->buffer; - ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); xdr_decode_AFSVolSync(&bp, call->out_volsync); _leave(" = 0 [done]"); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index b6665fc5d355..6d5c66dd76de 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -160,6 +160,7 @@ struct afs_call { bool upgrade; /* T to request service upgrade */ bool have_reply_time; /* T if have got reply_time */ bool intr; /* T if interruptible */ + bool unmarshalling_error; /* T if an unmarshalling error occurred */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ u32 operation_ID; /* operation ID for an incoming call */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index ab2962fff1fb..c84d571782d7 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -540,6 +540,8 @@ static void afs_deliver_to_call(struct afs_call *call) ret = call->type->deliver(call); state = READ_ONCE(call->state); + if (ret == 0 && call->unmarshalling_error) + ret = -EBADMSG; switch (ret) { case 0: afs_queue_call_work(call); @@ -963,5 +965,7 @@ noinline int afs_protocol_error(struct afs_call *call, int error, enum afs_eproto_cause cause) { trace_afs_protocol_error(call, error, cause); + if (call) + call->unmarshalling_error = true; return error; } diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index fe413e7a5cf4..f118daa5f33a 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -179,21 +179,20 @@ static void xdr_dump_bad(const __be32 *bp) /* * Decode a YFSFetchStatus block */ -static int xdr_decode_YFSFetchStatus(const __be32 **_bp, - struct 
afs_call *call, - struct afs_status_cb *scb) +static void xdr_decode_YFSFetchStatus(const __be32 **_bp, + struct afs_call *call, + struct afs_status_cb *scb) { const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp; struct afs_file_status *status = &scb->status; u32 type; - int ret; status->abort_code = ntohl(xdr->abort_code); if (status->abort_code != 0) { if (status->abort_code == VNOVNODE) status->nlink = 0; scb->have_error = true; - goto good; + goto advance; } type = ntohl(xdr->type); @@ -221,15 +220,13 @@ static int xdr_decode_YFSFetchStatus(const __be32 **_bp, status->size = xdr_to_u64(xdr->size); status->data_version = xdr_to_u64(xdr->data_version); scb->have_status = true; -good: - ret = 0; advance: *_bp += xdr_size(xdr); - return ret; + return; bad: xdr_dump_bad(*_bp); - ret = afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status); + afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status); goto advance; } @@ -348,9 +345,7 @@ static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); xdr_decode_YFSCallBack(&bp, call, call->out_scb); xdr_decode_YFSVolSync(&bp, call->out_volsync); @@ -372,9 +367,7 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call) return ret; bp = call->buffer; - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); xdr_decode_YFSVolSync(&bp, call->out_volsync); _leave(" = 0 [done]"); @@ -534,9 +527,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) return ret; bp = call->buffer; - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); xdr_decode_YFSCallBack(&bp, call, call->out_scb); 
xdr_decode_YFSVolSync(&bp, call->out_volsync); @@ -644,12 +635,8 @@ static int yfs_deliver_fs_create_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_YFSFid(&bp, call->out_fid); - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); - if (ret < 0) - return ret; + xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); + xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); xdr_decode_YFSCallBack(&bp, call, call->out_scb); xdr_decode_YFSVolSync(&bp, call->out_volsync); @@ -802,14 +789,9 @@ static int yfs_deliver_fs_remove_file2(struct afs_call *call) return ret; bp = call->buffer; - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); - if (ret < 0) - return ret; - + xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); xdr_decode_YFSFid(&bp, &fid); - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); /* Was deleted if vnode->status.abort_code == VNOVNODE. 
*/ xdr_decode_YFSVolSync(&bp, call->out_volsync); @@ -889,10 +871,7 @@ static int yfs_deliver_fs_remove(struct afs_call *call) return ret; bp = call->buffer; - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); - if (ret < 0) - return ret; - + xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); xdr_decode_YFSVolSync(&bp, call->out_volsync); return 0; } @@ -974,12 +953,8 @@ static int yfs_deliver_fs_link(struct afs_call *call) return ret; bp = call->buffer; - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); - if (ret < 0) - return ret; + xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); + xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); xdr_decode_YFSVolSync(&bp, call->out_volsync); _leave(" = 0 [done]"); return 0; @@ -1061,12 +1036,8 @@ static int yfs_deliver_fs_symlink(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_YFSFid(&bp, call->out_fid); - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); - if (ret < 0) - return ret; + xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); + xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); xdr_decode_YFSVolSync(&bp, call->out_volsync); _leave(" = 0 [done]"); @@ -1154,13 +1125,11 @@ static int yfs_deliver_fs_rename(struct afs_call *call) return ret; bp = call->buffer; - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); - if (ret < 0) - return ret; - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; - + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. 
+ */ + xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); + xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); xdr_decode_YFSVolSync(&bp, call->out_volsync); _leave(" = 0 [done]"); return 0; @@ -1845,9 +1814,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) bp = call->buffer; scb = &call->out_scb[call->count]; - ret = xdr_decode_YFSFetchStatus(&bp, call, scb); - if (ret < 0) - return ret; + xdr_decode_YFSFetchStatus(&bp, call, scb); call->count++; if (call->count < call->count2) @@ -2067,9 +2034,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) bp = call->buffer; yacl->inherit_flag = ntohl(*bp++); yacl->num_cleaned = ntohl(*bp++); - ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - if (ret < 0) - return ret; + xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); xdr_decode_YFSVolSync(&bp, call->out_volsync); call->unmarshall++; From 7126ead910aa9fcc9e16e9e7a8c9179658261f1d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Apr 2020 16:49:08 +0100 Subject: [PATCH 241/427] afs: Remove the error argument from afs_protocol_error() Remove the error argument from afs_protocol_error() as it's always -EBADMSG. 
Signed-off-by: David Howells --- fs/afs/cmservice.c | 9 +++------ fs/afs/fsclient.c | 17 ++++++----------- fs/afs/inode.c | 4 ++-- fs/afs/internal.h | 2 +- fs/afs/rxrpc.c | 6 +++--- fs/afs/vlclient.c | 34 ++++++++++++++-------------------- fs/afs/yfsclient.c | 17 ++++++----------- include/trace/events/afs.h | 10 ++++------ 8 files changed, 39 insertions(+), 60 deletions(-) diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 7ae88958051f..ed0fb34d77dd 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -307,8 +307,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->count = ntohl(call->tmp); _debug("FID count: %u", call->count); if (call->count > AFSCBMAX) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_cb_fid_count); + return afs_protocol_error(call, afs_eproto_cb_fid_count); call->buffer = kmalloc(array3_size(call->count, 3, 4), GFP_KERNEL); @@ -353,8 +352,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->count2 = ntohl(call->tmp); _debug("CB count: %u", call->count2); if (call->count2 != call->count && call->count2 != 0) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_cb_count); + return afs_protocol_error(call, afs_eproto_cb_count); call->iter = &call->def_iter; iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4); call->unmarshall++; @@ -674,8 +672,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) call->count = ntohl(call->tmp); _debug("FID count: %u", call->count); if (call->count > YFSCBMAX) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_cb_fid_count); + return afs_protocol_error(call, afs_eproto_cb_fid_count); size = array_size(call->count, sizeof(struct yfs_xdr_YFSFid)); call->buffer = kmalloc(size, GFP_KERNEL); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index b1d8d8f780d2..7d4503174dd1 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -130,7 +130,7 @@ advance: bad: xdr_dump_bad(*_bp); - afs_protocol_error(call, -EBADMSG, 
afs_eproto_bad_status); + afs_protocol_error(call, afs_eproto_bad_status); goto advance; } @@ -1470,8 +1470,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("volname length: %u", call->count); if (call->count >= AFSNAMEMAX) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_volname_len); + return afs_protocol_error(call, afs_eproto_volname_len); size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; @@ -1500,8 +1499,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("offline msg length: %u", call->count); if (call->count >= AFSNAMEMAX) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_offline_msg_len); + return afs_protocol_error(call, afs_eproto_offline_msg_len); size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; @@ -1531,8 +1529,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("motd length: %u", call->count); if (call->count >= AFSNAMEMAX) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_motd_len); + return afs_protocol_error(call, afs_eproto_motd_len); size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; @@ -2012,8 +2009,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) tmp = ntohl(call->tmp); _debug("status count: %u/%u", tmp, call->count2); if (tmp != call->count2) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_ibulkst_count); + return afs_protocol_error(call, afs_eproto_ibulkst_count); call->count = 0; call->unmarshall++; @@ -2049,8 +2045,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) tmp = ntohl(call->tmp); _debug("CB count: %u", tmp); if (tmp != call->count2) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_ibulkst_cb_count); + return 
afs_protocol_error(call, afs_eproto_ibulkst_cb_count); call->count = 0; call->unmarshall++; more_cbs: diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 281470fe1183..07933d106e0e 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -130,7 +130,7 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key, default: dump_vnode(vnode, parent_vnode); write_sequnlock(&vnode->cb_lock); - return afs_protocol_error(NULL, -EBADMSG, afs_eproto_file_type); + return afs_protocol_error(NULL, afs_eproto_file_type); } afs_set_i_size(vnode, status->size); @@ -179,7 +179,7 @@ static void afs_apply_status(struct afs_fs_cursor *fc, vnode->fid.vnode, vnode->fid.unique, status->type, vnode->status.type); - afs_protocol_error(NULL, -EBADMSG, afs_eproto_bad_status); + afs_protocol_error(NULL, afs_eproto_bad_status); return; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 6d5c66dd76de..468bd2b0470d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1133,7 +1133,7 @@ extern void afs_flat_call_destructor(struct afs_call *); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, bool); -extern int afs_protocol_error(struct afs_call *, int, enum afs_eproto_cause); +extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); static inline void afs_set_fc_call(struct afs_call *call, struct afs_fs_cursor *fc) { diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index c84d571782d7..00b87bac4fec 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -961,11 +961,11 @@ int afs_extract_data(struct afs_call *call, bool want_more) /* * Log protocol error production. 
*/ -noinline int afs_protocol_error(struct afs_call *call, int error, +noinline int afs_protocol_error(struct afs_call *call, enum afs_eproto_cause cause) { - trace_afs_protocol_error(call, error, cause); + trace_afs_protocol_error(call, cause); if (call) call->unmarshalling_error = true; - return error; + return -EBADMSG; } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 972dc5512f33..d0c85623ce8f 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -448,8 +448,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) call->count2 = ntohl(*bp); /* Type or next count */ if (call->count > YFS_MAXENDPOINTS) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_yvl_fsendpt_num); + return afs_protocol_error(call, afs_eproto_yvl_fsendpt_num); alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); if (!alist) @@ -469,8 +468,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) size = sizeof(__be32) * (1 + 4 + 1); break; default: - return afs_protocol_error(call, -EBADMSG, - afs_eproto_yvl_fsendpt_type); + return afs_protocol_error(call, afs_eproto_yvl_fsendpt_type); } size += sizeof(__be32); @@ -488,21 +486,20 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_yvl_fsendpt4_len); + return afs_protocol_error( + call, afs_eproto_yvl_fsendpt4_len); afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_yvl_fsendpt6_len); + return afs_protocol_error( + call, afs_eproto_yvl_fsendpt6_len); afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); bp += 6; break; default: - return afs_protocol_error(call, -EBADMSG, - afs_eproto_yvl_fsendpt_type); + return afs_protocol_error(call, afs_eproto_yvl_fsendpt_type); } /* Got either the type of the 
next entry or the count of @@ -520,8 +517,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (!call->count) goto end; if (call->count > YFS_MAXENDPOINTS) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_yvl_vlendpt_type); + return afs_protocol_error(call, afs_eproto_yvl_vlendpt_type); afs_extract_to_buf(call, 1 * sizeof(__be32)); call->unmarshall = 3; @@ -548,8 +544,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) size = sizeof(__be32) * (1 + 4 + 1); break; default: - return afs_protocol_error(call, -EBADMSG, - afs_eproto_yvl_vlendpt_type); + return afs_protocol_error(call, afs_eproto_yvl_vlendpt_type); } if (call->count > 1) @@ -567,19 +562,18 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_yvl_vlendpt4_len); + return afs_protocol_error( + call, afs_eproto_yvl_vlendpt4_len); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_yvl_vlendpt6_len); + return afs_protocol_error( + call, afs_eproto_yvl_vlendpt6_len); bp += 6; break; default: - return afs_protocol_error(call, -EBADMSG, - afs_eproto_yvl_vlendpt_type); + return afs_protocol_error(call, afs_eproto_yvl_vlendpt_type); } /* Got either the type of the next entry or the count of diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index f118daa5f33a..bf74c679c02b 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -226,7 +226,7 @@ advance: bad: xdr_dump_bad(*_bp); - afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status); + afs_protocol_error(call, afs_eproto_bad_status); goto advance; } @@ -1426,8 +1426,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("volname length: %u", call->count); if (call->count >= AFSNAMEMAX) - return 
afs_protocol_error(call, -EBADMSG, - afs_eproto_volname_len); + return afs_protocol_error(call, afs_eproto_volname_len); size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; @@ -1456,8 +1455,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("offline msg length: %u", call->count); if (call->count >= AFSNAMEMAX) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_offline_msg_len); + return afs_protocol_error(call, afs_eproto_offline_msg_len); size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; @@ -1487,8 +1485,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("motd length: %u", call->count); if (call->count >= AFSNAMEMAX) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_motd_len); + return afs_protocol_error(call, afs_eproto_motd_len); size = (call->count + 3) & ~3; /* It's padded */ afs_extract_to_buf(call, size); call->unmarshall++; @@ -1797,8 +1794,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) tmp = ntohl(call->tmp); _debug("status count: %u/%u", tmp, call->count2); if (tmp != call->count2) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_ibulkst_count); + return afs_protocol_error(call, afs_eproto_ibulkst_count); call->count = 0; call->unmarshall++; @@ -1835,8 +1831,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) tmp = ntohl(call->tmp); _debug("CB count: %u", tmp); if (tmp != call->count2) - return afs_protocol_error(call, -EBADMSG, - afs_eproto_ibulkst_cb_count); + return afs_protocol_error(call, afs_eproto_ibulkst_cb_count); call->count = 0; call->unmarshall++; more_cbs: diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 19a07fbf35df..a6d8a9891164 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -994,24 +994,22 @@ 
TRACE_EVENT(afs_edit_dir, ); TRACE_EVENT(afs_protocol_error, - TP_PROTO(struct afs_call *call, int error, enum afs_eproto_cause cause), + TP_PROTO(struct afs_call *call, enum afs_eproto_cause cause), - TP_ARGS(call, error, cause), + TP_ARGS(call, cause), TP_STRUCT__entry( __field(unsigned int, call ) - __field(int, error ) __field(enum afs_eproto_cause, cause ) ), TP_fast_assign( __entry->call = call ? call->debug_id : 0; - __entry->error = error; __entry->cause = cause; ), - TP_printk("c=%08x r=%d %s", - __entry->call, __entry->error, + TP_printk("c=%08x %s", + __entry->call, __print_symbolic(__entry->cause, afs_eproto_causes)) ); From a310082f6d0afe28797e148726cd52118a8a4428 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Mar 2020 09:32:50 +0000 Subject: [PATCH 242/427] afs: Rename struct afs_fs_cursor to afs_operation As a prelude to implementing asynchronous fileserver operations in the afs filesystem, rename struct afs_fs_cursor to afs_operation. This struct is going to form the core of the operation management and is going to acquire more members in later. 
Signed-off-by: David Howells --- fs/afs/dir.c | 22 ++-- fs/afs/dir_silly.c | 4 +- fs/afs/file.c | 2 +- fs/afs/flock.c | 6 +- fs/afs/fsclient.c | 42 +++---- fs/afs/inode.c | 10 +- fs/afs/internal.h | 112 ++++++++--------- fs/afs/rotate.c | 292 ++++++++++++++++++++++----------------------- fs/afs/server.c | 8 +- fs/afs/super.c | 4 +- fs/afs/volume.c | 4 +- fs/afs/write.c | 2 +- fs/afs/xattr.c | 8 +- fs/afs/yfsclient.c | 40 +++---- 14 files changed, 278 insertions(+), 278 deletions(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 3c486340b220..ff421db40cf2 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -643,7 +643,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, struct afs_super_info *as = dir->i_sb->s_fs_info; struct afs_status_cb *scb; struct afs_iget_data iget_data; - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_server *server; struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct inode *inode = NULL, *ti; @@ -1220,7 +1220,7 @@ void afs_d_release(struct dentry *dentry) /* * Create a new inode for create/mkdir/symlink */ -static void afs_vnode_new_inode(struct afs_fs_cursor *fc, +static void afs_vnode_new_inode(struct afs_operation *fc, struct dentry *new_dentry, struct afs_iget_data *new_data, struct afs_status_cb *new_scb) @@ -1248,7 +1248,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, d_instantiate(new_dentry, inode); } -static void afs_prep_for_new_inode(struct afs_fs_cursor *fc, +static void afs_prep_for_new_inode(struct afs_operation *fc, struct afs_iget_data *iget_data) { iget_data->volume = fc->vnode->volume; @@ -1261,7 +1261,7 @@ static void afs_prep_for_new_inode(struct afs_fs_cursor *fc, * number derived from the result of the operation. It doesn't matter if * d_fsdata goes backwards as we'll just revalidate. 
*/ -static void afs_update_dentry_version(struct afs_fs_cursor *fc, +static void afs_update_dentry_version(struct afs_operation *fc, struct dentry *dentry, struct afs_status_cb *scb) { @@ -1277,7 +1277,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct afs_iget_data iget_data; struct afs_status_cb *scb; - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_vnode *dvnode = AFS_FS_I(dir); struct key *key; afs_dataversion_t data_version; @@ -1367,7 +1367,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) static int afs_rmdir(struct inode *dir, struct dentry *dentry) { struct afs_status_cb *scb; - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL; struct key *key; afs_dataversion_t data_version; @@ -1483,7 +1483,7 @@ static int afs_dir_remove_link(struct afs_vnode *dvnode, struct dentry *dentry, */ static int afs_unlink(struct inode *dir, struct dentry *dentry) { - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); @@ -1588,7 +1588,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct afs_iget_data iget_data; - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; struct afs_vnode *dvnode = AFS_FS_I(dir); struct key *key; @@ -1666,7 +1666,7 @@ error: static int afs_link(struct dentry *from, struct inode *dir, struct dentry *dentry) { - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_vnode *vnode = AFS_FS_I(d_inode(from)); @@ -1755,7 +1755,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, const char *content) { struct afs_iget_data iget_data; - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; struct afs_vnode *dvnode = 
AFS_FS_I(dir); struct key *key; @@ -1837,7 +1837,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; struct dentry *tmp = NULL, *rehash = NULL; diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index d94e2b7cddff..0a82b134aa0d 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -19,7 +19,7 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode struct dentry *old, struct dentry *new, struct key *key) { - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; afs_dataversion_t dir_data_version; int ret = -ERESTARTSYS; @@ -145,7 +145,7 @@ out: static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode, struct dentry *dentry, struct key *key) { - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; int ret = -ERESTARTSYS; diff --git a/fs/afs/file.c b/fs/afs/file.c index 8415733f7bc1..0c0ccc1412ee 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -225,7 +225,7 @@ static void afs_file_readpage_read_complete(struct page *page, */ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *req) { - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; int ret; diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 0f2a94ba73cb..682fe745f10e 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -179,7 +179,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key, afs_lock_type_t type) { struct afs_status_cb *scb; - struct afs_fs_cursor fc; + struct afs_operation fc; int ret; _enter("%s{%llx:%llu.%u},%x,%u", @@ -216,7 +216,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key, static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) { struct afs_status_cb *scb; - 
struct afs_fs_cursor fc; + struct afs_operation fc; int ret; _enter("%s{%llx:%llu.%u},%x", @@ -253,7 +253,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) static int afs_release_lock(struct afs_vnode *vnode, struct key *key) { struct afs_status_cb *scb; - struct afs_fs_cursor fc; + struct afs_operation fc; int ret; _enter("%s{%llx:%llu.%u},%x", diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 7d4503174dd1..3e423e9daa24 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -272,7 +272,7 @@ static const struct afs_call_type afs_RXFSFetchStatus_vnode = { /* * fetch the status information for a file */ -int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_status_cb *scb, +int afs_fs_fetch_file_status(struct afs_operation *fc, struct afs_status_cb *scb, struct afs_volsync *volsync) { struct afs_vnode *vnode = fc->vnode; @@ -470,7 +470,7 @@ static const struct afs_call_type afs_RXFSFetchData64 = { /* * fetch data from a very large file */ -static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, +static int afs_fs_fetch_data64(struct afs_operation *fc, struct afs_status_cb *scb, struct afs_read *req) { @@ -511,7 +511,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, /* * fetch data from a file */ -int afs_fs_fetch_data(struct afs_fs_cursor *fc, +int afs_fs_fetch_data(struct afs_operation *fc, struct afs_status_cb *scb, struct afs_read *req) { @@ -599,7 +599,7 @@ static const struct afs_call_type afs_RXFSMakeDir = { /* * create a file or make a directory */ -int afs_fs_create(struct afs_fs_cursor *fc, +int afs_fs_create(struct afs_operation *fc, const char *name, umode_t mode, struct afs_status_cb *dvnode_scb, @@ -707,7 +707,7 @@ static const struct afs_call_type afs_RXFSRemoveDir = { /* * remove a file or directory */ -int afs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode, +int afs_fs_remove(struct afs_operation *fc, struct afs_vnode *vnode, const char *name, bool isdir, struct afs_status_cb 
*dvnode_scb) { struct afs_vnode *dvnode = fc->vnode; @@ -792,7 +792,7 @@ static const struct afs_call_type afs_RXFSLink = { /* * make a hard link */ -int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, +int afs_fs_link(struct afs_operation *fc, struct afs_vnode *vnode, const char *name, struct afs_status_cb *dvnode_scb, struct afs_status_cb *vnode_scb) @@ -882,7 +882,7 @@ static const struct afs_call_type afs_RXFSSymlink = { /* * create a symbolic link */ -int afs_fs_symlink(struct afs_fs_cursor *fc, +int afs_fs_symlink(struct afs_operation *fc, const char *name, const char *contents, struct afs_status_cb *dvnode_scb, @@ -990,7 +990,7 @@ static const struct afs_call_type afs_RXFSRename = { /* * Rename/move a file or directory. */ -int afs_fs_rename(struct afs_fs_cursor *fc, +int afs_fs_rename(struct afs_operation *fc, const char *orig_name, struct afs_vnode *new_dvnode, const char *new_name, @@ -1105,7 +1105,7 @@ static const struct afs_call_type afs_RXFSStoreData64 = { /* * store a set of pages to a very large file */ -static int afs_fs_store_data64(struct afs_fs_cursor *fc, +static int afs_fs_store_data64(struct afs_operation *fc, struct address_space *mapping, pgoff_t first, pgoff_t last, unsigned offset, unsigned to, @@ -1165,7 +1165,7 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc, /* * store a set of pages */ -int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, +int afs_fs_store_data(struct afs_operation *fc, struct address_space *mapping, pgoff_t first, pgoff_t last, unsigned offset, unsigned to, struct afs_status_cb *scb) @@ -1291,7 +1291,7 @@ static const struct afs_call_type afs_RXFSStoreData64_as_Status = { * set the attributes on a very large file, using FS.StoreData rather than * FS.StoreStatus so as to alter the file size also */ -static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr, +static int afs_fs_setattr_size64(struct afs_operation *fc, struct iattr *attr, struct 
afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; @@ -1340,7 +1340,7 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr, * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus * so as to alter the file size also */ -static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr, +static int afs_fs_setattr_size(struct afs_operation *fc, struct iattr *attr, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; @@ -1388,7 +1388,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr, * set the attributes on a file, using FS.StoreData if there's a change in file * size, and FS.StoreStatus otherwise */ -int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr, +int afs_fs_setattr(struct afs_operation *fc, struct iattr *attr, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; @@ -1569,7 +1569,7 @@ static const struct afs_call_type afs_RXFSGetVolumeStatus = { /* * fetch the status of a volume */ -int afs_fs_get_volume_status(struct afs_fs_cursor *fc, +int afs_fs_get_volume_status(struct afs_operation *fc, struct afs_volume_status *vs) { struct afs_vnode *vnode = fc->vnode; @@ -1659,7 +1659,7 @@ static const struct afs_call_type afs_RXFSReleaseLock = { /* * Set a lock on a file */ -int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type, +int afs_fs_set_lock(struct afs_operation *fc, afs_lock_type_t type, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; @@ -1698,7 +1698,7 @@ int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type, /* * extend a lock on a file */ -int afs_fs_extend_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb) +int afs_fs_extend_lock(struct afs_operation *fc, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; struct afs_call *call; @@ -1735,7 +1735,7 @@ int afs_fs_extend_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb) /* * release a lock on a file */ 
-int afs_fs_release_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb) +int afs_fs_release_lock(struct afs_operation *fc, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; struct afs_call *call; @@ -1941,7 +1941,7 @@ static const struct afs_call_type afs_RXFSFetchStatus = { /* * Fetch the status information for a fid without needing a vnode handle. */ -int afs_fs_fetch_status(struct afs_fs_cursor *fc, +int afs_fs_fetch_status(struct afs_operation *fc, struct afs_net *net, struct afs_fid *fid, struct afs_status_cb *scb, @@ -2101,7 +2101,7 @@ static const struct afs_call_type afs_RXFSInlineBulkStatus = { /* * Fetch the status information for up to 50 files */ -int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc, +int afs_fs_inline_bulk_status(struct afs_operation *fc, struct afs_net *net, struct afs_fid *fids, struct afs_status_cb *statuses, @@ -2234,7 +2234,7 @@ static const struct afs_call_type afs_RXFSFetchACL = { /* * Fetch the ACL for a file. */ -struct afs_acl *afs_fs_fetch_acl(struct afs_fs_cursor *fc, +struct afs_acl *afs_fs_fetch_acl(struct afs_operation *fc, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; @@ -2303,7 +2303,7 @@ static const struct afs_call_type afs_RXFSStoreACL = { /* * Fetch the ACL for a file. */ -int afs_fs_store_acl(struct afs_fs_cursor *fc, const struct afs_acl *acl, +int afs_fs_store_acl(struct afs_operation *fc, const struct afs_acl *acl, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 07933d106e0e..d2dbb3aef611 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -161,7 +161,7 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key, /* * Update the core inode struct from a returned status record. 
*/ -static void afs_apply_status(struct afs_fs_cursor *fc, +static void afs_apply_status(struct afs_operation *fc, struct afs_vnode *vnode, struct afs_status_cb *scb, const afs_dataversion_t *expected_version) @@ -243,7 +243,7 @@ static void afs_apply_status(struct afs_fs_cursor *fc, /* * Apply a callback to a vnode. */ -static void afs_apply_callback(struct afs_fs_cursor *fc, +static void afs_apply_callback(struct afs_operation *fc, struct afs_vnode *vnode, struct afs_status_cb *scb, unsigned int cb_break) @@ -267,7 +267,7 @@ static void afs_apply_callback(struct afs_fs_cursor *fc, * Apply the received status and callback to an inode all in the same critical * section to avoid races with afs_validate(). */ -void afs_vnode_commit_status(struct afs_fs_cursor *fc, +void afs_vnode_commit_status(struct afs_operation *fc, struct afs_vnode *vnode, unsigned int cb_break, const afs_dataversion_t *expected_version, @@ -304,7 +304,7 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool is_new, afs_access_t *_caller_access) { struct afs_status_cb *scb; - struct afs_fs_cursor fc; + struct afs_operation fc; int ret; _enter("%s,{%llx:%llu.%u,S=%lx}", @@ -813,7 +813,7 @@ void afs_evict_inode(struct inode *inode) */ int afs_setattr(struct dentry *dentry, struct iattr *attr) { - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 468bd2b0470d..0551dedb0371 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -766,7 +766,7 @@ struct afs_vl_cursor { /* * Cursor for iterating over a set of fileservers. 
*/ -struct afs_fs_cursor { +struct afs_operation { const struct afs_call_type *type; /* Type of call done */ struct afs_addr_cursor ac; struct afs_vnode *vnode; @@ -779,13 +779,13 @@ struct afs_fs_cursor { short index; /* Current server */ short error; unsigned short flags; -#define AFS_FS_CURSOR_STOP 0x0001 /* Set to cease iteration */ -#define AFS_FS_CURSOR_VBUSY 0x0002 /* Set if seen VBUSY */ -#define AFS_FS_CURSOR_VMOVED 0x0004 /* Set if seen VMOVED */ -#define AFS_FS_CURSOR_VNOVOL 0x0008 /* Set if seen VNOVOL */ -#define AFS_FS_CURSOR_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ -#define AFS_FS_CURSOR_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ -#define AFS_FS_CURSOR_INTR 0x0040 /* Set if op is interruptible */ +#define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ +#define AFS_OPERATION_VBUSY 0x0002 /* Set if seen VBUSY */ +#define AFS_OPERATION_VMOVED 0x0004 /* Set if seen VMOVED */ +#define AFS_OPERATION_VNOVOL 0x0008 /* Set if seen VNOVOL */ +#define AFS_OPERATION_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ +#define AFS_OPERATION_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... 
*/ +#define AFS_OPERATION_INTR 0x0040 /* Set if op is interruptible */ unsigned short nr_iterations; /* Number of server iterations */ }; @@ -958,35 +958,35 @@ extern int afs_flock(struct file *, int, struct file_lock *); /* * fsclient.c */ -extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_status_cb *, +extern int afs_fs_fetch_file_status(struct afs_operation *, struct afs_status_cb *, struct afs_volsync *); -extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_status_cb *, struct afs_read *); -extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, +extern int afs_fs_fetch_data(struct afs_operation *, struct afs_status_cb *, struct afs_read *); +extern int afs_fs_create(struct afs_operation *, const char *, umode_t, struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *); -extern int afs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, +extern int afs_fs_remove(struct afs_operation *, struct afs_vnode *, const char *, bool, struct afs_status_cb *); -extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, +extern int afs_fs_link(struct afs_operation *, struct afs_vnode *, const char *, struct afs_status_cb *, struct afs_status_cb *); -extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, +extern int afs_fs_symlink(struct afs_operation *, const char *, const char *, struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *); -extern int afs_fs_rename(struct afs_fs_cursor *, const char *, +extern int afs_fs_rename(struct afs_operation *, const char *, struct afs_vnode *, const char *, struct afs_status_cb *, struct afs_status_cb *); -extern int afs_fs_store_data(struct afs_fs_cursor *, struct address_space *, +extern int afs_fs_store_data(struct afs_operation *, struct address_space *, pgoff_t, pgoff_t, unsigned, unsigned, struct afs_status_cb *); -extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *, struct 
afs_status_cb *); -extern int afs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *); -extern int afs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t, struct afs_status_cb *); -extern int afs_fs_extend_lock(struct afs_fs_cursor *, struct afs_status_cb *); -extern int afs_fs_release_lock(struct afs_fs_cursor *, struct afs_status_cb *); +extern int afs_fs_setattr(struct afs_operation *, struct iattr *, struct afs_status_cb *); +extern int afs_fs_get_volume_status(struct afs_operation *, struct afs_volume_status *); +extern int afs_fs_set_lock(struct afs_operation *, afs_lock_type_t, struct afs_status_cb *); +extern int afs_fs_extend_lock(struct afs_operation *, struct afs_status_cb *); +extern int afs_fs_release_lock(struct afs_operation *, struct afs_status_cb *); extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); extern bool afs_fs_get_capabilities(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); -extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *, +extern int afs_fs_inline_bulk_status(struct afs_operation *, struct afs_net *, struct afs_fid *, struct afs_status_cb *, unsigned int, struct afs_volsync *); -extern int afs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *, +extern int afs_fs_fetch_status(struct afs_operation *, struct afs_net *, struct afs_fid *, struct afs_status_cb *, struct afs_volsync *); @@ -995,8 +995,8 @@ struct afs_acl { u8 data[]; }; -extern struct afs_acl *afs_fs_fetch_acl(struct afs_fs_cursor *, struct afs_status_cb *); -extern int afs_fs_store_acl(struct afs_fs_cursor *, const struct afs_acl *, +extern struct afs_acl *afs_fs_fetch_acl(struct afs_operation *, struct afs_status_cb *); +extern int afs_fs_store_acl(struct afs_operation *, const struct afs_acl *, struct afs_status_cb *); /* @@ -1010,7 +1010,7 @@ extern void afs_fs_probe_dispatcher(struct work_struct *); /* * inode.c */ 
-extern void afs_vnode_commit_status(struct afs_fs_cursor *, +extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode *, unsigned int, const afs_dataversion_t *, @@ -1109,11 +1109,11 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} /* * rotate.c */ -extern bool afs_begin_vnode_operation(struct afs_fs_cursor *, struct afs_vnode *, +extern bool afs_begin_vnode_operation(struct afs_operation *, struct afs_vnode *, struct key *, bool); -extern bool afs_select_fileserver(struct afs_fs_cursor *); -extern bool afs_select_current_fileserver(struct afs_fs_cursor *); -extern int afs_end_vnode_operation(struct afs_fs_cursor *); +extern bool afs_select_fileserver(struct afs_operation *); +extern bool afs_select_current_fileserver(struct afs_operation *); +extern int afs_end_vnode_operation(struct afs_operation *); /* * rxrpc.c @@ -1135,10 +1135,10 @@ extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, bool); extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); -static inline void afs_set_fc_call(struct afs_call *call, struct afs_fs_cursor *fc) +static inline void afs_set_fc_call(struct afs_call *call, struct afs_operation *op) { - call->intr = fc->flags & AFS_FS_CURSOR_INTR; - fc->type = call->type; + call->intr = op->flags & AFS_OPERATION_INTR; + op->type = call->type; } static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) @@ -1256,7 +1256,7 @@ extern void afs_manage_servers(struct work_struct *); extern void afs_servers_timer(struct timer_list *); extern void afs_fs_probe_timer(struct timer_list *); extern void __net_exit afs_purge_servers(struct afs_net *); -extern bool afs_check_server_record(struct afs_fs_cursor *, struct afs_server *); +extern bool afs_check_server_record(struct afs_operation *, struct afs_server *); static inline void afs_inc_servers_outstanding(struct afs_net *net) { @@ -1358,7 +1358,7 @@ 
extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern void afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); extern void afs_put_volume(struct afs_cell *, struct afs_volume *); -extern int afs_check_volume_status(struct afs_volume *, struct afs_fs_cursor *); +extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c @@ -1387,34 +1387,34 @@ extern ssize_t afs_listxattr(struct dentry *, char *, size_t); /* * yfsclient.c */ -extern int yfs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_status_cb *, +extern int yfs_fs_fetch_file_status(struct afs_operation *, struct afs_status_cb *, struct afs_volsync *); -extern int yfs_fs_fetch_data(struct afs_fs_cursor *, struct afs_status_cb *, struct afs_read *); -extern int yfs_fs_create_file(struct afs_fs_cursor *, const char *, umode_t, struct afs_status_cb *, +extern int yfs_fs_fetch_data(struct afs_operation *, struct afs_status_cb *, struct afs_read *); +extern int yfs_fs_create_file(struct afs_operation *, const char *, umode_t, struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *); -extern int yfs_fs_make_dir(struct afs_fs_cursor *, const char *, umode_t, struct afs_status_cb *, +extern int yfs_fs_make_dir(struct afs_operation *, const char *, umode_t, struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *); -extern int yfs_fs_remove_file2(struct afs_fs_cursor *, struct afs_vnode *, const char *, +extern int yfs_fs_remove_file2(struct afs_operation *, struct afs_vnode *, const char *, struct afs_status_cb *, struct afs_status_cb *); -extern int yfs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, +extern int yfs_fs_remove(struct afs_operation *, struct afs_vnode *, const char *, bool, struct afs_status_cb *); -extern int yfs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, +extern int yfs_fs_link(struct afs_operation *, struct afs_vnode *, const 
char *, struct afs_status_cb *, struct afs_status_cb *); -extern int yfs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, +extern int yfs_fs_symlink(struct afs_operation *, const char *, const char *, struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *); -extern int yfs_fs_rename(struct afs_fs_cursor *, const char *, struct afs_vnode *, const char *, +extern int yfs_fs_rename(struct afs_operation *, const char *, struct afs_vnode *, const char *, struct afs_status_cb *, struct afs_status_cb *); -extern int yfs_fs_store_data(struct afs_fs_cursor *, struct address_space *, +extern int yfs_fs_store_data(struct afs_operation *, struct address_space *, pgoff_t, pgoff_t, unsigned, unsigned, struct afs_status_cb *); -extern int yfs_fs_setattr(struct afs_fs_cursor *, struct iattr *, struct afs_status_cb *); -extern int yfs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *); -extern int yfs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t, struct afs_status_cb *); -extern int yfs_fs_extend_lock(struct afs_fs_cursor *, struct afs_status_cb *); -extern int yfs_fs_release_lock(struct afs_fs_cursor *, struct afs_status_cb *); -extern int yfs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *, +extern int yfs_fs_setattr(struct afs_operation *, struct iattr *, struct afs_status_cb *); +extern int yfs_fs_get_volume_status(struct afs_operation *, struct afs_volume_status *); +extern int yfs_fs_set_lock(struct afs_operation *, afs_lock_type_t, struct afs_status_cb *); +extern int yfs_fs_extend_lock(struct afs_operation *, struct afs_status_cb *); +extern int yfs_fs_release_lock(struct afs_operation *, struct afs_status_cb *); +extern int yfs_fs_fetch_status(struct afs_operation *, struct afs_net *, struct afs_fid *, struct afs_status_cb *, struct afs_volsync *); -extern int yfs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *, +extern int yfs_fs_inline_bulk_status(struct afs_operation *, struct afs_net *, struct 
afs_fid *, struct afs_status_cb *, unsigned int, struct afs_volsync *); @@ -1429,9 +1429,9 @@ struct yfs_acl { }; extern void yfs_free_opaque_acl(struct yfs_acl *); -extern struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_fs_cursor *, struct yfs_acl *, +extern struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_operation *, struct yfs_acl *, struct afs_status_cb *); -extern int yfs_fs_store_opaque_acl2(struct afs_fs_cursor *, const struct afs_acl *, +extern int yfs_fs_store_opaque_acl2(struct afs_operation *, const struct afs_acl *, struct afs_status_cb *); /* @@ -1447,10 +1447,10 @@ static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) return &vnode->vfs_inode; } -static inline void afs_check_for_remote_deletion(struct afs_fs_cursor *fc, +static inline void afs_check_for_remote_deletion(struct afs_operation *op, struct afs_vnode *vnode) { - if (fc->ac.error == -ENOENT) { + if (op->ac.error == -ENOENT) { set_bit(AFS_VNODE_DELETED, &vnode->flags); afs_break_callback(vnode, afs_cb_break_for_deleted); } diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 46b68da89faa..c930033473f6 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -20,20 +20,20 @@ * Fileserver operations are serialised on the server by vnode, so we serialise * them here also using the io_lock. 
*/ -bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode, +bool afs_begin_vnode_operation(struct afs_operation *op, struct afs_vnode *vnode, struct key *key, bool intr) { - memset(fc, 0, sizeof(*fc)); - fc->vnode = vnode; - fc->key = key; - fc->ac.error = SHRT_MAX; - fc->error = -EDESTADDRREQ; + memset(op, 0, sizeof(*op)); + op->vnode = vnode; + op->key = key; + op->ac.error = SHRT_MAX; + op->error = -EDESTADDRREQ; if (intr) { - fc->flags |= AFS_FS_CURSOR_INTR; + op->flags |= AFS_OPERATION_INTR; if (mutex_lock_interruptible(&vnode->io_lock) < 0) { - fc->error = -EINTR; - fc->flags |= AFS_FS_CURSOR_STOP; + op->error = -EINTR; + op->flags |= AFS_OPERATION_STOP; return false; } } else { @@ -41,7 +41,7 @@ bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode } if (vnode->lock_state != AFS_VNODE_LOCK_NONE) - fc->flags |= AFS_FS_CURSOR_CUR_ONLY; + op->flags |= AFS_OPERATION_CUR_ONLY; return true; } @@ -49,26 +49,26 @@ bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode * Begin iteration through a server list, starting with the vnode's last used * server if possible, or the last recorded good server if not. 
*/ -static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, +static bool afs_start_fs_iteration(struct afs_operation *op, struct afs_vnode *vnode) { struct afs_cb_interest *cbi; int i; read_lock(&vnode->volume->servers_lock); - fc->server_list = afs_get_serverlist(vnode->volume->servers); + op->server_list = afs_get_serverlist(vnode->volume->servers); read_unlock(&vnode->volume->servers_lock); - fc->untried = (1UL << fc->server_list->nr_servers) - 1; - fc->index = READ_ONCE(fc->server_list->preferred); + op->untried = (1UL << op->server_list->nr_servers) - 1; + op->index = READ_ONCE(op->server_list->preferred); cbi = rcu_dereference_protected(vnode->cb_interest, lockdep_is_held(&vnode->io_lock)); if (cbi) { /* See if the vnode's preferred record is still available */ - for (i = 0; i < fc->server_list->nr_servers; i++) { - if (fc->server_list->servers[i].cb_interest == cbi) { - fc->index = i; + for (i = 0; i < op->server_list->nr_servers; i++) { + if (op->server_list->servers[i].cb_interest == cbi) { + op->index = i; goto found_interest; } } @@ -77,8 +77,8 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, * serving this vnode, then we can't switch to another server * and have to return an error. */ - if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { - fc->error = -ESTALE; + if (op->flags & AFS_OPERATION_CUR_ONLY) { + op->error = -ESTALE; return false; } @@ -118,12 +118,12 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) /* * Sleep and retry the operation to the same fileserver. */ -static bool afs_sleep_and_retry(struct afs_fs_cursor *fc) +static bool afs_sleep_and_retry(struct afs_operation *op) { - if (fc->flags & AFS_FS_CURSOR_INTR) { + if (op->flags & AFS_OPERATION_INTR) { msleep_interruptible(1000); if (signal_pending(current)) { - fc->error = -ERESTARTSYS; + op->error = -ERESTARTSYS; return false; } } else { @@ -137,26 +137,26 @@ static bool afs_sleep_and_retry(struct afs_fs_cursor *fc) * Select the fileserver to use. 
May be called multiple times to rotate * through the fileservers. */ -bool afs_select_fileserver(struct afs_fs_cursor *fc) +bool afs_select_fileserver(struct afs_operation *op) { struct afs_addr_list *alist; struct afs_server *server; - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode *vnode = op->vnode; struct afs_error e; u32 rtt; - int error = fc->ac.error, i; + int error = op->ac.error, i; _enter("%lx[%d],%lx[%d],%d,%d", - fc->untried, fc->index, - fc->ac.tried, fc->ac.index, - error, fc->ac.abort_code); + op->untried, op->index, + op->ac.tried, op->ac.index, + error, op->ac.abort_code); - if (fc->flags & AFS_FS_CURSOR_STOP) { + if (op->flags & AFS_OPERATION_STOP) { _leave(" = f [stopped]"); return false; } - fc->nr_iterations++; + op->nr_iterations++; /* Evaluate the result of the previous operation, if there was one. */ switch (error) { @@ -166,8 +166,8 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) case 0: default: /* Success or local failure. Stop. */ - fc->error = error; - fc->flags |= AFS_FS_CURSOR_STOP; + op->error = error; + op->flags |= AFS_OPERATION_STOP; _leave(" = f [okay/local %d]", error); return false; @@ -175,42 +175,42 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) /* The far side rejected the operation on some grounds. This * might involve the server being busy or the volume having been moved. */ - switch (fc->ac.abort_code) { + switch (op->ac.abort_code) { case VNOVOL: /* This fileserver doesn't know about the volume. * - May indicate that the VL is wrong - retry once and compare * the results. * - May indicate that the fileserver couldn't attach to the vol. 
*/ - if (fc->flags & AFS_FS_CURSOR_VNOVOL) { - fc->error = -EREMOTEIO; + if (op->flags & AFS_OPERATION_VNOVOL) { + op->error = -EREMOTEIO; goto next_server; } write_lock(&vnode->volume->servers_lock); - fc->server_list->vnovol_mask |= 1 << fc->index; + op->server_list->vnovol_mask |= 1 << op->index; write_unlock(&vnode->volume->servers_lock); set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); - error = afs_check_volume_status(vnode->volume, fc); + error = afs_check_volume_status(vnode->volume, op); if (error < 0) goto failed_set_error; if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) { - fc->error = -ENOMEDIUM; + op->error = -ENOMEDIUM; goto failed; } /* If the server list didn't change, then assume that * it's the fileserver having trouble. */ - if (vnode->volume->servers == fc->server_list) { - fc->error = -EREMOTEIO; + if (vnode->volume->servers == op->server_list) { + op->error = -EREMOTEIO; goto next_server; } /* Try again */ - fc->flags |= AFS_FS_CURSOR_VNOVOL; + op->flags |= AFS_OPERATION_VNOVOL; _leave(" = t [vnovol]"); return true; @@ -220,20 +220,20 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) case VONLINE: case VDISKFULL: case VOVERQUOTA: - fc->error = afs_abort_to_error(fc->ac.abort_code); + op->error = afs_abort_to_error(op->ac.abort_code); goto next_server; case VOFFLINE: if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) { - afs_busy(vnode->volume, fc->ac.abort_code); + afs_busy(vnode->volume, op->ac.abort_code); clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); } - if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { - fc->error = -EADV; + if (op->flags & AFS_OPERATION_NO_VSLEEP) { + op->error = -EADV; goto failed; } - if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { - fc->error = -ESTALE; + if (op->flags & AFS_OPERATION_CUR_ONLY) { + op->error = -ESTALE; goto failed; } goto busy; @@ -244,17 +244,17 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) /* Retry after going round all the servers unless we * have a file lock 
we need to maintain. */ - if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { - fc->error = -EBUSY; + if (op->flags & AFS_OPERATION_NO_VSLEEP) { + op->error = -EBUSY; goto failed; } if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) { - afs_busy(vnode->volume, fc->ac.abort_code); + afs_busy(vnode->volume, op->ac.abort_code); clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); } busy: - if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { - if (!afs_sleep_and_retry(fc)) + if (op->flags & AFS_OPERATION_CUR_ONLY) { + if (!afs_sleep_and_retry(op)) goto failed; /* Retry with same server & address */ @@ -262,7 +262,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) return true; } - fc->flags |= AFS_FS_CURSOR_VBUSY; + op->flags |= AFS_OPERATION_VBUSY; goto next_server; case VMOVED: @@ -273,15 +273,15 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * We also limit the number of VMOVED hops we will * honour, just in case someone sets up a loop. */ - if (fc->flags & AFS_FS_CURSOR_VMOVED) { - fc->error = -EREMOTEIO; + if (op->flags & AFS_OPERATION_VMOVED) { + op->error = -EREMOTEIO; goto failed; } - fc->flags |= AFS_FS_CURSOR_VMOVED; + op->flags |= AFS_OPERATION_VMOVED; set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags); set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); - error = afs_check_volume_status(vnode->volume, fc); + error = afs_check_volume_status(vnode->volume, op); if (error < 0) goto failed_set_error; @@ -294,8 +294,8 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * * TODO: Retry a few times with sleeps. 
*/ - if (vnode->volume->servers == fc->server_list) { - fc->error = -ENOMEDIUM; + if (vnode->volume->servers == op->server_list) { + op->error = -ENOMEDIUM; goto failed; } @@ -304,13 +304,13 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) default: clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); - fc->error = afs_abort_to_error(fc->ac.abort_code); + op->error = afs_abort_to_error(op->ac.abort_code); goto failed; } case -ETIMEDOUT: case -ETIME: - if (fc->error != -EDESTADDRREQ) + if (op->error != -EDESTADDRREQ) goto iterate_address; /* Fall through */ case -ERFKILL: @@ -320,83 +320,83 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) case -EHOSTDOWN: case -ECONNREFUSED: _debug("no conn"); - fc->error = error; + op->error = error; goto iterate_address; case -ECONNRESET: _debug("call reset"); - fc->error = error; + op->error = error; goto failed; } restart_from_beginning: _debug("restart"); - afs_end_cursor(&fc->ac); - afs_put_cb_interest(afs_v2net(vnode), fc->cbi); - fc->cbi = NULL; - afs_put_serverlist(afs_v2net(vnode), fc->server_list); - fc->server_list = NULL; + afs_end_cursor(&op->ac); + afs_put_cb_interest(afs_v2net(vnode), op->cbi); + op->cbi = NULL; + afs_put_serverlist(afs_v2net(vnode), op->server_list); + op->server_list = NULL; start: _debug("start"); /* See if we need to do an update of the volume record. Note that the * volume may have moved or even have been deleted. 
*/ - error = afs_check_volume_status(vnode->volume, fc); + error = afs_check_volume_status(vnode->volume, op); if (error < 0) goto failed_set_error; - if (!afs_start_fs_iteration(fc, vnode)) + if (!afs_start_fs_iteration(op, vnode)) goto failed; _debug("__ VOL %llx __", vnode->volume->vid); pick_server: - _debug("pick [%lx]", fc->untried); + _debug("pick [%lx]", op->untried); - error = afs_wait_for_fs_probes(fc->server_list, fc->untried); + error = afs_wait_for_fs_probes(op->server_list, op->untried); if (error < 0) goto failed_set_error; /* Pick the untried server with the lowest RTT. If we have outstanding * callbacks, we stick with the server we're already using if we can. */ - if (fc->cbi) { - _debug("cbi %u", fc->index); - if (test_bit(fc->index, &fc->untried)) + if (op->cbi) { + _debug("cbi %u", op->index); + if (test_bit(op->index, &op->untried)) goto selected_server; - afs_put_cb_interest(afs_v2net(vnode), fc->cbi); - fc->cbi = NULL; + afs_put_cb_interest(afs_v2net(vnode), op->cbi); + op->cbi = NULL; _debug("nocbi"); } - fc->index = -1; + op->index = -1; rtt = U32_MAX; - for (i = 0; i < fc->server_list->nr_servers; i++) { - struct afs_server *s = fc->server_list->servers[i].server; + for (i = 0; i < op->server_list->nr_servers; i++) { + struct afs_server *s = op->server_list->servers[i].server; - if (!test_bit(i, &fc->untried) || !s->probe.responded) + if (!test_bit(i, &op->untried) || !s->probe.responded) continue; if (s->probe.rtt < rtt) { - fc->index = i; + op->index = i; rtt = s->probe.rtt; } } - if (fc->index == -1) + if (op->index == -1) goto no_more_servers; selected_server: - _debug("use %d", fc->index); - __clear_bit(fc->index, &fc->untried); + _debug("use %d", op->index); + __clear_bit(op->index, &op->untried); /* We're starting on a different fileserver from the list. We need to * check it, create a callback intercept, find its address list and * probe its capabilities before we use it. 
*/ - ASSERTCMP(fc->ac.alist, ==, NULL); - server = fc->server_list->servers[fc->index].server; + ASSERTCMP(op->ac.alist, ==, NULL); + server = op->server_list->servers[op->index].server; - if (!afs_check_server_record(fc, server)) + if (!afs_check_server_record(op, server)) goto failed; _debug("USING SERVER: %pU", &server->uuid); @@ -406,12 +406,12 @@ selected_server: * break request before we've finished decoding the reply and * installing the vnode. */ - error = afs_register_server_cb_interest(vnode, fc->server_list, - fc->index); + error = afs_register_server_cb_interest(vnode, op->server_list, + op->index); if (error < 0) goto failed_set_error; - fc->cbi = afs_get_cb_interest( + op->cbi = afs_get_cb_interest( rcu_dereference_protected(vnode->cb_interest, lockdep_is_held(&vnode->io_lock))); @@ -421,44 +421,44 @@ selected_server: afs_get_addrlist(alist); read_unlock(&server->fs_lock); - memset(&fc->ac, 0, sizeof(fc->ac)); + memset(&op->ac, 0, sizeof(op->ac)); - if (!fc->ac.alist) - fc->ac.alist = alist; + if (!op->ac.alist) + op->ac.alist = alist; else afs_put_addrlist(alist); - fc->ac.index = -1; + op->ac.index = -1; iterate_address: - ASSERT(fc->ac.alist); + ASSERT(op->ac.alist); /* Iterate over the current server's address list to try and find an * address on which it will respond to us. */ - if (!afs_iterate_addresses(&fc->ac)) + if (!afs_iterate_addresses(&op->ac)) goto next_server; - _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs); + _debug("address [%u] %u/%u", op->index, op->ac.index, op->ac.alist->nr_addrs); _leave(" = t"); return true; next_server: _debug("next"); - afs_end_cursor(&fc->ac); + afs_end_cursor(&op->ac); goto pick_server; no_more_servers: /* That's all the servers poked to no good effect. Try again if some * of them were busy. 
*/ - if (fc->flags & AFS_FS_CURSOR_VBUSY) + if (op->flags & AFS_OPERATION_VBUSY) goto restart_from_beginning; e.error = -EDESTADDRREQ; e.responded = false; - for (i = 0; i < fc->server_list->nr_servers; i++) { - struct afs_server *s = fc->server_list->servers[i].server; + for (i = 0; i < op->server_list->nr_servers; i++) { + struct afs_server *s = op->server_list->servers[i].server; afs_prioritise_error(&e, READ_ONCE(s->probe.error), s->probe.abort_code); @@ -467,11 +467,11 @@ no_more_servers: error = e.error; failed_set_error: - fc->error = error; + op->error = error; failed: - fc->flags |= AFS_FS_CURSOR_STOP; - afs_end_cursor(&fc->ac); - _leave(" = f [failed %d]", fc->error); + op->flags |= AFS_OPERATION_STOP; + afs_end_cursor(&op->ac); + _leave(" = f [failed %d]", op->error); return false; } @@ -480,12 +480,12 @@ failed: * fileserver. We use this when we have a lock on that file, which is backed * only by the fileserver we obtained it from. */ -bool afs_select_current_fileserver(struct afs_fs_cursor *fc) +bool afs_select_current_fileserver(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode *vnode = op->vnode; struct afs_cb_interest *cbi; struct afs_addr_list *alist; - int error = fc->ac.error; + int error = op->ac.error; _enter(""); @@ -495,12 +495,12 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) switch (error) { case SHRT_MAX: if (!cbi) { - fc->error = -ESTALE; - fc->flags |= AFS_FS_CURSOR_STOP; + op->error = -ESTALE; + op->flags |= AFS_OPERATION_STOP; return false; } - fc->cbi = afs_get_cb_interest(cbi); + op->cbi = afs_get_cb_interest(cbi); read_lock(&cbi->server->fs_lock); alist = rcu_dereference_protected(cbi->server->addresses, @@ -508,27 +508,27 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) afs_get_addrlist(alist); read_unlock(&cbi->server->fs_lock); if (!alist) { - fc->error = -ESTALE; - fc->flags |= AFS_FS_CURSOR_STOP; + op->error = -ESTALE; + op->flags |= AFS_OPERATION_STOP; return 
false; } - memset(&fc->ac, 0, sizeof(fc->ac)); - fc->ac.alist = alist; - fc->ac.index = -1; + memset(&op->ac, 0, sizeof(op->ac)); + op->ac.alist = alist; + op->ac.index = -1; goto iterate_address; case 0: default: /* Success or local failure. Stop. */ - fc->error = error; - fc->flags |= AFS_FS_CURSOR_STOP; + op->error = error; + op->flags |= AFS_OPERATION_STOP; _leave(" = f [okay/local %d]", error); return false; case -ECONNABORTED: - fc->error = afs_abort_to_error(fc->ac.abort_code); - fc->flags |= AFS_FS_CURSOR_STOP; + op->error = afs_abort_to_error(op->ac.abort_code); + op->flags |= AFS_OPERATION_STOP; _leave(" = f [abort]"); return false; @@ -541,7 +541,7 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) case -ETIMEDOUT: case -ETIME: _debug("no conn"); - fc->error = error; + op->error = error; goto iterate_address; } @@ -549,19 +549,19 @@ iterate_address: /* Iterate over the current server's address list to try and find an * address on which it will respond to us. */ - if (afs_iterate_addresses(&fc->ac)) { + if (afs_iterate_addresses(&op->ac)) { _leave(" = t"); return true; } - afs_end_cursor(&fc->ac); + afs_end_cursor(&op->ac); return false; } /* * Dump cursor state in the case of the error being EDESTADDRREQ. 
*/ -static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc) +static void afs_dump_edestaddrreq(const struct afs_operation *op) { static int count; int i; @@ -574,12 +574,12 @@ static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc) pr_notice("EDESTADDR occurred\n"); pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n", - fc->cb_break, fc->cb_break_2, fc->flags, fc->error); + op->cb_break, op->cb_break_2, op->flags, op->error); pr_notice("FC: ut=%lx ix=%d ni=%u\n", - fc->untried, fc->index, fc->nr_iterations); + op->untried, op->index, op->nr_iterations); - if (fc->server_list) { - const struct afs_server_list *sl = fc->server_list; + if (op->server_list) { + const struct afs_server_list *sl = op->server_list; pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n", sl->nr_servers, sl->preferred, sl->vnovol_mask); for (i = 0; i < sl->nr_servers; i++) { @@ -595,39 +595,39 @@ static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc) a->preferred); pr_notice("FC: - R=%lx F=%lx\n", a->responded, a->failed); - if (a == fc->ac.alist) + if (a == op->ac.alist) pr_notice("FC: - current\n"); } } } pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", - fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error, - fc->ac.responded, fc->ac.nr_iterations); + op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error, + op->ac.responded, op->ac.nr_iterations); rcu_read_unlock(); } /* * Tidy up a filesystem cursor and unlock the vnode. 
*/ -int afs_end_vnode_operation(struct afs_fs_cursor *fc) +int afs_end_vnode_operation(struct afs_operation *op) { - struct afs_net *net = afs_v2net(fc->vnode); + struct afs_net *net = afs_v2net(op->vnode); - if (fc->error == -EDESTADDRREQ || - fc->error == -EADDRNOTAVAIL || - fc->error == -ENETUNREACH || - fc->error == -EHOSTUNREACH) - afs_dump_edestaddrreq(fc); + if (op->error == -EDESTADDRREQ || + op->error == -EADDRNOTAVAIL || + op->error == -ENETUNREACH || + op->error == -EHOSTUNREACH) + afs_dump_edestaddrreq(op); - mutex_unlock(&fc->vnode->io_lock); + mutex_unlock(&op->vnode->io_lock); - afs_end_cursor(&fc->ac); - afs_put_cb_interest(net, fc->cbi); - afs_put_serverlist(net, fc->server_list); + afs_end_cursor(&op->ac); + afs_put_cb_interest(net, op->cbi); + afs_put_serverlist(net, op->server_list); - if (fc->error == -ECONNABORTED) - fc->error = afs_abort_to_error(fc->ac.abort_code); + if (op->error == -ECONNABORTED) + op->error = afs_abort_to_error(op->ac.abort_code); - return fc->error; + return op->error; } diff --git a/fs/afs/server.c b/fs/afs/server.c index 5ed90f419c54..3008f2ecfeee 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -571,7 +571,7 @@ void afs_purge_servers(struct afs_net *net) /* * Get an update for a server's address list. */ -static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct afs_server *server) +static noinline bool afs_update_server_record(struct afs_operation *fc, struct afs_server *server) { struct afs_addr_list *alist, *discard; @@ -585,7 +585,7 @@ static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct a if (IS_ERR(alist)) { if ((PTR_ERR(alist) == -ERESTARTSYS || PTR_ERR(alist) == -EINTR) && - !(fc->flags & AFS_FS_CURSOR_INTR) && + !(fc->flags & AFS_OPERATION_INTR) && server->addresses) { _leave(" = t [intr]"); return true; @@ -613,7 +613,7 @@ static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct a /* * See if a server's address list needs updating. 
*/ -bool afs_check_server_record(struct afs_fs_cursor *fc, struct afs_server *server) +bool afs_check_server_record(struct afs_operation *fc, struct afs_server *server) { bool success; int ret, retries = 0; @@ -642,7 +642,7 @@ update: wait: ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING, - (fc->flags & AFS_FS_CURSOR_INTR) ? + (fc->flags & AFS_OPERATION_INTR) ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (ret == -ERESTARTSYS) { fc->error = ret; diff --git a/fs/afs/super.c b/fs/afs/super.c index dda7a9a66848..9f412d7e7edf 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -715,7 +715,7 @@ static void afs_destroy_inode(struct inode *inode) static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct afs_super_info *as = AFS_FS_S(dentry->d_sb); - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_volume_status vs; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; @@ -738,7 +738,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key, true)) { - fc.flags |= AFS_FS_CURSOR_NO_VSLEEP; + fc.flags |= AFS_OPERATION_NO_VSLEEP; while (afs_select_fileserver(&fc)) { fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_get_volume_status(&fc, &vs); diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 249000195f8a..96351088a578 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -280,7 +280,7 @@ error: /* * Make sure the volume record is up to date. */ -int afs_check_volume_status(struct afs_volume *volume, struct afs_fs_cursor *fc) +int afs_check_volume_status(struct afs_volume *volume, struct afs_operation *fc) { int ret, retries = 0; @@ -315,7 +315,7 @@ wait: } ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, - (fc->flags & AFS_FS_CURSOR_INTR) ? + (fc->flags & AFS_OPERATION_INTR) ? 
TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (ret == -ERESTARTSYS) { _leave(" = %d", ret); diff --git a/fs/afs/write.c b/fs/afs/write.c index cb76566763db..1a8af44ea36b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -356,7 +356,7 @@ static int afs_store_data(struct address_space *mapping, unsigned offset, unsigned to) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; struct afs_wb_key *wbk = NULL; struct list_head *p; diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 7af41fd5f3ee..bf645f1c90b0 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -42,7 +42,7 @@ static int afs_xattr_get_acl(const struct xattr_handler *handler, struct inode *inode, const char *name, void *buffer, size_t size) { - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_acl *acl = NULL; @@ -100,7 +100,7 @@ static int afs_xattr_set_acl(const struct xattr_handler *handler, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_acl *acl = NULL; @@ -165,7 +165,7 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler, struct inode *inode, const char *name, void *buffer, size_t size) { - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; struct afs_vnode *vnode = AFS_FS_I(inode); struct yfs_acl *yacl = NULL; @@ -270,7 +270,7 @@ static int afs_xattr_set_yfs(const struct xattr_handler *handler, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - struct afs_fs_cursor fc; + struct afs_operation fc; struct afs_status_cb *scb; struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_acl *acl = NULL; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index bf74c679c02b..360b4a560ba7 100644 --- 
a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -387,7 +387,7 @@ static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = { /* * Fetch the status information for a file. */ -int yfs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_status_cb *scb, +int yfs_fs_fetch_file_status(struct afs_operation *fc, struct afs_status_cb *scb, struct afs_volsync *volsync) { struct afs_vnode *vnode = fc->vnode; @@ -575,7 +575,7 @@ static const struct afs_call_type yfs_RXYFSFetchData64 = { /* * Fetch data from a file. */ -int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_status_cb *scb, +int yfs_fs_fetch_data(struct afs_operation *fc, struct afs_status_cb *scb, struct afs_read *req) { struct afs_vnode *vnode = fc->vnode; @@ -657,7 +657,7 @@ static const struct afs_call_type afs_RXFSCreateFile = { /* * Create a file. */ -int yfs_fs_create_file(struct afs_fs_cursor *fc, +int yfs_fs_create_file(struct afs_operation *fc, const char *name, umode_t mode, struct afs_status_cb *dvnode_scb, @@ -721,7 +721,7 @@ static const struct afs_call_type yfs_RXFSMakeDir = { /* * Make a directory. */ -int yfs_fs_make_dir(struct afs_fs_cursor *fc, +int yfs_fs_make_dir(struct afs_operation *fc, const char *name, umode_t mode, struct afs_status_cb *dvnode_scb, @@ -811,7 +811,7 @@ static const struct afs_call_type yfs_RXYFSRemoveFile2 = { /* * Remove a file and retrieve new file status. 
*/ -int yfs_fs_remove_file2(struct afs_fs_cursor *fc, struct afs_vnode *vnode, +int yfs_fs_remove_file2(struct afs_operation *fc, struct afs_vnode *vnode, const char *name, struct afs_status_cb *dvnode_scb, struct afs_status_cb *vnode_scb) { @@ -896,7 +896,7 @@ static const struct afs_call_type yfs_RXYFSRemoveDir = { /* * remove a file or directory */ -int yfs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode, +int yfs_fs_remove(struct afs_operation *fc, struct afs_vnode *vnode, const char *name, bool isdir, struct afs_status_cb *dvnode_scb) { @@ -973,7 +973,7 @@ static const struct afs_call_type yfs_RXYFSLink = { /* * Make a hard link. */ -int yfs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, +int yfs_fs_link(struct afs_operation *fc, struct afs_vnode *vnode, const char *name, struct afs_status_cb *dvnode_scb, struct afs_status_cb *vnode_scb) @@ -1057,7 +1057,7 @@ static const struct afs_call_type yfs_RXYFSSymlink = { /* * Create a symbolic link. */ -int yfs_fs_symlink(struct afs_fs_cursor *fc, +int yfs_fs_symlink(struct afs_operation *fc, const char *name, const char *contents, struct afs_status_cb *dvnode_scb, @@ -1148,7 +1148,7 @@ static const struct afs_call_type yfs_RXYFSRename = { /* * Rename a file or directory. */ -int yfs_fs_rename(struct afs_fs_cursor *fc, +int yfs_fs_rename(struct afs_operation *fc, const char *orig_name, struct afs_vnode *new_dvnode, const char *new_name, @@ -1212,7 +1212,7 @@ static const struct afs_call_type yfs_RXYFSStoreData64 = { /* * Store a set of pages to a large file. 
*/ -int yfs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, +int yfs_fs_store_data(struct afs_operation *fc, struct address_space *mapping, pgoff_t first, pgoff_t last, unsigned offset, unsigned to, struct afs_status_cb *scb) @@ -1299,7 +1299,7 @@ static const struct afs_call_type yfs_RXYFSStoreData64_as_Status = { * Set the attributes on a file, using YFS.StoreData64 rather than * YFS.StoreStatus so as to alter the file size also. */ -static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr, +static int yfs_fs_setattr_size(struct afs_operation *fc, struct iattr *attr, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; @@ -1345,7 +1345,7 @@ static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr, * Set the attributes on a file, using YFS.StoreData64 if there's a change in * file size, and YFS.StoreStatus otherwise. */ -int yfs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr, +int yfs_fs_setattr(struct afs_operation *fc, struct iattr *attr, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; @@ -1526,7 +1526,7 @@ static const struct afs_call_type yfs_RXYFSGetVolumeStatus = { /* * fetch the status of a volume */ -int yfs_fs_get_volume_status(struct afs_fs_cursor *fc, +int yfs_fs_get_volume_status(struct afs_operation *fc, struct afs_volume_status *vs) { struct afs_vnode *vnode = fc->vnode; @@ -1598,7 +1598,7 @@ static const struct afs_call_type yfs_RXYFSReleaseLock = { /* * Set a lock on a file */ -int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type, +int yfs_fs_set_lock(struct afs_operation *fc, afs_lock_type_t type, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; @@ -1639,7 +1639,7 @@ int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type, /* * extend a lock on a file */ -int yfs_fs_extend_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb) +int yfs_fs_extend_lock(struct afs_operation *fc, struct afs_status_cb *scb) { 
struct afs_vnode *vnode = fc->vnode; struct afs_call *call; @@ -1677,7 +1677,7 @@ int yfs_fs_extend_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb) /* * release a lock on a file */ -int yfs_fs_release_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb) +int yfs_fs_release_lock(struct afs_operation *fc, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; struct afs_call *call; @@ -1725,7 +1725,7 @@ static const struct afs_call_type yfs_RXYFSFetchStatus = { /* * Fetch the status information for a fid without needing a vnode handle. */ -int yfs_fs_fetch_status(struct afs_fs_cursor *fc, +int yfs_fs_fetch_status(struct afs_operation *fc, struct afs_net *net, struct afs_fid *fid, struct afs_status_cb *scb, @@ -1888,7 +1888,7 @@ static const struct afs_call_type yfs_RXYFSInlineBulkStatus = { /* * Fetch the status information for up to 1024 files */ -int yfs_fs_inline_bulk_status(struct afs_fs_cursor *fc, +int yfs_fs_inline_bulk_status(struct afs_operation *fc, struct afs_net *net, struct afs_fid *fids, struct afs_status_cb *statuses, @@ -2065,7 +2065,7 @@ static const struct afs_call_type yfs_RXYFSFetchOpaqueACL = { /* * Fetch the YFS advanced ACLs for a file. */ -struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_fs_cursor *fc, +struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_operation *fc, struct yfs_acl *yacl, struct afs_status_cb *scb) { @@ -2119,7 +2119,7 @@ static const struct afs_call_type yfs_RXYFSStoreOpaqueACL2 = { /* * Fetch the YFS ACL for a file. 
*/ -int yfs_fs_store_opaque_acl2(struct afs_fs_cursor *fc, const struct afs_acl *acl, +int yfs_fs_store_opaque_acl2(struct afs_operation *fc, const struct afs_acl *acl, struct afs_status_cb *scb) { struct afs_vnode *vnode = fc->vnode; From 8fd6e1d6941ce623ae62420d264c1b5505971b2b Mon Sep 17 00:00:00 2001 From: Kenneth D'souza Date: Mon, 18 May 2020 13:01:34 +0530 Subject: [PATCH 243/427] cifs: handle "nolease" option for vers=1.0 The "nolease" mount option is only supported for SMB2+ mounts. Fail with appropriate error message if vers=1.0 option is passed. Signed-off-by: Kenneth D'souza Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/connect.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 28268ed461b8..62503fbed2ab 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3580,6 +3580,16 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) cifs_dbg(VFS, "cache=singleclient requested on mount but NO_CACHING flag set on share\n"); } + if (volume_info->no_lease) { + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "SMB2 or later required for nolease option\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } else + tcon->no_lease = volume_info->no_lease; + } + /* * We can have only one retry value for a connection to a share so for * resources mounted more than once to the same server share the last @@ -3589,7 +3599,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->nocase = volume_info->nocase; tcon->nohandlecache = volume_info->nohandlecache; tcon->local_lease = volume_info->local_lease; - tcon->no_lease = volume_info->no_lease; INIT_LIST_HEAD(&tcon->pending_opens); spin_lock(&cifs_tcp_ses_lock); From 136a5dc3309a53ee830cd649961d70c8bec55f38 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 27 May 2020 13:50:31 +0100 Subject: [PATCH 244/427] cifs: remove redundant initialization of variable rc The variable rc is being 
initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 5014a82391ff..d62f9175c546 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2375,7 +2375,7 @@ int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, struct kvec *iov, int n_vec) { - int rc = -EACCES; + int rc; WRITE_REQ *pSMB = NULL; int wct; int smb_hdr_len; From aaa3aef34d3ab9499a5c7633823429f7a24e6dff Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 19 May 2020 15:38:27 -0300 Subject: [PATCH 245/427] cifs: set up next DFS target before generic_ip_connect() If we mount a very specific DFS link \\FS0.FOO.COM\dfs\link -> \FS0\share1, \FS1\share2 where its target list contains NB names ("FS0" & "FS1") rather than FQDN ones ("FS0.FOO.COM" & "FS1.FOO.COM"), we end up connecting to \FOO\share1 but server->hostname will have "FOO.COM". The reason is because both "FS0" and "FS0.FOO.COM" resolve to the same IP address and they share the same TCP server connection, but "FS0.FOO.COM" was the first hostname set -- which is OK. However, if the echo thread times out and we still have a good connection to "FS0", in cifs_reconnect() rc = generic_ip_connect(server) -> success if (rc) { ... reconn_inval_dfs_target(server, cifs_sb, &tgt_list, &tgt_it); ... } ... it successfully reconnects to "FS0" server but does not set up next DFS target - which should be the same target server "\FS0\share1" - and server->hostname remains set to "FS0.FOO.COM" rather than "FS0", as reconn_inval_dfs_target() would have it set to "FS0" if called earlier.
Finally, in __smb2_reconnect(), the reconnect of tcons would fail because tcon->ses->server->hostname (FS0.FOO.COM) does not match DFS target's hostname (FS0). Fix that by calling reconn_inval_dfs_target() before generic_ip_connect() so server->hostname will get updated correctly prior to reconnecting its tcons in __smb2_reconnect(). With "cifs: handle hostnames that resolve to same ip in failover" patch - The above problem would not occur. - We could save a DNS query to find out that they both resolve to the same ip address. Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/connect.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 62503fbed2ab..aa1173f3fb12 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -572,26 +572,26 @@ cifs_reconnect(struct TCP_Server_Info *server) try_to_freeze(); mutex_lock(&server->srv_mutex); +#ifdef CONFIG_CIFS_DFS_UPCALL /* * Set up next DFS target server (if any) for reconnect. If DFS * feature is disabled, then we will retry last server we * connected to before.
*/ + reconn_inval_dfs_target(server, cifs_sb, &tgt_list, &tgt_it); +#endif + rc = reconn_set_ipaddr(server); + if (rc) { + cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", + __func__, rc); + } + if (cifs_rdma_enabled(server)) rc = smbd_reconnect(server); else rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); -#ifdef CONFIG_CIFS_DFS_UPCALL - reconn_inval_dfs_target(server, cifs_sb, &tgt_list, - &tgt_it); -#endif - rc = reconn_set_ipaddr(server); - if (rc) { - cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", - __func__, rc); - } mutex_unlock(&server->srv_mutex); msleep(3000); } else { From e4af35fa55b072190711c11e2bfff8326d313948 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 19 May 2020 15:38:28 -0300 Subject: [PATCH 246/427] cifs: handle hostnames that resolve to same ip in failover In order to support reconnect to hostnames that resolve to same ip address, besides relying on the currently set hostname to match DFS targets, attempt to resolve the targets and then match their addresses with the reconnected server ip address. For instance, if we have two hostnames "FOO" and "BAR", and both resolve to the same ip address, we would be able to handle failover in DFS paths like \\FOO\dfs\link1 -> [ \BAZ\share2 (*), \BAR\share1 ] \\FOO\dfs\link2 -> [ \BAZ\share2 (*), \FOO\share1 ] so when "BAZ" is no longer accessible, link1 and link2 would get reconnected despite having different target hostnames. 
Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 5 ++++ fs/cifs/cifssmb.c | 55 +++++++++++++++++++++++++++++--------------- fs/cifs/connect.c | 6 ++--- fs/cifs/misc.c | 48 ++++++++++++++++++++++++++++++++++++++ fs/cifs/smb2pdu.c | 56 ++++++++++++++++++++++++++++++--------------- 5 files changed, 131 insertions(+), 39 deletions(-) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 12a895e02db4..311d8e86c5a8 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -89,6 +89,7 @@ extern void cifs_mid_q_entry_release(struct mid_q_entry *midEntry); extern void cifs_wake_up_task(struct mid_q_entry *mid); extern int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid); +extern bool cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs); extern int cifs_discard_remaining_data(struct TCP_Server_Info *server); extern int cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, @@ -616,6 +617,10 @@ static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, return dfs_cache_find(xid, ses, nls_codepage, remap, old_path, referral, NULL); } + +int match_target_ip(struct TCP_Server_Info *server, + const char *share, size_t share_len, + bool *result); #endif static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d62f9175c546..5a684f9ac883 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -129,6 +129,7 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, struct cifs_tcon *tcon) { int rc; + struct TCP_Server_Info *server = tcon->ses->server; struct dfs_cache_tgt_list tl; struct dfs_cache_tgt_iterator *it = NULL; char *tree; @@ -141,15 +142,14 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, if (!tree) return -ENOMEM; - if (tcon->ipc) { - scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", - 
tcon->ses->server->hostname); - rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); - goto out; - } - if (!tcon->dfs_path) { - rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc); + if (tcon->ipc) { + scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", + server->hostname); + rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); + } else { + rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc); + } goto out; } @@ -157,13 +157,13 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, if (rc) goto out; - extract_unc_hostname(tcon->ses->server->hostname, &tcp_host, - &tcp_host_len); + extract_unc_hostname(server->hostname, &tcp_host, &tcp_host_len); for (it = dfs_cache_get_tgt_iterator(&tl); it; it = dfs_cache_get_next_tgt(&tl, it)) { const char *share, *prefix; size_t share_len, prefix_len; + bool target_match; rc = dfs_cache_get_tgt_share(it, &share, &share_len, &prefix, &prefix_len); @@ -177,19 +177,38 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, if (dfs_host_len != tcp_host_len || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) { - cifs_dbg(FYI, "%s: skipping %.*s, doesn't match %.*s", + cifs_dbg(FYI, "%s: %.*s doesn't match %.*s", __func__, (int)dfs_host_len, dfs_host, (int)tcp_host_len, tcp_host); - continue; + + rc = match_target_ip(server, dfs_host, dfs_host_len, + &target_match); + if (rc) { + cifs_dbg(VFS, "%s: failed to match target ip: %d\n", + __func__, rc); + break; + } + + if (!target_match) { + cifs_dbg(FYI, "%s: skipping target\n", __func__); + continue; + } } - scnprintf(tree, MAX_TREE_SIZE, "\\%.*s", (int)share_len, share); - - rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); - if (!rc) { - rc = update_super_prepath(tcon, prefix, prefix_len); - break; + if (tcon->ipc) { + scnprintf(tree, MAX_TREE_SIZE, "\\\\%.*s\\IPC$", + (int)share_len, share); + rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); + } else { + scnprintf(tree, MAX_TREE_SIZE, "\\%.*s", (int)share_len, + share); + rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); + if 
(!rc) { + rc = update_super_prepath(tcon, prefix, + prefix_len); + break; + } } if (rc == -EREMOTE) break; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index aa1173f3fb12..ea3189a6ae10 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2496,8 +2496,8 @@ cifs_parse_mount_err: * specified, or if srcaddr is specified and * matches the IP address of the rhs argument. */ -static bool -srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) +bool +cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs) { switch (srcaddr->sa_family) { case AF_UNSPEC: @@ -2588,7 +2588,7 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr, return false; /* don't expect to be here */ } - if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr)) + if (!cifs_match_ipaddr(srcaddr, (struct sockaddr *)&server->srcaddr)) return false; return true; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 550ce9020a3e..1ec6a5543eda 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -32,6 +32,9 @@ #include "cifs_unicode.h" #include "smb2pdu.h" #include "cifsfs.h" +#ifdef CONFIG_CIFS_DFS_UPCALL +#include "dns_resolve.h" +#endif extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; @@ -1083,6 +1086,51 @@ void cifs_put_tcp_super(struct super_block *sb) } #ifdef CONFIG_CIFS_DFS_UPCALL +int match_target_ip(struct TCP_Server_Info *server, + const char *share, size_t share_len, + bool *result) +{ + int rc; + char *target, *tip = NULL; + struct sockaddr tipaddr; + + *result = false; + + target = kzalloc(share_len + 3, GFP_KERNEL); + if (!target) { + rc = -ENOMEM; + goto out; + } + + scnprintf(target, share_len + 3, "\\\\%.*s", (int)share_len, share); + + cifs_dbg(FYI, "%s: target name: %s\n", __func__, target + 2); + + rc = dns_resolve_server_name_to_ip(target, &tip); + if (rc < 0) + goto out; + + cifs_dbg(FYI, "%s: target ip: %s\n", __func__, tip); + + if (!cifs_convert_address(&tipaddr, tip, strlen(tip))) { + cifs_dbg(VFS, "%s: failed to 
convert target ip address\n", + __func__); + rc = -EINVAL; + goto out; + } + + *result = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, + &tipaddr); + cifs_dbg(FYI, "%s: ip addresses match: %u\n", __func__, *result); + rc = 0; + +out: + kfree(target); + kfree(tip); + + return rc; +} + static void tcon_super_cb(struct super_block *sb, void *arg) { struct super_cb_data *sd = arg; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b30aa3cdd845..cabc19f404e6 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -160,6 +160,7 @@ static int __smb2_reconnect(const struct nls_table *nlsc, struct cifs_tcon *tcon) { int rc; + struct TCP_Server_Info *server = tcon->ses->server; struct dfs_cache_tgt_list tl; struct dfs_cache_tgt_iterator *it = NULL; char *tree; @@ -172,15 +173,15 @@ static int __smb2_reconnect(const struct nls_table *nlsc, if (!tree) return -ENOMEM; - if (tcon->ipc) { - scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", - tcon->ses->server->hostname); - rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); - goto out; - } - if (!tcon->dfs_path) { - rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nlsc); + if (tcon->ipc) { + scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", + server->hostname); + rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); + } else { + rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, + nlsc); + } goto out; } @@ -188,13 +189,13 @@ static int __smb2_reconnect(const struct nls_table *nlsc, if (rc) goto out; - extract_unc_hostname(tcon->ses->server->hostname, &tcp_host, - &tcp_host_len); + extract_unc_hostname(server->hostname, &tcp_host, &tcp_host_len); for (it = dfs_cache_get_tgt_iterator(&tl); it; it = dfs_cache_get_next_tgt(&tl, it)) { const char *share, *prefix; size_t share_len, prefix_len; + bool target_match; rc = dfs_cache_get_tgt_share(it, &share, &share_len, &prefix, &prefix_len); @@ -208,19 +209,38 @@ static int __smb2_reconnect(const struct nls_table *nlsc, if (dfs_host_len != tcp_host_len || strncasecmp(dfs_host, 
tcp_host, dfs_host_len) != 0) { - cifs_dbg(FYI, "%s: skipping %.*s, doesn't match %.*s", + cifs_dbg(FYI, "%s: %.*s doesn't match %.*s", __func__, (int)dfs_host_len, dfs_host, (int)tcp_host_len, tcp_host); - continue; + + rc = match_target_ip(server, dfs_host, dfs_host_len, + &target_match); + if (rc) { + cifs_dbg(VFS, "%s: failed to match target ip: %d\n", + __func__, rc); + break; + } + + if (!target_match) { + cifs_dbg(FYI, "%s: skipping target\n", __func__); + continue; + } } - scnprintf(tree, MAX_TREE_SIZE, "\\%.*s", (int)share_len, share); - - rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); - if (!rc) { - rc = update_super_prepath(tcon, prefix, prefix_len); - break; + if (tcon->ipc) { + scnprintf(tree, MAX_TREE_SIZE, "\\\\%.*s\\IPC$", + (int)share_len, share); + rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); + } else { + scnprintf(tree, MAX_TREE_SIZE, "\\%.*s", (int)share_len, + share); + rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); + if (!rc) { + rc = update_super_prepath(tcon, prefix, + prefix_len); + break; + } } if (rc == -EREMOTE) break; From baf3f08ef4083b76ca67b143e135213a7f941879 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 19 May 2020 15:38:29 -0300 Subject: [PATCH 247/427] cifs: get rid of unused parameter in reconn_setup_dfs_targets() The target iterator parameter "it" is not used in reconn_setup_dfs_targets(), so just remove it. 
Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/connect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ea3189a6ae10..329babc6b18a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -426,8 +426,7 @@ static void reconn_inval_dfs_target(struct TCP_Server_Info *server, } static inline int reconn_setup_dfs_targets(struct cifs_sb_info *cifs_sb, - struct dfs_cache_tgt_list *tl, - struct dfs_cache_tgt_iterator **it) + struct dfs_cache_tgt_list *tl) { if (!cifs_sb->origin_fullpath) return -EOPNOTSUPP; @@ -472,7 +471,7 @@ cifs_reconnect(struct TCP_Server_Info *server) } else { cifs_sb = CIFS_SB(sb); - rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it); + rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list); if (rc && (rc != -EOPNOTSUPP)) { cifs_server_dbg(VFS, "%s: no target servers for DFS failover\n", __func__); From a7d5c294628088781da9e91cbb034d61c3a71f71 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 20 May 2020 12:19:59 +1000 Subject: [PATCH 248/427] cifs: reduce stack use in smb2_compound_op Move a lot of structures and arrays off the stack and into a dynamically allocated structure instead. 
Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2inode.c | 90 +++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index a8c301ae00ed..19115a9088ea 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -47,6 +47,18 @@ free_set_inf_compound(struct smb_rqst *rqst) } +struct cop_vars { + struct cifs_open_parms oparms; + struct kvec rsp_iov[3]; + struct smb_rqst rqst[3]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qi_iov[1]; + struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec close_iov[1]; + struct smb2_file_rename_info rename_info; + struct smb2_file_link_info link_info; +}; + static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, @@ -54,35 +66,33 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, __u32 create_options, umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile) { + struct cop_vars *vars = NULL; + struct kvec *rsp_iov; + struct smb_rqst *rqst; int rc; __le16 *utf16_path = NULL; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; - struct cifs_open_parms oparms; struct cifs_fid fid; struct cifs_ses *ses = tcon->ses; int num_rqst = 0; - struct smb_rqst rqst[3]; int resp_buftype[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; - struct kvec qi_iov[1]; - struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; - struct kvec close_iov[1]; struct smb2_query_info_rsp *qi_rsp = NULL; int flags = 0; __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0}; unsigned int size[2]; void *data[2]; - struct smb2_file_rename_info rename_info; - struct smb2_file_link_info link_info; int len; + vars = kzalloc(sizeof(*vars), GFP_ATOMIC); + if (vars == NULL) + return -ENOMEM; + rqst = &vars->rqst[0]; + rsp_iov = &vars->rsp_iov[0]; + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - memset(rqst, 0, sizeof(rqst)); 
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - memset(rsp_iov, 0, sizeof(rsp_iov)); /* We already have a handle so we can skip the open */ if (cfile) @@ -95,19 +105,17 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto finished; } - memset(&oparms, 0, sizeof(struct cifs_open_parms)); - oparms.tcon = tcon; - oparms.desired_access = desired_access; - oparms.disposition = create_disposition; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.fid = &fid; - oparms.reconnect = false; - oparms.mode = mode; + vars->oparms.tcon = tcon; + vars->oparms.desired_access = desired_access; + vars->oparms.disposition = create_disposition; + vars->oparms.create_options = cifs_create_options(cifs_sb, create_options); + vars->oparms.fid = &fid; + vars->oparms.reconnect = false; + vars->oparms.mode = mode; - memset(&open_iov, 0, sizeof(open_iov)); - rqst[num_rqst].rq_iov = open_iov; + rqst[num_rqst].rq_iov = &vars->open_iov[0]; rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE; - rc = SMB2_open_init(tcon, &rqst[num_rqst], &oplock, &oparms, + rc = SMB2_open_init(tcon, &rqst[num_rqst], &oplock, &vars->oparms, utf16_path); kfree(utf16_path); if (rc) @@ -121,8 +129,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, /* Operation */ switch (command) { case SMB2_OP_QUERY_INFO: - memset(&qi_iov, 0, sizeof(qi_iov)); - rqst[num_rqst].rq_iov = qi_iov; + rqst[num_rqst].rq_iov = &vars->qi_iov[0]; rqst[num_rqst].rq_nvec = 1; if (cfile) @@ -164,8 +171,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_RMDIR: - memset(&si_iov, 0, sizeof(si_iov)); - rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_iov = &vars->si_iov[0]; rqst[num_rqst].rq_nvec = 1; size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ @@ -182,8 +188,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, 
trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_SET_EOF: - memset(&si_iov, 0, sizeof(si_iov)); - rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_iov = &vars->si_iov[0]; rqst[num_rqst].rq_nvec = 1; size[0] = 8; /* sizeof __le64 */ @@ -200,8 +205,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_SET_INFO: - memset(&si_iov, 0, sizeof(si_iov)); - rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_iov = &vars->si_iov[0]; rqst[num_rqst].rq_nvec = 1; @@ -233,18 +237,17 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, full_path); break; case SMB2_OP_RENAME: - memset(&si_iov, 0, sizeof(si_iov)); - rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_iov = &vars->si_iov[0]; rqst[num_rqst].rq_nvec = 2; len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX)); - rename_info.ReplaceIfExists = 1; - rename_info.RootDirectory = 0; - rename_info.FileNameLength = cpu_to_le32(len); + vars->rename_info.ReplaceIfExists = 1; + vars->rename_info.RootDirectory = 0; + vars->rename_info.FileNameLength = cpu_to_le32(len); size[0] = sizeof(struct smb2_file_rename_info); - data[0] = &rename_info; + data[0] = &vars->rename_info; size[1] = len + 2 /* null */; data[1] = (__le16 *)ptr; @@ -271,18 +274,17 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_HARDLINK: - memset(&si_iov, 0, sizeof(si_iov)); - rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_iov = &vars->si_iov[0]; rqst[num_rqst].rq_nvec = 2; len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX)); - link_info.ReplaceIfExists = 0; - link_info.RootDirectory = 0; - link_info.FileNameLength = cpu_to_le32(len); + vars->link_info.ReplaceIfExists = 0; + vars->link_info.RootDirectory = 0; + vars->link_info.FileNameLength = cpu_to_le32(len); size[0] = sizeof(struct smb2_file_link_info); - data[0] = 
&link_info; + data[0] = &vars->link_info; size[1] = len + 2 /* null */; data[1] = (__le16 *)ptr; @@ -308,8 +310,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, if (cfile) goto after_close; /* Close */ - memset(&close_iov, 0, sizeof(close_iov)); - rqst[num_rqst].rq_iov = close_iov; + rqst[num_rqst].rq_iov = &vars->close_iov[0]; rqst[num_rqst].rq_nvec = 1; rc = SMB2_close_init(tcon, &rqst[num_rqst], COMPOUND_FID, COMPOUND_FID, false); @@ -420,6 +421,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + kfree(vars); return rc; } From b2ca6c2c9eddc41c09e49e8e83f8208bd80fdb8e Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 21 May 2020 15:03:15 +1000 Subject: [PATCH 249/427] cifs: move some variables off the stack in smb2_ioctl_query_info Move some large data structures off the stack and into dynamically allocated memory in the function smb2_ioctl_query_info Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 58 ++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f829f4165d38..fa5c79f64c0b 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1452,6 +1452,16 @@ req_res_key_exit: return rc; } +struct iqi_vars { + struct smb_rqst rqst[3]; + struct kvec rsp_iov[3]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qi_iov[1]; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec close_iov[1]; +}; + static int smb2_ioctl_query_info(const unsigned int xid, struct cifs_tcon *tcon, @@ -1459,6 +1469,9 @@ smb2_ioctl_query_info(const unsigned int xid, __le16 *path, int is_dir, unsigned long p) { + struct iqi_vars *vars; + struct smb_rqst *rqst; + struct kvec *rsp_iov; struct cifs_ses 
*ses = tcon->ses; char __user *arg = (char __user *)p; struct smb_query_info qi; @@ -1468,45 +1481,47 @@ smb2_ioctl_query_info(const unsigned int xid, struct smb2_query_info_rsp *qi_rsp = NULL; struct smb2_ioctl_rsp *io_rsp = NULL; void *buffer = NULL; - struct smb_rqst rqst[3]; int resp_buftype[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; struct cifs_open_parms oparms; u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_fid fid; - struct kvec qi_iov[1]; - struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; - struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; - struct kvec close_iov[1]; unsigned int size[2]; void *data[2]; int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR; - memset(rqst, 0, sizeof(rqst)); + vars = kzalloc(sizeof(*vars), GFP_ATOMIC); + if (vars == NULL) + return -ENOMEM; + rqst = &vars->rqst[0]; + rsp_iov = &vars->rsp_iov[0]; + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - memset(rsp_iov, 0, sizeof(rsp_iov)); if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) - return -EFAULT; + goto e_fault; - if (qi.output_buffer_length > 1024) + if (qi.output_buffer_length > 1024) { + kfree(vars); return -EINVAL; + } - if (!ses || !(ses->server)) + if (!ses || !(ses->server)) { + kfree(vars); return -EIO; + } if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; buffer = memdup_user(arg + sizeof(struct smb_query_info), qi.output_buffer_length); - if (IS_ERR(buffer)) + if (IS_ERR(buffer)) { + kfree(vars); return PTR_ERR(buffer); + } /* Open */ - memset(&open_iov, 0, sizeof(open_iov)); - rqst[0].rq_iov = open_iov; + rqst[0].rq_iov = &vars->open_iov[0]; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; memset(&oparms, 0, sizeof(oparms)); @@ -1548,8 +1563,7 @@ smb2_ioctl_query_info(const unsigned int xid, if (!capable(CAP_SYS_ADMIN)) rc = -EPERM; else { - memset(&io_iov, 0, sizeof(io_iov)); - rqst[1].rq_iov = io_iov; + rqst[1].rq_iov = &vars->io_iov[0]; rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; rc = 
SMB2_ioctl_init(tcon, &rqst[1], @@ -1565,8 +1579,7 @@ smb2_ioctl_query_info(const unsigned int xid, if (!capable(CAP_SYS_ADMIN)) rc = -EPERM; else { - memset(&si_iov, 0, sizeof(si_iov)); - rqst[1].rq_iov = si_iov; + rqst[1].rq_iov = &vars->si_iov[0]; rqst[1].rq_nvec = 1; size[0] = 8; @@ -1579,8 +1592,7 @@ smb2_ioctl_query_info(const unsigned int xid, SMB2_O_INFO_FILE, 0, data, size); } } else if (qi.flags == PASSTHRU_QUERY_INFO) { - memset(&qi_iov, 0, sizeof(qi_iov)); - rqst[1].rq_iov = qi_iov; + rqst[1].rq_iov = &vars->qi_iov[0]; rqst[1].rq_nvec = 1; rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, @@ -1599,8 +1611,7 @@ smb2_ioctl_query_info(const unsigned int xid, smb2_set_related(&rqst[1]); /* Close */ - memset(&close_iov, 0, sizeof(close_iov)); - rqst[2].rq_iov = close_iov; + rqst[2].rq_iov = &vars->close_iov[0]; rqst[2].rq_nvec = 1; rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); @@ -1649,6 +1660,7 @@ smb2_ioctl_query_info(const unsigned int xid, } iqinf_exit: + kfree(vars); kfree(buffer); SMB2_open_free(&rqst[0]); if (qi.flags & PASSTHRU_FSCTL) From 82e9367c43890cb6a870f700c9180c7eb2035684 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 19 May 2020 03:06:57 -0500 Subject: [PATCH 250/427] smb3: Add new parm "nodelete" In order to handle workloads where it is important to make sure that a buggy app did not delete content on the drive, the new mount option "nodelete" allows standard permission checks on the server to work, but prevents on the client any attempts to unlink a file or delete a directory on that mount point. This can be helpful when running a little understood app on a network mount that contains important content that should not be deleted. 
Signed-off-by: Steve French CC: Stable Reviewed-by: Pavel Shilovsky --- fs/cifs/cifsfs.c | 2 ++ fs/cifs/cifsglob.h | 2 ++ fs/cifs/connect.c | 9 ++++++++- fs/cifs/inode.c | 11 +++++++++++ 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c31f362fa098..889f9c71049b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -534,6 +534,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",signloosely"); if (tcon->nocase) seq_puts(s, ",nocase"); + if (tcon->nodelete) + seq_puts(s, ",nodelete"); if (tcon->local_lease) seq_puts(s, ",locallease"); if (tcon->retry) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 39b708d9d86d..4d261fd78fcb 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -562,6 +562,7 @@ struct smb_vol { bool override_gid:1; bool dynperm:1; bool noperm:1; + bool nodelete:1; bool mode_ace:1; bool no_psx_acl:1; /* set if posix acl support should be disabled */ bool cifs_acl:1; @@ -1136,6 +1137,7 @@ struct cifs_tcon { bool retry:1; bool nocase:1; bool nohandlecache:1; /* if strange server resource prob can turn off */ + bool nodelete:1; bool seal:1; /* transport encryption for this mounted share */ bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol for this mount even if server would support */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 329babc6b18a..57d1cc6bf86f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -75,7 +75,7 @@ enum { Opt_forceuid, Opt_noforceuid, Opt_forcegid, Opt_noforcegid, Opt_noblocksend, Opt_noautotune, Opt_nolease, - Opt_hard, Opt_soft, Opt_perm, Opt_noperm, + Opt_hard, Opt_soft, Opt_perm, Opt_noperm, Opt_nodelete, Opt_mapposix, Opt_nomapposix, Opt_mapchars, Opt_nomapchars, Opt_sfu, Opt_nosfu, Opt_nodfs, Opt_posixpaths, @@ -141,6 +141,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_soft, "soft" }, { Opt_perm, "perm" }, { Opt_noperm, "noperm" }, + { Opt_nodelete, "nodelete" }, { 
Opt_mapchars, "mapchars" }, /* SFU style */ { Opt_nomapchars, "nomapchars" }, { Opt_mapposix, "mapposix" }, /* SFM style */ @@ -1760,6 +1761,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_noperm: vol->noperm = 1; break; + case Opt_nodelete: + vol->nodelete = 1; + break; case Opt_mapchars: vol->sfu_remap = true; vol->remap = false; /* disable SFM mapping */ @@ -3362,6 +3366,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb_vol *volume_info) return 0; if (tcon->no_lease != volume_info->no_lease) return 0; + if (tcon->nodelete != volume_info->nodelete) + return 0; return 1; } @@ -3597,6 +3603,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->retry = volume_info->retry; tcon->nocase = volume_info->nocase; tcon->nohandlecache = volume_info->nohandlecache; + tcon->nodelete = volume_info->nodelete; tcon->local_lease = volume_info->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 5d2965a23730..873b1effd412 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1418,6 +1418,11 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) xid = get_xid(); + if (tcon->nodelete) { + rc = -EACCES; + goto unlink_out; + } + /* Unlink can be called from rename so we can not take the * sb->s_vfs_rename_mutex here */ full_path = build_path_from_dentry(dentry); @@ -1746,6 +1751,12 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) goto rmdir_exit; } + if (tcon->nodelete) { + rc = -EACCES; + cifs_put_tlink(tlink); + goto rmdir_exit; + } + rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb); cifs_put_tlink(tlink); From a0a3036b81f1f66fa3333559ecfe18f5bbfa5076 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 14 Apr 2020 22:42:53 -0700 Subject: [PATCH 251/427] cifs: Standardize logging output Use pr_fmt to standardize all logging for fs/cifs. Some logging output had no CIFS: specific prefix. 
Now all output has one of three prefixes: o CIFS: o CIFS: VFS: o Root-CIFS: Miscellanea: o Convert printks to pr_<level> o Neaten macro definitions o Remove embedded CIFS: prefixes from formats o Convert "illegal" to "invalid" o Coalesce formats o Add missing '\n' format terminations o Consolidate multiple cifs_dbg continuations into single calls o More consistent use of upper case first word output logging o Multiline statement argument alignment and wrapping Signed-off-by: Joe Perches Signed-off-by: Steve French --- fs/cifs/cifs_debug.h | 145 ++++++++++++++++++------------------- fs/cifs/cifsencrypt.c | 8 +- fs/cifs/cifsproto.h | 26 +++---- fs/cifs/cifsroot.c | 6 +- fs/cifs/cifssmb.c | 24 +++--- fs/cifs/connect.c | 77 ++++++++------------ fs/cifs/dfs_cache.c | 14 ++-- fs/cifs/file.c | 24 +++--- fs/cifs/inode.c | 4 +- fs/cifs/misc.c | 12 +-- fs/cifs/netmisc.c | 6 +- fs/cifs/readdir.c | 10 +-- fs/cifs/sess.c | 28 +++---- fs/cifs/smb1ops.c | 2 +- fs/cifs/smb2inode.c | 3 +- fs/cifs/smb2misc.c | 20 ++--- fs/cifs/smb2ops.c | 31 ++++---- fs/cifs/smb2pdu.c | 70 +++++++++--------- fs/cifs/smbdirect.c | 165 +++++++++++++++++------------------------- fs/cifs/transport.c | 25 +++---- 20 files changed, 319 insertions(+), 381 deletions(-) diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 100b0056a369..5e66dab712d0 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -8,6 +8,12 @@ #ifndef _H_CIFS_DEBUG #define _H_CIFS_DEBUG +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "CIFS: " fmt + void cifs_dump_mem(char *label, void *data, int length); void cifs_dump_detail(void *buf, struct TCP_Server_Info *ptcp_info); void cifs_dump_mids(struct TCP_Server_Info *); @@ -46,92 +52,81 @@ extern int cifsFYI; */ /* Information level messages, minor events */ -#define cifs_info_func(ratefunc, fmt, ...) \ -do { \ - pr_info_ ## ratefunc("CIFS: " fmt, ##__VA_ARGS__); \ -} while (0) +#define cifs_info_func(ratefunc, fmt, ...) 
\ + pr_info_ ## ratefunc(fmt, ##__VA_ARGS__) -#define cifs_info(fmt, ...) \ -do { \ - cifs_info_func(ratelimited, fmt, ##__VA_ARGS__); \ -} while (0) +#define cifs_info(fmt, ...) \ + cifs_info_func(ratelimited, fmt, ##__VA_ARGS__) /* information message: e.g., configuration, major event */ -#define cifs_dbg_func(ratefunc, type, fmt, ...) \ -do { \ - if ((type) & FYI && cifsFYI & CIFS_INFO) { \ - pr_debug_ ## ratefunc("%s: " \ - fmt, __FILE__, ##__VA_ARGS__); \ - } else if ((type) & VFS) { \ - pr_err_ ## ratefunc("CIFS VFS: " \ - fmt, ##__VA_ARGS__); \ - } else if ((type) & NOISY && (NOISY != 0)) { \ - pr_debug_ ## ratefunc(fmt, ##__VA_ARGS__); \ - } \ +#define cifs_dbg_func(ratefunc, type, fmt, ...) \ +do { \ + if ((type) & FYI && cifsFYI & CIFS_INFO) { \ + pr_debug_ ## ratefunc("%s: " fmt, \ + __FILE__, ##__VA_ARGS__); \ + } else if ((type) & VFS) { \ + pr_err_ ## ratefunc("VFS: " fmt, ##__VA_ARGS__); \ + } else if ((type) & NOISY && (NOISY != 0)) { \ + pr_debug_ ## ratefunc(fmt, ##__VA_ARGS__); \ + } \ } while (0) -#define cifs_dbg(type, fmt, ...) \ -do { \ - if ((type) & ONCE) \ - cifs_dbg_func(once, \ - type, fmt, ##__VA_ARGS__); \ - else \ - cifs_dbg_func(ratelimited, \ - type, fmt, ##__VA_ARGS__); \ +#define cifs_dbg(type, fmt, ...) \ +do { \ + if ((type) & ONCE) \ + cifs_dbg_func(once, type, fmt, ##__VA_ARGS__); \ + else \ + cifs_dbg_func(ratelimited, type, fmt, ##__VA_ARGS__); \ } while (0) -#define cifs_server_dbg_func(ratefunc, type, fmt, ...) \ -do { \ - const char *sn = ""; \ - if (server && server->hostname) \ - sn = server->hostname; \ - if ((type) & FYI && cifsFYI & CIFS_INFO) { \ - pr_debug_ ## ratefunc("%s: \\\\%s " fmt, \ - __FILE__, sn, ##__VA_ARGS__); \ - } else if ((type) & VFS) { \ - pr_err_ ## ratefunc("CIFS VFS: \\\\%s " fmt, \ - sn, ##__VA_ARGS__); \ - } else if ((type) & NOISY && (NOISY != 0)) { \ - pr_debug_ ## ratefunc("\\\\%s " fmt, \ - sn, ##__VA_ARGS__); \ - } \ +#define cifs_server_dbg_func(ratefunc, type, fmt, ...) 
\ +do { \ + const char *sn = ""; \ + if (server && server->hostname) \ + sn = server->hostname; \ + if ((type) & FYI && cifsFYI & CIFS_INFO) { \ + pr_debug_ ## ratefunc("%s: \\\\%s " fmt, \ + __FILE__, sn, ##__VA_ARGS__); \ + } else if ((type) & VFS) { \ + pr_err_ ## ratefunc("VFS: \\\\%s " fmt, \ + sn, ##__VA_ARGS__); \ + } else if ((type) & NOISY && (NOISY != 0)) { \ + pr_debug_ ## ratefunc("\\\\%s " fmt, \ + sn, ##__VA_ARGS__); \ + } \ } while (0) -#define cifs_server_dbg(type, fmt, ...) \ -do { \ - if ((type) & ONCE) \ - cifs_server_dbg_func(once, \ - type, fmt, ##__VA_ARGS__); \ - else \ - cifs_server_dbg_func(ratelimited, \ - type, fmt, ##__VA_ARGS__); \ +#define cifs_server_dbg(type, fmt, ...) \ +do { \ + if ((type) & ONCE) \ + cifs_server_dbg_func(once, type, fmt, ##__VA_ARGS__); \ + else \ + cifs_server_dbg_func(ratelimited, type, fmt, \ + ##__VA_ARGS__); \ } while (0) -#define cifs_tcon_dbg_func(ratefunc, type, fmt, ...) \ -do { \ - const char *tn = ""; \ - if (tcon && tcon->treeName) \ - tn = tcon->treeName; \ - if ((type) & FYI && cifsFYI & CIFS_INFO) { \ - pr_debug_ ## ratefunc("%s: %s " fmt, \ - __FILE__, tn, ##__VA_ARGS__); \ - } else if ((type) & VFS) { \ - pr_err_ ## ratefunc("CIFS VFS: %s " fmt, \ - tn, ##__VA_ARGS__); \ - } else if ((type) & NOISY && (NOISY != 0)) { \ - pr_debug_ ## ratefunc("%s " fmt, \ - tn, ##__VA_ARGS__); \ - } \ +#define cifs_tcon_dbg_func(ratefunc, type, fmt, ...) \ +do { \ + const char *tn = ""; \ + if (tcon && tcon->treeName) \ + tn = tcon->treeName; \ + if ((type) & FYI && cifsFYI & CIFS_INFO) { \ + pr_debug_ ## ratefunc("%s: %s " fmt, \ + __FILE__, tn, ##__VA_ARGS__); \ + } else if ((type) & VFS) { \ + pr_err_ ## ratefunc("VFS: %s " fmt, tn, ##__VA_ARGS__); \ + } else if ((type) & NOISY && (NOISY != 0)) { \ + pr_debug_ ## ratefunc("%s " fmt, tn, ##__VA_ARGS__); \ + } \ } while (0) -#define cifs_tcon_dbg(type, fmt, ...) 
\ -do { \ - if ((type) & ONCE) \ - cifs_tcon_dbg_func(once, \ - type, fmt, ##__VA_ARGS__); \ - else \ - cifs_tcon_dbg_func(ratelimited, \ - type, fmt, ##__VA_ARGS__); \ +#define cifs_tcon_dbg(type, fmt, ...) \ +do { \ + if ((type) & ONCE) \ + cifs_tcon_dbg_func(once, type, fmt, ##__VA_ARGS__); \ + else \ + cifs_tcon_dbg_func(ratelimited, type, fmt, \ + ##__VA_ARGS__); \ } while (0) /* @@ -159,9 +154,7 @@ do { \ } while (0) #define cifs_info(fmt, ...) \ -do { \ - pr_info("CIFS: "fmt, ##__VA_ARGS__); \ -} while (0) + pr_info(fmt, ##__VA_ARGS__) #endif #endif /* _H_CIFS_DEBUG */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 97b7497c13ef..874a551f339c 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -520,7 +520,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { - cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__); + cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); return rc; } @@ -624,7 +624,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { - cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__); + cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); return rc; } @@ -723,7 +723,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) /* calculate ntlmv2_hash */ rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); if (rc) { - cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc); + cifs_dbg(VFS, "Could not get v2 hash rc %d\n", rc); goto unlock; } @@ -783,7 +783,7 @@ calc_seckey(struct cifs_ses *ses) ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL); if (!ctx_arc4) { - cifs_dbg(VFS, "could not allocate arc4 context\n"); + cifs_dbg(VFS, "Could not allocate arc4 context\n"); return -ENOMEM; } diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 311d8e86c5a8..8036216ce434 100644 --- a/fs/cifs/cifsproto.h 
+++ b/fs/cifs/cifsproto.h @@ -45,25 +45,25 @@ extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, unsigned int /* length */); extern unsigned int _get_xid(void); extern void _free_xid(unsigned int); -#define get_xid() \ -({ \ +#define get_xid() \ +({ \ unsigned int __xid = _get_xid(); \ - cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n", \ + cifs_dbg(FYI, "VFS: in %s as Xid: %u with uid: %d\n", \ __func__, __xid, \ from_kuid(&init_user_ns, current_fsuid())); \ - trace_smb3_enter(__xid, __func__); \ - __xid; \ + trace_smb3_enter(__xid, __func__); \ + __xid; \ }) -#define free_xid(curr_xid) \ -do { \ - _free_xid(curr_xid); \ - cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n", \ - __func__, curr_xid, (int)rc); \ - if (rc) \ +#define free_xid(curr_xid) \ +do { \ + _free_xid(curr_xid); \ + cifs_dbg(FYI, "VFS: leaving %s (xid = %u) rc = %d\n", \ + __func__, curr_xid, (int)rc); \ + if (rc) \ trace_smb3_exit_err(curr_xid, __func__, (int)rc); \ - else \ - trace_smb3_exit_done(curr_xid, __func__); \ + else \ + trace_smb3_exit_done(curr_xid, __func__); \ } while (0) extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); diff --git a/fs/cifs/cifsroot.c b/fs/cifs/cifsroot.c index 37edbfb8e096..9e91a5a40aae 100644 --- a/fs/cifs/cifsroot.c +++ b/fs/cifs/cifsroot.c @@ -56,7 +56,7 @@ static int __init cifs_root_setup(char *line) /* len is strlen(unc) + '\0' */ len = s - line + 1; if (len > sizeof(root_dev)) { - printk(KERN_ERR "Root-CIFS: UNC path too long\n"); + pr_err("Root-CIFS: UNC path too long\n"); return 1; } strlcpy(root_dev, line, len); @@ -66,7 +66,7 @@ static int __init cifs_root_setup(char *line) sizeof(root_opts), "%s,%s", DEFAULT_MNT_OPTS, s + 1); if (n >= sizeof(root_opts)) { - printk(KERN_ERR "Root-CIFS: mount options string too long\n"); + pr_err("Root-CIFS: mount options string too long\n"); root_opts[sizeof(root_opts)-1] = '\0'; return 1; } @@ -83,7 +83,7 @@ __setup("cifsroot=", cifs_root_setup); int __init 
cifs_root_data(char **dev, char **opts) { if (!root_dev[0] || root_server_addr == htonl(INADDR_NONE)) { - printk(KERN_ERR "Root-CIFS: no SMB server address\n"); + pr_err("Root-CIFS: no SMB server address\n"); return -1; } diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 5a684f9ac883..5820f9569b7f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -281,8 +281,8 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) (server->tcpStatus != CifsNeedReconnect), 10 * HZ); if (rc < 0) { - cifs_dbg(FYI, "%s: aborting reconnect due to a received" - " signal by the process\n", __func__); + cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n", + __func__); return -ERESTARTSYS; } @@ -343,7 +343,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) { - printk_once(KERN_WARNING "reconnect tcon failed rc = %d\n", rc); + pr_warn_once("reconnect tcon failed rc = %d\n", rc); goto out; } @@ -576,7 +576,7 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) /* If server requires signing, does client allow it? */ if (srv_sign_required) { if (!mnt_sign_enabled) { - cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!"); + cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!\n"); return -ENOTSUPP; } server->sign = true; @@ -585,14 +585,14 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) /* If client requires signing, does server allow it? 
*/ if (mnt_sign_required) { if (!srv_sign_enabled) { - cifs_dbg(VFS, "Server does not support signing!"); + cifs_dbg(VFS, "Server does not support signing!\n"); return -ENOTSUPP; } server->sign = true; } if (cifs_rdma_enabled(server) && server->sign) - cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled"); + cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled\n"); return 0; } @@ -722,7 +722,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); if (should_set_ext_sec_flag(ses->sectype)) { - cifs_dbg(FYI, "Requesting extended security."); + cifs_dbg(FYI, "Requesting extended security\n"); pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; } @@ -3887,7 +3887,7 @@ GetExtAttrRetry: struct file_chattr_info *pfinfo; /* BB Do we need a cast or hash here ? */ if (count != 16) { - cifs_dbg(FYI, "Illegal size ret in GetExtAttr\n"); + cifs_dbg(FYI, "Invalid size ret in GetExtAttr\n"); rc = -EIO; goto GetExtAttrOut; } @@ -4263,7 +4263,7 @@ QFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QFileInfo = %d", rc); + cifs_dbg(FYI, "Send error in QFileInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4430,7 +4430,7 @@ UnixQFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in UnixQFileInfo = %d", rc); + cifs_dbg(FYI, "Send error in UnixQFileInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4512,7 +4512,7 @@ UnixQPathInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in UnixQPathInfo = %d", rc); + cifs_dbg(FYI, "Send error in UnixQPathInfo = %d\n", rc); } else { /* decode 
response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4932,7 +4932,7 @@ GetInodeNumberRetry: struct file_internal_info *pfinfo; /* BB Do we need a cast or hash here ? */ if (count < 8) { - cifs_dbg(FYI, "Illegal size ret in QryIntrnlInf\n"); + cifs_dbg(FYI, "Invalid size ret in QryIntrnlInf\n"); rc = -EIO; goto GetInodeNumOut; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 57d1cc6bf86f..daf90f988de1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -879,8 +879,7 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) * function has finished processing it is a bug. */ if (mid->mid_flags & MID_DELETED) - printk_once(KERN_WARNING - "trying to dequeue a deleted mid\n"); + pr_warn_once("trying to dequeue a deleted mid\n"); else { list_del_init(&mid->qhead); mid->mid_flags |= MID_DELETED; @@ -1229,9 +1228,8 @@ next_pdu: smb2_add_credits_from_hdr(bufs[i], server); cifs_dbg(FYI, "Received oplock break\n"); } else { - cifs_server_dbg(VFS, "No task to wake, unknown frame " - "received! NumMids %d\n", - atomic_read(&midCount)); + cifs_server_dbg(VFS, "No task to wake, unknown frame received! 
NumMids %d\n", + atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", bufs[i], HEADER_SIZE(server)); smb2_add_credits_from_hdr(bufs[i], server); @@ -1476,9 +1474,7 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3) cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); return 1; } - cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 " - "is not recommended unless required for " - "access to very old servers\n"); + cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); vol->ops = &smb1_operations; vol->vals = &smb1_values; break; @@ -1545,7 +1541,7 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol) size_t len; if (unlikely(!devname || !*devname)) { - cifs_dbg(VFS, "Device name not specified.\n"); + cifs_dbg(VFS, "Device name not specified\n"); return -EINVAL; } @@ -1695,13 +1691,13 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case 0: break; case -ENOMEM: - cifs_dbg(VFS, "Unable to allocate memory for devname.\n"); + cifs_dbg(VFS, "Unable to allocate memory for devname\n"); goto cifs_parse_mount_err; case -EINVAL: - cifs_dbg(VFS, "Malformed UNC in devname.\n"); + cifs_dbg(VFS, "Malformed UNC in devname\n"); goto cifs_parse_mount_err; default: - cifs_dbg(VFS, "Unknown error parsing devname.\n"); + cifs_dbg(VFS, "Unknown error parsing devname\n"); goto cifs_parse_mount_err; } @@ -1912,7 +1908,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->seal = 1; break; case Opt_noac: - pr_warn("CIFS: Mount option noac not supported. Instead set /proc/fs/cifs/LookupCacheEnabled to 0\n"); + pr_warn("Mount option noac not supported. 
Instead set /proc/fs/cifs/LookupCacheEnabled to 0\n"); break; case Opt_fsc: #ifndef CONFIG_CIFS_FSCACHE @@ -2159,7 +2155,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (strnlen(string, CIFS_MAX_USERNAME_LEN) > CIFS_MAX_USERNAME_LEN) { - pr_warn("CIFS: username too long\n"); + pr_warn("username too long\n"); goto cifs_parse_mount_err; } @@ -2225,7 +2221,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, temp_len = strlen(value); vol->password = kzalloc(temp_len+1, GFP_KERNEL); if (vol->password == NULL) { - pr_warn("CIFS: no memory for password\n"); + pr_warn("no memory for password\n"); goto cifs_parse_mount_err; } @@ -2249,7 +2245,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (!cifs_convert_address(dstaddr, string, strlen(string))) { - pr_err("CIFS: bad ip= option (%s).\n", string); + pr_err("bad ip= option (%s)\n", string); goto cifs_parse_mount_err; } got_ip = true; @@ -2261,14 +2257,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN) == CIFS_MAX_DOMAINNAME_LEN) { - pr_warn("CIFS: domain name too long\n"); + pr_warn("domain name too long\n"); goto cifs_parse_mount_err; } kfree(vol->domainname); vol->domainname = kstrdup(string, GFP_KERNEL); if (!vol->domainname) { - pr_warn("CIFS: no memory for domainname\n"); + pr_warn("no memory for domainname\n"); goto cifs_parse_mount_err; } cifs_dbg(FYI, "Domain name set\n"); @@ -2281,7 +2277,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (!cifs_convert_address( (struct sockaddr *)&vol->srcaddr, string, strlen(string))) { - pr_warn("CIFS: Could not parse srcaddr: %s\n", + pr_warn("Could not parse srcaddr: %s\n", string); goto cifs_parse_mount_err; } @@ -2292,7 +2288,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto out_nomem; if (strnlen(string, 1024) >= 65) { - pr_warn("CIFS: iocharset name too long.\n"); + 
pr_warn("iocharset name too long\n"); goto cifs_parse_mount_err; } @@ -2301,7 +2297,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->iocharset = kstrdup(string, GFP_KERNEL); if (!vol->iocharset) { - pr_warn("CIFS: no memory for charset\n"); + pr_warn("no memory for charset\n"); goto cifs_parse_mount_err; } } @@ -2332,7 +2328,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, * set at top of the function */ if (i == RFC1001_NAME_LEN && string[i] != 0) - pr_warn("CIFS: netbiosname longer than 15 truncated.\n"); + pr_warn("netbiosname longer than 15 truncated\n"); break; case Opt_servern: /* servernetbiosname specified override *SMBSERVER */ @@ -2358,7 +2354,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* The string has 16th byte zero still from set at top of the function */ if (i == RFC1001_NAME_LEN && string[i] != 0) - pr_warn("CIFS: server netbiosname longer than 15 truncated.\n"); + pr_warn("server netbiosname longer than 15 truncated\n"); break; case Opt_ver: /* version of mount userspace tools, not dialect */ @@ -2369,17 +2365,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* If interface changes in mount.cifs bump to new ver */ if (strncasecmp(string, "1", 1) == 0) { if (strlen(string) > 1) { - pr_warn("Bad mount helper ver=%s. Did " - "you want SMB1 (CIFS) dialect " - "and mean to type vers=1.0 " - "instead?\n", string); + pr_warn("Bad mount helper ver=%s. 
Did you want SMB1 (CIFS) dialect and mean to type vers=1.0 instead?\n", + string); goto cifs_parse_mount_err; } /* This is the default */ break; } /* For all other value, error */ - pr_warn("CIFS: Invalid mount helper version specified\n"); + pr_warn("Invalid mount helper version specified\n"); goto cifs_parse_mount_err; case Opt_vers: /* protocol version (dialect) */ @@ -2422,7 +2416,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } if (!sloppy && invalid) { - pr_err("CIFS: Unknown mount option \"%s\"\n", invalid); + pr_err("Unknown mount option \"%s\"\n", invalid); goto cifs_parse_mount_err; } @@ -2458,7 +2452,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, slash = strchr(&vol->UNC[2], '\\'); len = slash - &vol->UNC[2]; if (!cifs_convert_address(dstaddr, &vol->UNC[2], len)) { - pr_err("Unable to determine destination address.\n"); + pr_err("Unable to determine destination address\n"); goto cifs_parse_mount_err; } } @@ -2469,20 +2463,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (uid_specified) vol->override_uid = override_uid; else if (override_uid == 1) - pr_notice("CIFS: ignoring forceuid mount option specified with no uid= option.\n"); + pr_notice("ignoring forceuid mount option specified with no uid= option\n"); if (gid_specified) vol->override_gid = override_gid; else if (override_gid == 1) - pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n"); + pr_notice("ignoring forcegid mount option specified with no gid= option\n"); if (got_version == false) - pr_warn_once("No dialect specified on mount. Default has changed" - " to a more secure dialect, SMB2.1 or later (e.g. " - "SMB3.1.1), from CIFS (SMB1). To use the less secure " - "SMB1 dialect to access old servers which do not " - "support SMB3.1.1 (or even SMB3 or SMB2.1) specify " - "vers=1.0 on mount.\n"); + pr_warn_once("No dialect specified on mount. 
Default has changed to a more secure dialect, SMB2.1 or later (e.g. SMB3.1.1), from CIFS (SMB1). To use the less secure SMB1 dialect to access old servers which do not support SMB3.1.1 (or even SMB3 or SMB2.1) specify vers=1.0 on mount.\n"); kfree(mountdata_copy); return 0; @@ -3200,8 +3189,8 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) strlen(ses->domainName), GFP_KERNEL); if (!vol->domainname) { - cifs_dbg(FYI, "Unable to allocate %zd bytes for " - "domain\n", len); + cifs_dbg(FYI, "Unable to allocate %zd bytes for domain\n", + len); rc = -ENOMEM; kfree(vol->username); vol->username = NULL; @@ -3524,10 +3513,9 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) if (volume_info->linux_ext) { if (ses->server->posix_ext_supported) { tcon->posix_extensions = true; - printk_once(KERN_WARNING - "SMB3.11 POSIX Extensions are experimental\n"); + pr_warn_once("SMB3.11 POSIX Extensions are experimental\n"); } else { - cifs_dbg(VFS, "Server does not support mounting with posix SMB3.11 extensions.\n"); + cifs_dbg(VFS, "Server does not support mounting with posix SMB3.11 extensions\n"); rc = -EOPNOTSUPP; goto out_fail; } @@ -4757,8 +4745,7 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol, rc = cifs_are_all_path_components_accessible(server, xid, tcon, cifs_sb, full_path, tcon->Flags & SMB_SHARE_IS_IN_DFS); if (rc != 0) { - cifs_server_dbg(VFS, "cannot query dirs between root and final path, " - "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); + cifs_server_dbg(VFS, "cannot query dirs between root and final path, enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; rc = 0; } diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index a67f88bf7ae1..df81c718d2fa 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -198,7 +198,7 @@ static ssize_t dfscache_proc_write(struct file *file, const char __user *buffer, if (c != '0') return -EINVAL; - cifs_dbg(FYI, "clearing dfs 
cache"); + cifs_dbg(FYI, "clearing dfs cache\n"); down_write(&htable_rw_lock); flush_cache_ents(); @@ -234,8 +234,8 @@ static inline void dump_tgts(const struct cache_entry *ce) static inline void dump_ce(const struct cache_entry *ce) { - cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," - "interlink=%s,path_consumed=%d,expired=%s\n", ce->path, + cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,interlink=%s,path_consumed=%d,expired=%s\n", + ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, IS_INTERLINK_SET(ce->flags) ? "yes" : "no", @@ -453,11 +453,11 @@ static void remove_oldest_entry(void) } if (!to_del) { - cifs_dbg(FYI, "%s: no entry to remove", __func__); + cifs_dbg(FYI, "%s: no entry to remove\n", __func__); return; } - cifs_dbg(FYI, "%s: removing entry", __func__); + cifs_dbg(FYI, "%s: removing entry\n", __func__); dump_ce(to_del); flush_cache_ent(to_del); } @@ -696,8 +696,8 @@ static int __dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, } if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES) { - cifs_dbg(FYI, "%s: reached max cache size (%d)", __func__, - CACHE_MAX_ENTRIES); + cifs_dbg(FYI, "%s: reached max cache size (%d)\n", + __func__, CACHE_MAX_ENTRIES); down_write(&htable_rw_lock); remove_oldest_entry(); up_write(&htable_rw_lock); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 75ddce8ef456..226bfa5e9444 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -857,7 +857,7 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon) tcon->need_reopen_files = false; - cifs_dbg(FYI, "Reopen persistent handles"); + cifs_dbg(FYI, "Reopen persistent handles\n"); INIT_LIST_HEAD(&tmp_list); /* list all files open on tree connection, reopen resilient handles */ @@ -2056,7 +2056,7 @@ find_writable_file(struct cifsInodeInfo *cifs_inode, int flags) rc = cifs_get_writable_file(cifs_inode, flags, &cfile); if (rc) - cifs_dbg(FYI, "couldn't find writable handle rc=%d", rc); + cifs_dbg(FYI, "Couldn't 
find writable handle rc=%d\n", rc); return cfile; } @@ -2923,11 +2923,9 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, from, &pagevec, cur_len, &start); if (result < 0) { cifs_dbg(VFS, - "direct_writev couldn't get user pages " - "(rc=%zd) iter type %d iov_offset %zd " - "count %zd\n", - result, iov_iter_type(from), - from->iov_offset, from->count); + "direct_writev couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", + result, iov_iter_type(from), + from->iov_offset, from->count); dump_stack(); rc = result; @@ -3654,12 +3652,10 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, cur_len, &start); if (result < 0) { cifs_dbg(VFS, - "couldn't get user pages (rc=%zd)" - " iter type %d" - " iov_offset %zd count %zd\n", - result, iov_iter_type(&direct_iov), - direct_iov.iov_offset, - direct_iov.count); + "Couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", + result, iov_iter_type(&direct_iov), + direct_iov.iov_offset, + direct_iov.count); dump_stack(); rc = result; @@ -4828,7 +4824,7 @@ static int cifs_swap_activate(struct swap_info_struct *sis, } *span = sis->pages; - printk_once(KERN_WARNING "Swap support over SMB3 is experimental\n"); + pr_warn_once("Swap support over SMB3 is experimental\n"); /* * TODO: consider adding ACL (or documenting how) to prevent other diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 873b1effd412..b94c6398da94 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1155,7 +1155,7 @@ struct inode *cifs_root_iget(struct super_block *sb) /* some servers mistakenly claim POSIX support */ if (rc != -EOPNOTSUPP) goto iget_no_retry; - cifs_dbg(VFS, "server does not support POSIX extensions"); + cifs_dbg(VFS, "server does not support POSIX extensions\n"); tcon->unix_ext = false; } @@ -2010,7 +2010,7 @@ cifs_invalidate_mapping(struct inode *inode) if (inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = 
invalidate_inode_pages2(inode->i_mapping); if (rc) - cifs_dbg(VFS, "%s: could not invalidate inode %p\n", + cifs_dbg(VFS, "%s: Could not invalidate inode %p\n", __func__, inode); } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1ec6a5543eda..56791a692c8b 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -424,7 +424,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) if (data_offset > len - sizeof(struct file_notify_information)) { - cifs_dbg(FYI, "invalid data_offset %u\n", + cifs_dbg(FYI, "Invalid data_offset %u\n", data_offset); return true; } @@ -452,7 +452,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) large dirty files cached on the client */ if ((NT_STATUS_INVALID_HANDLE) == le32_to_cpu(pSMB->hdr.Status.CifsError)) { - cifs_dbg(FYI, "invalid handle on oplock break\n"); + cifs_dbg(FYI, "Invalid handle on oplock break\n"); return true; } else if (ERRbadfid == le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { @@ -533,9 +533,9 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; cifs_sb->mnt_cifs_serverino_autodisabled = true; - cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s.\n", + cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s\n", tcon ? tcon->treeName : "new server"); - cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS).\n"); + cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS)\n"); cifs_dbg(VFS, "Hardlinks will not be recognized on this mount. 
Consider mounting with the \"noserverino\" option to silence this message.\n"); } @@ -877,7 +877,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) while (count && npages < max_pages) { rc = iov_iter_get_pages(iter, pages, count, max_pages, &start); if (rc < 0) { - cifs_dbg(VFS, "couldn't get user pages (rc=%zd)\n", rc); + cifs_dbg(VFS, "Couldn't get user pages (rc=%zd)\n", rc); break; } @@ -936,7 +936,7 @@ cifs_alloc_hash(const char *name, *shash = crypto_alloc_shash(name, 0, 0); if (IS_ERR(*shash)) { - cifs_dbg(VFS, "could not allocate crypto %s\n", name); + cifs_dbg(VFS, "Could not allocate crypto %s\n", name); rc = PTR_ERR(*shash); *shash = NULL; *sdesc = NULL; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 9b41436fb8db..b7ca4960d4ca 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -957,15 +957,15 @@ struct timespec64 cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) sec = 2 * st->TwoSeconds; min = st->Minutes; if ((sec > 59) || (min > 59)) - cifs_dbg(VFS, "illegal time min %d sec %lld\n", min, sec); + cifs_dbg(VFS, "Invalid time min %d sec %lld\n", min, sec); sec += (min * 60); sec += 60 * 60 * st->Hours; if (st->Hours > 24) - cifs_dbg(VFS, "illegal hours %d\n", st->Hours); + cifs_dbg(VFS, "Invalid hours %d\n", st->Hours); day = sd->Day; month = sd->Month; if (day < 1 || day > 31 || month < 1 || month > 12) { - cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, day); + cifs_dbg(VFS, "Invalid date, month %d day: %d\n", month, day); day = clamp(day, 1, 31); month = clamp(month, 1, 12); } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 50f776a8d4ba..6df0922e7e30 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -53,7 +53,7 @@ static void dump_cifs_file_struct(struct file *file, char *label) return; } if (cf->invalidHandle) - cifs_dbg(FYI, "invalid handle\n"); + cifs_dbg(FYI, "Invalid handle\n"); if (cf->srch_inf.endOfSearch) cifs_dbg(FYI, "end of search\n"); if 
(cf->srch_inf.emptyDir) @@ -246,7 +246,7 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info, */ fattr->cf_mode = le32_to_cpu(info->Mode) & ~S_IFMT; - cifs_dbg(FYI, "posix fattr: dev %d, reparse %d, mode %o", + cifs_dbg(FYI, "posix fattr: dev %d, reparse %d, mode %o\n", le32_to_cpu(info->DeviceId), le32_to_cpu(info->ReparseTag), le32_to_cpu(info->Mode)); @@ -478,7 +478,7 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) u32 next_offset = le32_to_cpu(pDirInfo->NextEntryOffset); if (old_entry + next_offset < old_entry) { - cifs_dbg(VFS, "invalid offset %u\n", next_offset); + cifs_dbg(VFS, "Invalid offset %u\n", next_offset); return NULL; } new_entry = old_entry + next_offset; @@ -515,7 +515,7 @@ static void cifs_fill_dirent_posix(struct cifs_dirent *de, /* payload should have already been checked at this point */ if (posix_info_parse(info, NULL, &parsed) < 0) { - cifs_dbg(VFS, "invalid POSIX info payload"); + cifs_dbg(VFS, "Invalid POSIX info payload\n"); return; } @@ -968,7 +968,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) } else if (current_entry != NULL) { cifs_dbg(FYI, "entry %lld found\n", ctx->pos); } else { - cifs_dbg(FYI, "could not find entry\n"); + cifs_dbg(FYI, "Could not find entry\n"); goto rddir2_exit; } cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n", diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 43a88e26d26b..3f8b43e77539 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -162,12 +162,14 @@ cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) int rc; unsigned int xid = get_xid(); - cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ", - ses, iface->speed, iface->rdma_capable ? "yes" : "no"); if (iface->sockaddr.ss_family == AF_INET) - cifs_dbg(FYI, "ip:%pI4)\n", &ipv4->sin_addr); + cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI4)\n", + ses, iface->speed, iface->rdma_capable ? 
"yes" : "no", + &ipv4->sin_addr); else - cifs_dbg(FYI, "ip:%pI6)\n", &ipv6->sin6_addr); + cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI6)\n", + ses, iface->speed, iface->rdma_capable ? "yes" : "no", + &ipv6->sin6_addr); /* * Setup a smb_vol with mostly the same info as the existing @@ -569,15 +571,15 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset); tilen = le16_to_cpu(pblob->TargetInfoArray.Length); if (tioffset > blob_len || tioffset + tilen > blob_len) { - cifs_dbg(VFS, "tioffset + tilen too high %u + %u", - tioffset, tilen); + cifs_dbg(VFS, "tioffset + tilen too high %u + %u\n", + tioffset, tilen); return -EINVAL; } if (tilen) { ses->auth_key.response = kmemdup(bcc_ptr + tioffset, tilen, GFP_KERNEL); if (!ses->auth_key.response) { - cifs_dbg(VFS, "Challenge target info alloc failure"); + cifs_dbg(VFS, "Challenge target info alloc failure\n"); return -ENOMEM; } ses->auth_key.len = tilen; @@ -1303,9 +1305,8 @@ sess_auth_kerberos(struct sess_data *sess_data) * sending us a response in an expected form */ if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cifs_dbg(VFS, - "incorrect version of cifs.upcall (expected %d but got %d)", - CIFS_SPNEGO_UPCALL_VERSION, msg->version); + cifs_dbg(VFS, "incorrect version of cifs.upcall (expected %d but got %d)\n", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); rc = -EKEYREJECTED; goto out_put_spnego_key; } @@ -1313,8 +1314,8 @@ sess_auth_kerberos(struct sess_data *sess_data) ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, GFP_KERNEL); if (!ses->auth_key.response) { - cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); + cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory\n", + msg->sesskey_len); rc = -ENOMEM; goto out_put_spnego_key; } @@ -1657,8 +1658,7 @@ static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) type = cifs_select_sectype(ses->server, ses->sectype); 
cifs_dbg(FYI, "sess setup type %d\n", type); if (type == Unspecified) { - cifs_dbg(VFS, - "Unable to select appropriate authentication method!"); + cifs_dbg(VFS, "Unable to select appropriate authentication method!\n"); return -EINVAL; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index b130efaf8feb..197ed455e657 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -247,7 +247,7 @@ check2ndT2(char *buf) /* check for plausible wct, bcc and t2 data and parm sizes */ /* check for parm and data offset going beyond end of smb */ if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */ - cifs_dbg(FYI, "invalid transact2 word count\n"); + cifs_dbg(FYI, "Invalid transact2 word count\n"); return -EINVAL; } diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 19115a9088ea..fa86c78384c3 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -337,8 +337,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_open_free(&rqst[0]); if (rc == -EREMCHG) { - printk_once(KERN_WARNING "server share %s deleted\n", - tcon->treeName); + pr_warn_once("server share %s deleted\n", tcon->treeName); tcon->need_reconnect = true; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 497afb0b9960..6a39451973f8 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -110,14 +110,14 @@ static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len, /* Make sure that negotiate contexts start after gss security blob */ nc_offset = le32_to_cpu(pneg_rsp->NegotiateContextOffset); if (nc_offset < non_ctxlen) { - printk_once(KERN_WARNING "invalid negotiate context offset\n"); + pr_warn_once("Invalid negotiate context offset\n"); return 0; } size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen; /* Verify that at least minimal negotiate contexts fit within frame */ if (len < nc_offset + (neg_count * sizeof(struct smb2_neg_context))) { - printk_once(KERN_WARNING "negotiate context goes beyond end\n"); + pr_warn_once("negotiate context goes 
beyond end\n"); return 0; } @@ -190,14 +190,14 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) return 1; if (shdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) { - cifs_dbg(VFS, "Illegal structure size %u\n", + cifs_dbg(VFS, "Invalid structure size %u\n", le16_to_cpu(shdr->StructureSize)); return 1; } command = le16_to_cpu(shdr->Command); if (command >= NUMBER_OF_SMB2_COMMANDS) { - cifs_dbg(VFS, "Illegal SMB2 command %d\n", command); + cifs_dbg(VFS, "Invalid SMB2 command %d\n", command); return 1; } @@ -205,7 +205,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) if (command != SMB2_OPLOCK_BREAK_HE && (shdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) { /* error packets have 9 byte structure size */ - cifs_dbg(VFS, "Illegal response size %u for command %d\n", + cifs_dbg(VFS, "Invalid response size %u for command %d\n", le16_to_cpu(pdu->StructureSize2), command); return 1; } else if (command == SMB2_OPLOCK_BREAK_HE @@ -213,7 +213,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) && (le16_to_cpu(pdu->StructureSize2) != 44) && (le16_to_cpu(pdu->StructureSize2) != 36)) { /* special case for SMB2.1 lease break message */ - cifs_dbg(VFS, "Illegal response size %d for oplock break\n", + cifs_dbg(VFS, "Invalid response size %d for oplock break\n", le16_to_cpu(pdu->StructureSize2)); return 1; } @@ -864,14 +864,14 @@ ok: d = server->secmech.sdescsha512; rc = crypto_shash_init(&d->shash); if (rc) { - cifs_dbg(VFS, "%s: could not init sha512 shash\n", __func__); + cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__); return rc; } rc = crypto_shash_update(&d->shash, ses->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE); if (rc) { - cifs_dbg(VFS, "%s: could not update sha512 shash\n", __func__); + cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); return rc; } @@ -879,7 +879,7 @@ ok: rc = crypto_shash_update(&d->shash, iov[i].iov_base, 
iov[i].iov_len); if (rc) { - cifs_dbg(VFS, "%s: could not update sha512 shash\n", + cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); return rc; } @@ -887,7 +887,7 @@ ok: rc = crypto_shash_final(&d->shash, ses->preauth_sha_hash); if (rc) { - cifs_dbg(VFS, "%s: could not finalize sha512 shash\n", + cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n", __func__); return rc; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index fa5c79f64c0b..dec055d7c2f4 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -79,7 +79,7 @@ smb2_add_credits(struct TCP_Server_Info *server, if (*val > 65000) { *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ - printk_once(KERN_WARNING "server overflowed SMB3 credits\n"); + pr_warn_once("server overflowed SMB3 credits\n"); } server->in_flight--; if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP) @@ -767,8 +767,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, if (rc) { if (rc == -EREMCHG) { tcon->need_reconnect = true; - printk_once(KERN_WARNING "server share %s deleted\n", - tcon->treeName); + pr_warn_once("server share %s deleted\n", + tcon->treeName); } goto oshr_exit; } @@ -1601,7 +1601,8 @@ smb2_ioctl_query_info(const unsigned int xid, qi.input_buffer_length, qi.output_buffer_length, buffer); } else { /* unknown flags */ - cifs_tcon_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags); + cifs_tcon_dbg(VFS, "Invalid passthru query flags: 0x%x\n", + qi.flags); rc = -EINVAL; } @@ -1731,7 +1732,7 @@ smb2_copychunk_range(const unsigned int xid, if (rc == 0) { if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) { - cifs_tcon_dbg(VFS, "invalid cchunk response size\n"); + cifs_tcon_dbg(VFS, "Invalid cchunk response size\n"); rc = -EIO; goto cchunk_out; } @@ -1745,12 +1746,12 @@ smb2_copychunk_range(const unsigned int xid, */ if (le32_to_cpu(retbuf->TotalBytesWritten) > le32_to_cpu(pcchunk->Length)) { - cifs_tcon_dbg(VFS, "invalid copy chunk response\n"); + 
cifs_tcon_dbg(VFS, "Invalid copy chunk response\n"); rc = -EIO; goto cchunk_out; } if (le32_to_cpu(retbuf->ChunksWritten) != 1) { - cifs_tcon_dbg(VFS, "invalid num chunks written\n"); + cifs_tcon_dbg(VFS, "Invalid num chunks written\n"); rc = -EIO; goto cchunk_out; } @@ -2484,8 +2485,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); if (rc == -EREMCHG) { tcon->need_reconnect = true; - printk_once(KERN_WARNING "server share %s deleted\n", - tcon->treeName); + pr_warn_once("server share %s deleted\n", + tcon->treeName); } goto qic_exit; } @@ -2765,15 +2766,15 @@ parse_reparse_point(struct reparse_data_buffer *buf, struct cifs_sb_info *cifs_sb) { if (plen < sizeof(struct reparse_data_buffer)) { - cifs_dbg(VFS, "reparse buffer is too small. Must be " - "at least 8 bytes but was %d\n", plen); + cifs_dbg(VFS, "reparse buffer is too small. Must be at least 8 bytes but was %d\n", + plen); return -EIO; } if (plen < le16_to_cpu(buf->ReparseDataLength) + sizeof(struct reparse_data_buffer)) { - cifs_dbg(VFS, "srv returned invalid reparse buf " - "length: %d\n", plen); + cifs_dbg(VFS, "srv returned invalid reparse buf length: %d\n", + plen); return -EIO; } @@ -2788,8 +2789,8 @@ parse_reparse_point(struct reparse_data_buffer *buf, (struct reparse_symlink_data_buffer *)buf, plen, target_path, cifs_sb); default: - cifs_dbg(VFS, "srv returned unknown symlink buffer " - "tag:0x%08x\n", le32_to_cpu(buf->ReparseTag)); + cifs_dbg(VFS, "srv returned unknown symlink buffer tag:0x%08x\n", + le32_to_cpu(buf->ReparseTag)); return -EOPNOTSUPP; } } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index cabc19f404e6..4b79181ff872 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -334,8 +334,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) (server->tcpStatus != CifsNeedReconnect), 10 * HZ); if (rc < 0) { - cifs_dbg(FYI, "%s: aborting reconnect due to a received" - " signal by the 
process\n", __func__); + cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n", + __func__); return -ERESTARTSYS; } @@ -404,7 +404,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) { /* If sess reconnected but tcon didn't, something strange ... */ - printk_once(KERN_WARNING "reconnect tcon failed rc = %d\n", rc); + pr_warn_once("reconnect tcon failed rc = %d\n", rc); goto out; } @@ -646,13 +646,13 @@ static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt) /* If invalid preauth context warn but use what we requested, SHA-512 */ if (len < MIN_PREAUTH_CTXT_DATA_LEN) { - printk_once(KERN_WARNING "server sent bad preauth context\n"); + pr_warn_once("server sent bad preauth context\n"); return; } if (le16_to_cpu(ctxt->HashAlgorithmCount) != 1) - printk_once(KERN_WARNING "illegal SMB3 hash algorithm count\n"); + pr_warn_once("Invalid SMB3 hash algorithm count\n"); if (ctxt->HashAlgorithms != SMB2_PREAUTH_INTEGRITY_SHA512) - printk_once(KERN_WARNING "unknown SMB3 hash algorithm\n"); + pr_warn_once("unknown SMB3 hash algorithm\n"); } static void decode_compress_ctx(struct TCP_Server_Info *server, @@ -662,15 +662,15 @@ static void decode_compress_ctx(struct TCP_Server_Info *server, /* sizeof compress context is a one element compression capbility struct */ if (len < 10) { - printk_once(KERN_WARNING "server sent bad compression cntxt\n"); + pr_warn_once("server sent bad compression cntxt\n"); return; } if (le16_to_cpu(ctxt->CompressionAlgorithmCount) != 1) { - printk_once(KERN_WARNING "illegal SMB3 compress algorithm count\n"); + pr_warn_once("Invalid SMB3 compress algorithm count\n"); return; } if (le16_to_cpu(ctxt->CompressionAlgorithms[0]) > 3) { - printk_once(KERN_WARNING "unknown compression algorithm\n"); + pr_warn_once("unknown compression algorithm\n"); return; } server->compress_algorithm = ctxt->CompressionAlgorithms[0]; @@ -683,18 +683,18 @@ static int 
decode_encrypt_ctx(struct TCP_Server_Info *server, cifs_dbg(FYI, "decode SMB3.11 encryption neg context of len %d\n", len); if (len < MIN_ENCRYPT_CTXT_DATA_LEN) { - printk_once(KERN_WARNING "server sent bad crypto ctxt len\n"); + pr_warn_once("server sent bad crypto ctxt len\n"); return -EINVAL; } if (le16_to_cpu(ctxt->CipherCount) != 1) { - printk_once(KERN_WARNING "illegal SMB3.11 cipher count\n"); + pr_warn_once("Invalid SMB3.11 cipher count\n"); return -EINVAL; } cifs_dbg(FYI, "SMB311 cipher type:%d\n", le16_to_cpu(ctxt->Ciphers[0])); if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) && (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM)) { - printk_once(KERN_WARNING "invalid SMB3.11 cipher returned\n"); + pr_warn_once("Invalid SMB3.11 cipher returned\n"); return -EINVAL; } server->cipher_type = ctxt->Ciphers[0]; @@ -794,7 +794,7 @@ create_posix_buf(umode_t mode) buf->Name[14] = 0xCD; buf->Name[15] = 0x7C; buf->Mode = cpu_to_le32(mode); - cifs_dbg(FYI, "mode on posix create 0%o", mode); + cifs_dbg(FYI, "mode on posix create 0%o\n", mode); return buf; } @@ -806,7 +806,7 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) iov[num].iov_base = create_posix_buf(mode); if (mode == ACL_NO_MODE) - cifs_dbg(FYI, "illegal mode\n"); + cifs_dbg(FYI, "Invalid mode\n"); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); @@ -924,9 +924,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); */ if (rc == -EOPNOTSUPP) { - cifs_server_dbg(VFS, "Dialect not supported by server. Consider " - "specifying vers=1.0 or vers=2.0 on mount for accessing" - " older servers\n"); + cifs_server_dbg(VFS, "Dialect not supported by server. 
Consider specifying vers=1.0 or vers=2.0 on mount for accessing older servers\n"); goto neg_exit; } else if (rc != 0) goto neg_exit; @@ -959,8 +957,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) } else if (le16_to_cpu(rsp->DialectRevision) != server->vals->protocol_id) { /* if requested single dialect ensure returned dialect matched */ - cifs_server_dbg(VFS, "Illegal 0x%x dialect returned: not requested\n", - le16_to_cpu(rsp->DialectRevision)); + cifs_server_dbg(VFS, "Invalid 0x%x dialect returned: not requested\n", + le16_to_cpu(rsp->DialectRevision)); return -EIO; } @@ -977,8 +975,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) cifs_dbg(FYI, "negotiated smb3.1.1 dialect\n"); else { - cifs_server_dbg(VFS, "Illegal dialect returned by server 0x%x\n", - le16_to_cpu(rsp->DialectRevision)); + cifs_server_dbg(VFS, "Invalid dialect returned by server 0x%x\n", + le16_to_cpu(rsp->DialectRevision)); rc = -EIO; goto neg_exit; } @@ -1136,15 +1134,16 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) rc = 0; goto out_free_inbuf; } else if (rc != 0) { - cifs_tcon_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); + cifs_tcon_dbg(VFS, "validate protocol negotiate failed: %d\n", + rc); rc = -EIO; goto out_free_inbuf; } rc = -EIO; if (rsplen != sizeof(*pneg_rsp)) { - cifs_tcon_dbg(VFS, "invalid protocol negotiate response size: %d\n", - rsplen); + cifs_tcon_dbg(VFS, "Invalid protocol negotiate response size: %d\n", + rsplen); /* relax check since Mac returns max bufsize allowed on ioctl */ if (rsplen > CIFSMaxBufSize || rsplen < sizeof(*pneg_rsp)) @@ -1377,9 +1376,8 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) * sending us a response in an expected form */ if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cifs_dbg(VFS, - "bad cifs.upcall version. 
Expected %d got %d", - CIFS_SPNEGO_UPCALL_VERSION, msg->version); + cifs_dbg(VFS, "bad cifs.upcall version. Expected %d got %d\n", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); rc = -EKEYREJECTED; goto out_put_spnego_key; } @@ -1389,8 +1387,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, GFP_KERNEL); if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", + cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory\n", msg->sesskey_len); rc = -ENOMEM; goto out_put_spnego_key; @@ -1604,8 +1601,7 @@ SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) type = smb2_select_sectype(cifs_ses_server(ses), ses->sectype); cifs_dbg(FYI, "sess setup type %d\n", type); if (type == Unspecified) { - cifs_dbg(VFS, - "Unable to select appropriate authentication method!"); + cifs_dbg(VFS, "Unable to select appropriate authentication method!\n"); return -EINVAL; } @@ -2832,8 +2828,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, trace_smb3_open_err(xid, tcon->tid, ses->Suid, oparms->create_options, oparms->desired_access, rc); if (rc == -EREMCHG) { - printk_once(KERN_WARNING "server share %s deleted\n", - tcon->treeName); + pr_warn_once("server share %s deleted\n", + tcon->treeName); tcon->need_reconnect = true; } goto creat_exit; @@ -3245,7 +3241,7 @@ smb2_validate_iov(unsigned int offset, unsigned int buffer_length, } if ((begin_of_buf > end_of_smb) || (end_of_buf > end_of_smb)) { - cifs_dbg(VFS, "illegal server response, bad offset to data\n"); + cifs_dbg(VFS, "Invalid server response, bad offset to data\n"); return -EINVAL; } @@ -4128,8 +4124,8 @@ smb2_writev_callback(struct mid_q_entry *mid) tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes, wdata->result); if (wdata->result == -ENOSPC) - printk_once(KERN_WARNING "Out of space writing to %s\n", - tcon->treeName); + pr_warn_once("Out of space writing to %s\n", + 
tcon->treeName); } else trace_smb3_write_done(0 /* no xid */, wdata->cfile->fid.persistent_fid, @@ -4652,7 +4648,7 @@ smb2_parse_query_directory(struct cifs_tcon *tcon, else if (resp_buftype == CIFS_SMALL_BUFFER) srch_inf->smallBuf = true; else - cifs_tcon_dbg(VFS, "illegal search buffer type\n"); + cifs_tcon_dbg(VFS, "Invalid search buffer type\n"); return 0; } diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 1a5834a5d597..b029ed31ef91 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -294,15 +294,12 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp) { - log_rdma_event(INFO, "resp message min_version %u max_version %u " - "negotiated_version %u credits_requested %u " - "credits_granted %u status %u max_readwrite_size %u " - "preferred_send_size %u max_receive_size %u " - "max_fragmented_size %u\n", - resp->min_version, resp->max_version, resp->negotiated_version, - resp->credits_requested, resp->credits_granted, resp->status, - resp->max_readwrite_size, resp->preferred_send_size, - resp->max_receive_size, resp->max_fragmented_size); + log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n", + resp->min_version, resp->max_version, + resp->negotiated_version, resp->credits_requested, + resp->credits_granted, resp->status, + resp->max_readwrite_size, resp->preferred_send_size, + resp->max_receive_size, resp->max_fragmented_size); } /* @@ -450,10 +447,9 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbd_connection *info = response->info; int data_length = 0; - log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d " - "byte_len=%d pkey_index=%x\n", - response, response->type, wc->status, wc->opcode, - wc->byte_len, wc->pkey_index); + log_rdma_recv(INFO, 
"response=%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%x\n", + response, response->type, wc->status, wc->opcode, + wc->byte_len, wc->pkey_index); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { log_rdma_recv(INFO, "wc->status=%d opcode=%d\n", @@ -519,12 +515,11 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) wake_up_interruptible(&info->wait_send_queue); } - log_incoming(INFO, "data flags %d data_offset %d " - "data_length %d remaining_data_length %d\n", - le16_to_cpu(data_transfer->flags), - le32_to_cpu(data_transfer->data_offset), - le32_to_cpu(data_transfer->data_length), - le32_to_cpu(data_transfer->remaining_data_length)); + log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n", + le16_to_cpu(data_transfer->flags), + le32_to_cpu(data_transfer->data_offset), + le32_to_cpu(data_transfer->data_length), + le32_to_cpu(data_transfer->remaining_data_length)); /* Send a KEEP_ALIVE response right away if requested */ info->keep_alive_requested = KEEP_ALIVE_NONE; @@ -632,14 +627,10 @@ static int smbd_ia_open( } if (!frwr_is_supported(&info->id->device->attrs)) { - log_rdma_event(ERR, - "Fast Registration Work Requests " - "(FRWR) is not supported\n"); - log_rdma_event(ERR, - "Device capability flags = %llx " - "max_fast_reg_page_list_len = %u\n", - info->id->device->attrs.device_cap_flags, - info->id->device->attrs.max_fast_reg_page_list_len); + log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n"); + log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n", + info->id->device->attrs.device_cap_flags, + info->id->device->attrs.max_fast_reg_page_list_len); rc = -EPROTONOSUPPORT; goto out2; } @@ -898,13 +889,12 @@ wait_send_queue: packet->remaining_data_length = cpu_to_le32(remaining_data_length); packet->padding = 0; - log_outgoing(INFO, "credits_requested=%d credits_granted=%d " - "data_offset=%d data_length=%d 
remaining_data_length=%d\n", - le16_to_cpu(packet->credits_requested), - le16_to_cpu(packet->credits_granted), - le32_to_cpu(packet->data_offset), - le32_to_cpu(packet->data_length), - le32_to_cpu(packet->remaining_data_length)); + log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n", + le16_to_cpu(packet->credits_requested), + le16_to_cpu(packet->credits_granted), + le32_to_cpu(packet->data_offset), + le32_to_cpu(packet->data_length), + le32_to_cpu(packet->remaining_data_length)); /* Map the packet to DMA */ header_length = sizeof(struct smbd_data_transfer); @@ -1078,11 +1068,9 @@ static int smbd_negotiate(struct smbd_connection *info) response->type = SMBD_NEGOTIATE_RESP; rc = smbd_post_recv(info, response); - log_rdma_event(INFO, - "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x " - "iov.lkey=%x\n", - rc, response->sge.addr, - response->sge.length, response->sge.lkey); + log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x iov.lkey=%x\n", + rc, response->sge.addr, + response->sge.length, response->sge.lkey); if (rc) return rc; @@ -1540,25 +1528,19 @@ static struct smbd_connection *_smbd_get_connection( if (smbd_send_credit_target > info->id->device->attrs.max_cqe || smbd_send_credit_target > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, - "consider lowering send_credit_target = %d. " - "Possible CQE overrun, device " - "reporting max_cpe %d max_qp_wr %d\n", - smbd_send_credit_target, - info->id->device->attrs.max_cqe, - info->id->device->attrs.max_qp_wr); + log_rdma_event(ERR, "consider lowering send_credit_target = %d. 
Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + smbd_send_credit_target, + info->id->device->attrs.max_cqe, + info->id->device->attrs.max_qp_wr); goto config_failed; } if (smbd_receive_credit_max > info->id->device->attrs.max_cqe || smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, - "consider lowering receive_credit_max = %d. " - "Possible CQE overrun, device " - "reporting max_cpe %d max_qp_wr %d\n", - smbd_receive_credit_max, - info->id->device->attrs.max_cqe, - info->id->device->attrs.max_qp_wr); + log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + smbd_receive_credit_max, + info->id->device->attrs.max_cqe, + info->id->device->attrs.max_qp_wr); goto config_failed; } @@ -1865,11 +1847,9 @@ again: to_read -= to_copy; data_read += to_copy; - log_read(INFO, "_get_first_reassembly memcpy %d bytes " - "data_transfer_length-offset=%d after that " - "to_read=%d data_read=%d offset=%d\n", - to_copy, data_length - offset, - to_read, data_read, offset); + log_read(INFO, "_get_first_reassembly memcpy %d bytes data_transfer_length-offset=%d after that to_read=%d data_read=%d offset=%d\n", + to_copy, data_length - offset, + to_read, data_read, offset); } spin_lock_irq(&info->reassembly_queue_lock); @@ -1878,10 +1858,9 @@ again: spin_unlock_irq(&info->reassembly_queue_lock); info->first_entry_offset = offset; - log_read(INFO, "returning to thread data_read=%d " - "reassembly_data_length=%d first_entry_offset=%d\n", - data_read, info->reassembly_data_length, - info->first_entry_offset); + log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", + data_read, info->reassembly_data_length, + info->first_entry_offset); read_rfc1002_done: return data_read; } @@ -1952,7 +1931,7 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) if (iov_iter_rw(&msg->msg_iter) == WRITE) { /* It's a bug in 
upper layer to get there */ - cifs_dbg(VFS, "CIFS: invalid msg iter dir %u\n", + cifs_dbg(VFS, "Invalid msg iter dir %u\n", iov_iter_rw(&msg->msg_iter)); rc = -EINVAL; goto out; @@ -1974,7 +1953,7 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) default: /* It's a bug in upper layer to get there */ - cifs_dbg(VFS, "CIFS: invalid msg type %d\n", + cifs_dbg(VFS, "Invalid msg type %d\n", iov_iter_type(&msg->msg_iter)); rc = -EINVAL; } @@ -2043,10 +2022,9 @@ next_rqst: dump_smb(iov[i].iov_base, iov[i].iov_len); - log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d " - "rq_tailsz=%d buflen=%lu\n", - rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz, - rqst->rq_tailsz, smb_rqst_len(server, rqst)); + log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", + rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz, + rqst->rq_tailsz, smb_rqst_len(server, rqst)); start = i = 0; buflen = 0; @@ -2056,11 +2034,9 @@ next_rqst: if (i > start) { remaining_data_length -= (buflen-iov[i].iov_len); - log_write(INFO, "sending iov[] from start=%d " - "i=%d nvecs=%d " - "remaining_data_length=%d\n", - start, i, i-start, - remaining_data_length); + log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", + start, i, i - start, + remaining_data_length); rc = smbd_post_send_data( info, &iov[start], i-start, remaining_data_length); @@ -2069,10 +2045,9 @@ next_rqst: } else { /* iov[start] is too big, break it */ nvecs = (buflen+max_iov_size-1)/max_iov_size; - log_write(INFO, "iov[%d] iov_base=%p buflen=%d" - " break to %d vectors\n", - start, iov[start].iov_base, - buflen, nvecs); + log_write(INFO, "iov[%d] iov_base=%p buflen=%d break to %d vectors\n", + start, iov[start].iov_base, + buflen, nvecs); for (j = 0; j < nvecs; j++) { vec.iov_base = (char *)iov[start].iov_base + @@ -2084,11 +2059,9 @@ next_rqst: max_iov_size*(nvecs-1); remaining_data_length -= vec.iov_len; 
log_write(INFO, - "sending vec j=%d iov_base=%p" - " iov_len=%zu " - "remaining_data_length=%d\n", - j, vec.iov_base, vec.iov_len, - remaining_data_length); + "sending vec j=%d iov_base=%p iov_len=%zu remaining_data_length=%d\n", + j, vec.iov_base, vec.iov_len, + remaining_data_length); rc = smbd_post_send_data( info, &vec, 1, remaining_data_length); @@ -2106,11 +2079,9 @@ next_rqst: if (i == rqst->rq_nvec) { /* send out all remaining vecs */ remaining_data_length -= buflen; - log_write(INFO, - "sending iov[] from start=%d i=%d " - "nvecs=%d remaining_data_length=%d\n", - start, i, i-start, - remaining_data_length); + log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", + start, i, i - start, + remaining_data_length); rc = smbd_post_send_data(info, &iov[start], i-start, remaining_data_length); if (rc) @@ -2134,10 +2105,9 @@ next_rqst: if (j == nvecs-1) size = buflen - j*max_iov_size; remaining_data_length -= size; - log_write(INFO, "sending pages i=%d offset=%d size=%d" - " remaining_data_length=%d\n", - i, j*max_iov_size+offset, size, - remaining_data_length); + log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", + i, j * max_iov_size + offset, size, + remaining_data_length); rc = smbd_post_send_page( info, rqst->rq_pages[i], j*max_iov_size + offset, @@ -2211,11 +2181,9 @@ static void smbd_mr_recovery_work(struct work_struct *work) info->pd, info->mr_type, info->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { - log_rdma_mr(ERR, - "ib_alloc_mr failed mr_type=%x " - "max_frmr_depth=%x\n", - info->mr_type, - info->max_frmr_depth); + log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", + info->mr_type, + info->max_frmr_depth); smbd_disconnect_rdma_connection(info); continue; } @@ -2278,9 +2246,8 @@ static int allocate_mr_list(struct smbd_connection *info) smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type, info->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { - log_rdma_mr(ERR, 
"ib_alloc_mr failed mr_type=%x " - "max_frmr_depth=%x\n", - info->mr_type, info->max_frmr_depth); + log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", + info->mr_type, info->max_frmr_depth); goto out; } smbdirect_mr->sgl = kcalloc( diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c97570eb2c18..c359221d6848 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -112,7 +112,7 @@ static void _cifs_mid_q_entry_release(struct kref *refcount) #ifdef CONFIG_CIFS_STATS2 now = jiffies; if (now < midEntry->when_alloc) - cifs_server_dbg(VFS, "invalid mid allocation time\n"); + cifs_server_dbg(VFS, "Invalid mid allocation time\n"); roundtrip_time = now - midEntry->when_alloc; if (smb_cmd < NUMBER_OF_SMB2_COMMANDS) { @@ -151,12 +151,12 @@ static void _cifs_mid_q_entry_release(struct kref *refcount) trace_smb3_slow_rsp(smb_cmd, midEntry->mid, midEntry->pid, midEntry->when_sent, midEntry->when_received); if (cifsFYI & CIFS_TIMER) { - pr_debug(" CIFS slow rsp: cmd %d mid %llu", - midEntry->command, midEntry->mid); - cifs_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n", - now - midEntry->when_alloc, - now - midEntry->when_sent, - now - midEntry->when_received); + pr_debug("slow rsp: cmd %d mid %llu", + midEntry->command, midEntry->mid); + cifs_info("A: 0x%lx S: 0x%lx R: 0x%lx\n", + now - midEntry->when_alloc, + now - midEntry->when_sent, + now - midEntry->when_received); } } #endif @@ -477,8 +477,7 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, return -ENOMEM; if (!server->ops->init_transform_rq) { - cifs_server_dbg(VFS, "Encryption requested but transform " - "callback is missing\n"); + cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n"); return -EIO; } @@ -1300,8 +1299,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, use ses->maxReq */ if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cifs_server_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", - len); + cifs_server_dbg(VFS, 
"Invalid length, greater than maximum frame, %d\n", + len); return -EIO; } @@ -1441,8 +1440,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, use ses->maxReq */ if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cifs_tcon_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", - len); + cifs_tcon_dbg(VFS, "Invalid length, greater than maximum frame, %d\n", + len); return -EIO; } From adbb2dafe732d4715a602ca727dedaa55c0df7a7 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 30 May 2020 16:45:11 -0500 Subject: [PATCH 252/427] cifs: minor fix to two debug messages Joe Perches pointed out that we were missing a newline at the end of two debug messages Reported-by: Joe Perches Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- fs/cifs/smb2pdu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 5820f9569b7f..bf41ee048396 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -177,7 +177,7 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, if (dfs_host_len != tcp_host_len || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) { - cifs_dbg(FYI, "%s: %.*s doesn't match %.*s", + cifs_dbg(FYI, "%s: %.*s doesn't match %.*s\n", __func__, (int)dfs_host_len, dfs_host, (int)tcp_host_len, tcp_host); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 4b79181ff872..06463f386a60 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -209,7 +209,7 @@ static int __smb2_reconnect(const struct nls_table *nlsc, if (dfs_host_len != tcp_host_len || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) { - cifs_dbg(FYI, "%s: %.*s doesn't match %.*s", + cifs_dbg(FYI, "%s: %.*s doesn't match %.*s\n", __func__, (int)dfs_host_len, dfs_host, (int)tcp_host_len, tcp_host); From fcee90cdf6f3a3a371add04d41528d5ba9c3b411 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Thu, 21 May 2020 10:47:09 +0800 Subject: [PATCH 253/427] PCI: tegra: Fix runtime PM imbalance on error 
pm_runtime_get_sync() increments the runtime PM usage counter even when it returns an error code. Thus a pairing decrement is needed on the error handling path to keep the counter balanced. Also, call pm_runtime_disable() when pm_runtime_get_sync() returns an error code. Link: https://lore.kernel.org/r/20200521024709.2368-1-dinghao.liu@zju.edu.cn Signed-off-by: Dinghao Liu Signed-off-by: Lorenzo Pieralisi Acked-by: Thierry Reding --- drivers/pci/controller/pci-tegra.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c index e3e917243e10..235b456698fc 100644 --- a/drivers/pci/controller/pci-tegra.c +++ b/drivers/pci/controller/pci-tegra.c @@ -2712,7 +2712,7 @@ static int tegra_pcie_probe(struct platform_device *pdev) err = pm_runtime_get_sync(pcie->dev); if (err < 0) { dev_err(dev, "fail to enable pcie controller: %d\n", err); - goto teardown_msi; + goto pm_runtime_put; } host->busnr = bus->start; @@ -2746,7 +2746,6 @@ static int tegra_pcie_probe(struct platform_device *pdev) pm_runtime_put: pm_runtime_put_sync(pcie->dev); pm_runtime_disable(pcie->dev); -teardown_msi: tegra_pcie_msi_teardown(pcie); put_resources: tegra_pcie_put_resources(pcie); From 4b50c8c4eaf06a825d1c005c0b1b4a8307087b83 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 31 May 2020 17:47:06 +0900 Subject: [PATCH 254/427] kbuild: force to build vmlinux if CONFIG_MODVERSION=y This code does not work as stated in the comment. $(CONFIG_MODVERSIONS) is always empty because it is expanded before include/config/auto.conf is included. Hence, 'make modules' with CONFIG_MODVERSION=y cannot record the version CRCs. This has been broken since 2003, commit ("kbuild: Enable modules to be build using the "make dir/" syntax"). 
[1] [1]: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/?id=15c6240cdc44bbeef3c4797ec860f9765ef4f1a7 Cc: linux-stable # v2.5.71+ Signed-off-by: Masahiro Yamada --- Makefile | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 2df903429d31..b856f84e28c9 100644 --- a/Makefile +++ b/Makefile @@ -619,12 +619,8 @@ KBUILD_MODULES := KBUILD_BUILTIN := 1 # If we have only "make modules", don't compile built-in objects. -# When we're building modules with modversions, we need to consider -# the built-in objects during the descend as well, in order to -# make sure the checksums are up to date before we record them. - ifeq ($(MAKECMDGOALS),modules) - KBUILD_BUILTIN := $(if $(CONFIG_MODVERSIONS),1) + KBUILD_BUILTIN := endif # If we have "make modules", compile modules @@ -1337,6 +1333,13 @@ ifdef CONFIG_MODULES all: modules +# When we're building modules with modversions, we need to consider +# the built-in objects during the descend as well, in order to +# make sure the checksums are up to date before we record them. +ifdef CONFIG_MODVERSIONS + KBUILD_BUILTIN := 1 +endif + # Build modules # # A module can be listed more than once in obj-m resulting in From fb2d99be8919d63ff5c48f33bff4847387c6742b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 31 May 2020 19:11:39 +0900 Subject: [PATCH 255/427] kbuild: merge two 'ifdef CONFIG_TRIM_UNUSED_KSYMS' blocks This hunk has two 'ifdef CONFIG_TRIM_UNUSED_KSYMS ... endif' blocks with no other code interleaved. Merge them. 
Signed-off-by: Masahiro Yamada --- Makefile | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index b856f84e28c9..44921d9cf3cf 100644 --- a/Makefile +++ b/Makefile @@ -1098,16 +1098,14 @@ vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_OBJS) $(KBUILD_VMLINUX_LIBS) # Recurse until adjust_autoksyms.sh is satisfied PHONY += autoksyms_recursive ifdef CONFIG_TRIM_UNUSED_KSYMS -autoksyms_recursive: descend modules.order - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/adjust_autoksyms.sh \ - "$(MAKE) -f $(srctree)/Makefile vmlinux" -endif - # For the kernel to actually contain only the needed exported symbols, # we have to build modules as well to determine what those symbols are. # (this can be evaluated only once include/config/auto.conf has been included) -ifdef CONFIG_TRIM_UNUSED_KSYMS - KBUILD_MODULES := 1 +KBUILD_MODULES := 1 + +autoksyms_recursive: descend modules.order + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/adjust_autoksyms.sh \ + "$(MAKE) -f $(srctree)/Makefile vmlinux" endif autoksyms_h := $(if $(CONFIG_TRIM_UNUSED_KSYMS), include/generated/autoksyms.h) From 0a8820e7f807158670d3400974b20691cd8774d9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:56:55 +0900 Subject: [PATCH 256/427] kbuild: refactor subdir-ym calculation Remove the unneeded variables, __subdir-y and __subdir-m. 
Signed-off-by: Masahiro Yamada --- scripts/Makefile.lib | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 0d931cc0df94..748e44d5a1e3 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -39,16 +39,14 @@ ifdef need-modorder modorder := $(patsubst %/,%/modules.order, $(filter %/, $(obj-y)) $(obj-m:.o=.ko)) endif +# Subdirectories we need to descend into +subdir-ym := $(sort $(subdir-y) $(subdir-m) \ + $(patsubst %/,%, $(filter %/, $(obj-y) $(obj-m)))) + # Handle objects in subdirs # --------------------------------------------------------------------------- # o if we encounter foo/ in $(obj-y), replace it by foo/built-in.a -# and add the directory to the list of dirs to descend into: $(subdir-y) # o if we encounter foo/ in $(obj-m), remove it from $(obj-m) -# and add the directory to the list of dirs to descend into: $(subdir-m) -__subdir-y := $(patsubst %/,%,$(filter %/, $(obj-y))) -subdir-y += $(__subdir-y) -__subdir-m := $(patsubst %/,%,$(filter %/, $(obj-m))) -subdir-m += $(__subdir-m) ifdef need-builtin obj-y := $(patsubst %/, %/built-in.a, $(obj-y)) else @@ -56,9 +54,6 @@ obj-y := $(filter-out %/, $(obj-y)) endif obj-m := $(filter-out %/, $(obj-m)) -# Subdirectories we need to descend into -subdir-ym := $(sort $(subdir-y) $(subdir-m)) - # If $(foo-objs), $(foo-y), $(foo-m), or $(foo-) exists, foo.o is a composite object multi-used-y := $(sort $(foreach m,$(obj-y), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-))), $(m)))) multi-used-m := $(sort $(foreach m,$(obj-m), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m)) $($(m:.o=-))), $(m)))) From f3908ab3ffd92c77af1bad7f699b8a1c14f462bf Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:56:56 +0900 Subject: [PATCH 257/427] kbuild: refactor tagets caluculation for KBUILD_{BUILTIN,KBUILD_MODULES} Remove lib-target, builtin-target, modorder-target, and modtargets. 
Instead, add targets-for-builtin and targets-for-modules. Signed-off-by: Masahiro Yamada --- scripts/Makefile.build | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index ee9a817e19a3..a1f09bec8c70 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -73,19 +73,24 @@ endif subdir-builtin := $(sort $(filter %/built-in.a, $(real-obj-y))) subdir-modorder := $(sort $(filter %/modules.order, $(modorder))) +targets-for-builtin := $(extra-y) + ifneq ($(strip $(lib-y) $(lib-m) $(lib-)),) -lib-target := $(obj)/lib.a +targets-for-builtin += $(obj)/lib.a endif ifdef need-builtin -builtin-target := $(obj)/built-in.a +targets-for-builtin += $(obj)/built-in.a endif +targets-for-modules := $(obj-m) +targets-for-modules += $(patsubst %.o, %.mod, $(obj-m)) + ifdef need-modorder -modorder-target := $(obj)/modules.order +targets-for-modules += $(obj)/modules.order endif -mod-targets := $(patsubst %.o, %.mod, $(obj-m)) +targets += $(targets-for-builtin) $(targets-for-modules) # Linus' kernel sanity checking tool ifeq ($(KBUILD_CHECKSRC),1) @@ -284,8 +289,6 @@ cmd_mod = { \ $(obj)/%.mod: $(obj)/%.o FORCE $(call if_changed,mod) -targets += $(mod-targets) - quiet_cmd_cc_lst_c = MKLST $@ cmd_cc_lst_c = $(CC) $(c_flags) -g -c -o $*.o $< && \ $(CONFIG_SHELL) $(srctree)/scripts/makelst $*.o \ @@ -359,7 +362,7 @@ $(obj)/%.o: $(src)/%.S $(objtool_dep) FORCE $(call if_changed_rule,as_o_S) targets += $(filter-out $(subdir-builtin), $(real-obj-y)) $(real-obj-m) $(lib-y) -targets += $(extra-y) $(always-y) $(MAKECMDGOALS) +targets += $(always-y) $(MAKECMDGOALS) # Linker scripts preprocessor (.lds.S -> .lds) # --------------------------------------------------------------------------- @@ -396,8 +399,6 @@ quiet_cmd_ar_builtin = AR $@ $(obj)/built-in.a: $(real-obj-y) FORCE $(call if_changed,ar_builtin) -targets += $(builtin-target) - # # Rule to create modules.order file # @@ -414,8 +415,6 @@ 
$(obj)/modules.order: $(subdir-modorder) FORCE $(obj)/lib.a: $(lib-y) FORCE $(call if_changed,ar) -targets += $(lib-target) - # NOTE: # Do not replace $(filter %.o,^) with $(real-prereqs). When a single object # module is turned into a multi object module, $^ will contain header file @@ -478,8 +477,8 @@ endif else -__build: $(if $(KBUILD_BUILTIN),$(builtin-target) $(lib-target) $(extra-y)) \ - $(if $(KBUILD_MODULES),$(obj-m) $(mod-targets) $(modorder-target)) \ +__build: $(if $(KBUILD_BUILTIN), $(targets-for-builtin)) \ + $(if $(KBUILD_MODULES), $(targets-for-modules)) \ $(subdir-ym) $(always-y) @: From 23febe375d94d55927019467b6ac5fd503d83b2d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:56:57 +0900 Subject: [PATCH 258/427] kbuild: merge init-y into core-y No arch Makefile specifies init-y. Merge init-y into core-y. This does not change the link order. Signed-off-by: Masahiro Yamada --- Makefile | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 44921d9cf3cf..f9c37045cf64 100644 --- a/Makefile +++ b/Makefile @@ -643,12 +643,11 @@ endif ifeq ($(KBUILD_EXTMOD),) # Objects we will link into vmlinux / subdirs we need to visit -init-y := init/ +core-y := init/ usr/ drivers-y := drivers/ sound/ drivers-$(CONFIG_SAMPLES) += samples/ net-y := net/ libs-y := lib/ -core-y := usr/ virt-y := virt/ endif # KBUILD_EXTMOD @@ -1060,18 +1059,17 @@ export MODULES_NSDEPS := $(extmod-prefix)modules.nsdeps ifeq ($(KBUILD_EXTMOD),) core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ -vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ +vmlinux-dirs := $(patsubst %/,%,$(filter %/, \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ $(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y))) vmlinux-alldirs := $(sort $(vmlinux-dirs) Documentation \ - $(patsubst %/,%,$(filter %/, $(init-) $(core-) \ + $(patsubst %/,%,$(filter %/, $(core-) \ $(drivers-) $(net-) $(libs-) $(virt-)))) build-dirs := 
$(vmlinux-dirs) clean-dirs := $(vmlinux-alldirs) -init-y := $(patsubst %/, %/built-in.a, $(init-y)) core-y := $(patsubst %/, %/built-in.a, $(core-y)) drivers-y := $(patsubst %/, %/built-in.a, $(drivers-y)) net-y := $(patsubst %/, %/built-in.a, $(net-y)) @@ -1085,7 +1083,7 @@ endif virt-y := $(patsubst %/, %/built-in.a, $(virt-y)) # Externally visible symbols (used by link-vmlinux.sh) -export KBUILD_VMLINUX_OBJS := $(head-y) $(init-y) $(core-y) $(libs-y2) \ +export KBUILD_VMLINUX_OBJS := $(head-y) $(core-y) $(libs-y2) \ $(drivers-y) $(net-y) $(virt-y) export KBUILD_VMLINUX_LIBS := $(libs-y1) export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds From 95fb6317b3ab827aa35e7e52cb3a535b0cc6ec7e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:56:58 +0900 Subject: [PATCH 259/427] kbuild: merge net-y and virt-y into drivers-y This will slightly change the link order; drivers-y from arch Makefile will be linked after virt/built-in.a, but I guess this is not a big deal. Signed-off-by: Masahiro Yamada --- Makefile | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index f9c37045cf64..c0c086d06753 100644 --- a/Makefile +++ b/Makefile @@ -646,9 +646,8 @@ ifeq ($(KBUILD_EXTMOD),) core-y := init/ usr/ drivers-y := drivers/ sound/ drivers-$(CONFIG_SAMPLES) += samples/ -net-y := net/ +drivers-y += net/ virt/ libs-y := lib/ -virt-y := virt/ endif # KBUILD_EXTMOD # The all: target is the default when no target is given on the @@ -1061,18 +1060,17 @@ core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ - $(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y))) + $(libs-y) $(libs-m))) vmlinux-alldirs := $(sort $(vmlinux-dirs) Documentation \ $(patsubst %/,%,$(filter %/, $(core-) \ - $(drivers-) $(net-) $(libs-) $(virt-)))) + $(drivers-) $(libs-)))) build-dirs := $(vmlinux-dirs) clean-dirs := $(vmlinux-alldirs) core-y 
:= $(patsubst %/, %/built-in.a, $(core-y)) drivers-y := $(patsubst %/, %/built-in.a, $(drivers-y)) -net-y := $(patsubst %/, %/built-in.a, $(net-y)) libs-y2 := $(patsubst %/, %/built-in.a, $(filter %/, $(libs-y))) ifdef CONFIG_MODULES libs-y1 := $(filter-out %/, $(libs-y)) @@ -1080,11 +1078,9 @@ libs-y2 += $(patsubst %/, %/lib.a, $(filter %/, $(libs-y))) else libs-y1 := $(patsubst %/, %/lib.a, $(libs-y)) endif -virt-y := $(patsubst %/, %/built-in.a, $(virt-y)) # Externally visible symbols (used by link-vmlinux.sh) -export KBUILD_VMLINUX_OBJS := $(head-y) $(core-y) $(libs-y2) \ - $(drivers-y) $(net-y) $(virt-y) +export KBUILD_VMLINUX_OBJS := $(head-y) $(core-y) $(libs-y2) $(drivers-y) export KBUILD_VMLINUX_LIBS := $(libs-y1) export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds export LDFLAGS_vmlinux From c39ba6b3a8d47be07c180f857564a25a0356d336 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 1 Jun 2020 08:44:39 +0000 Subject: [PATCH 260/427] workqueue: fix a piece of comment about reserved bits for work flags 8a2e8e5dec7e ("workqueue: fix cwq->nr_active underflow") allocated one more bit from the work flags, and it updated part of the comments (128 bytes -> 256 bytes), but it failed to update the info about the number of reserved bits. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 8b505d22fc0e..26de0cae2a0a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -62,7 +62,7 @@ enum { WORK_CPU_UNBOUND = NR_CPUS, /* - * Reserve 7 bits off of pwq pointer w/ debugobjects turned off. + * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. * This makes pwqs aligned to 256 bytes and allows 15 workqueue * flush colors.
*/ From 10cdb15759540f03d056e2f73fe26377ed7dcfda Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 1 Jun 2020 08:44:40 +0000 Subject: [PATCH 261/427] workqueue: use BUILD_BUG_ON() for compile time test instead of WARN_ON() Any runtime WARN_ON() has to be fixed, and BUILD_BUG_ON() can help you notice it earlier. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c667ca5aed61..9fbe1e237563 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5917,7 +5917,7 @@ void __init workqueue_init_early(void) int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; - WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); From 708b2000362476c9c7a3571c0cc774dffb91836a Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 26 May 2020 16:18:29 -0700 Subject: [PATCH 262/427] PCI/AER: Remove HEST/FIRMWARE_FIRST parsing for AER ownership Commit c100beb9ccfb ("PCI/AER: Use only _OSC to determine AER ownership") removed the use of HEST in determining AER ownership, but the AER driver still used HEST to verify AER ownership in some of its APIs. Per the ACPI spec v6.3, sec 18.3.2.4, some HEST table entries contain a FIRMWARE_FIRST bit, but that bit does not tell us anything about ownership of the AER capability. Remove parsing of HEST to look for FIRMWARE_FIRST. Add pcie_aer_is_native() for the places that need to know whether the OS owns the AER capability.
[bhelgaas: commit log, reorder patch, remove unused __aer_firmware_first] Link: https://lore.kernel.org/r/9a37f53a4e6ff4942ff8e18dbb20b00e16c47341.1590534843.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer.c | 124 +++++-------------------------------- drivers/pci/pcie/dpc.c | 2 +- drivers/pci/pcie/portdrv.h | 13 +--- include/linux/pci.h | 2 - 4 files changed, 17 insertions(+), 124 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index efc26773cc6d..803273ba30db 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -217,118 +217,22 @@ void pcie_ecrc_get_policy(char *str) } #endif /* CONFIG_PCIE_ECRC */ -#ifdef CONFIG_ACPI_APEI -static inline int hest_match_pci(struct acpi_hest_aer_common *p, - struct pci_dev *pci) -{ - return ACPI_HEST_SEGMENT(p->bus) == pci_domain_nr(pci->bus) && - ACPI_HEST_BUS(p->bus) == pci->bus->number && - p->device == PCI_SLOT(pci->devfn) && - p->function == PCI_FUNC(pci->devfn); -} - -static inline bool hest_match_type(struct acpi_hest_header *hest_hdr, - struct pci_dev *dev) -{ - u16 hest_type = hest_hdr->type; - u8 pcie_type = pci_pcie_type(dev); - - if ((hest_type == ACPI_HEST_TYPE_AER_ROOT_PORT && - pcie_type == PCI_EXP_TYPE_ROOT_PORT) || - (hest_type == ACPI_HEST_TYPE_AER_ENDPOINT && - pcie_type == PCI_EXP_TYPE_ENDPOINT) || - (hest_type == ACPI_HEST_TYPE_AER_BRIDGE && - (dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)) - return true; - return false; -} - -struct aer_hest_parse_info { - struct pci_dev *pci_dev; - int firmware_first; -}; - -static int hest_source_is_pcie_aer(struct acpi_hest_header *hest_hdr) -{ - if (hest_hdr->type == ACPI_HEST_TYPE_AER_ROOT_PORT || - hest_hdr->type == ACPI_HEST_TYPE_AER_ENDPOINT || - hest_hdr->type == ACPI_HEST_TYPE_AER_BRIDGE) - return 1; - return 0; -} - -static int aer_hest_parse(struct acpi_hest_header *hest_hdr, void *data) -{ - struct aer_hest_parse_info *info = data; - 
struct acpi_hest_aer_common *p; - int ff; - - if (!hest_source_is_pcie_aer(hest_hdr)) - return 0; - - p = (struct acpi_hest_aer_common *)(hest_hdr + 1); - ff = !!(p->flags & ACPI_HEST_FIRMWARE_FIRST); - - /* - * If no specific device is supplied, determine whether - * FIRMWARE_FIRST is set for *any* PCIe device. - */ - if (!info->pci_dev) { - info->firmware_first |= ff; - return 0; - } - - /* Otherwise, check the specific device */ - if (p->flags & ACPI_HEST_GLOBAL) { - if (hest_match_type(hest_hdr, info->pci_dev)) - info->firmware_first = ff; - } else - if (hest_match_pci(p, info->pci_dev)) - info->firmware_first = ff; - - return 0; -} - -static void aer_set_firmware_first(struct pci_dev *pci_dev) -{ - int rc; - struct aer_hest_parse_info info = { - .pci_dev = pci_dev, - .firmware_first = 0, - }; - - rc = apei_hest_parse(aer_hest_parse, &info); - - if (rc) - pci_dev->__aer_firmware_first = 0; - else - pci_dev->__aer_firmware_first = info.firmware_first; - pci_dev->__aer_firmware_first_valid = 1; -} - -int pcie_aer_get_firmware_first(struct pci_dev *dev) -{ - if (!pci_is_pcie(dev)) - return 0; - - if (pcie_ports_native) - return 0; - - if (!dev->__aer_firmware_first_valid) - aer_set_firmware_first(dev); - return dev->__aer_firmware_first; -} -#endif - #define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \ PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE) -int pci_enable_pcie_error_reporting(struct pci_dev *dev) +int pcie_aer_is_native(struct pci_dev *dev) { - if (pcie_aer_get_firmware_first(dev)) - return -EIO; + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); if (!dev->aer_cap) + return 0; + + return pcie_ports_native || host->native_aer; +} + +int pci_enable_pcie_error_reporting(struct pci_dev *dev) +{ + if (!pcie_aer_is_native(dev)) return -EIO; return pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_AER_FLAGS); @@ -337,7 +241,7 @@ EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting); int pci_disable_pcie_error_reporting(struct 
pci_dev *dev) { - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return -EIO; return pcie_capability_clear_word(dev, PCI_EXP_DEVCTL, @@ -362,7 +266,7 @@ int pci_aer_clear_nonfatal_status(struct pci_dev *dev) if (!pos) return -EIO; - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return -EIO; /* Clear status bits for ERR_NONFATAL errors only */ @@ -385,7 +289,7 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev) if (!pos) return; - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return; /* Clear status bits for ERR_FATAL errors only */ @@ -435,7 +339,7 @@ int pci_aer_raw_clear_status(struct pci_dev *dev) int pci_aer_clear_status(struct pci_dev *dev) { - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return -EIO; return pci_aer_raw_clear_status(dev); diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 762170423fdd..0993d51abf03 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -284,7 +284,7 @@ static int dpc_probe(struct pcie_device *dev) int status; u16 ctl, cap; - if (pcie_aer_get_firmware_first(pdev) && !pcie_ports_dpc_native) + if (!pcie_aer_is_native(pdev) && !pcie_ports_dpc_native) return -ENOTSUPP; status = devm_request_threaded_irq(device, dev->irq, dpc_irq, diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 64b5e081cdb2..af7cf237432a 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -29,8 +29,10 @@ extern bool pcie_ports_dpc_native; #ifdef CONFIG_PCIEAER int pcie_aer_init(void); +int pcie_aer_is_native(struct pci_dev *dev); #else static inline int pcie_aer_init(void) { return 0; } +static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; } #endif #ifdef CONFIG_HOTPLUG_PCI_PCIE @@ -147,16 +149,5 @@ static inline bool pcie_pme_no_msi(void) { return false; } static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {} #endif /* !CONFIG_PCIE_PME */ -#ifdef 
CONFIG_ACPI_APEI -int pcie_aer_get_firmware_first(struct pci_dev *pci_dev); -#else -static inline int pcie_aer_get_firmware_first(struct pci_dev *pci_dev) -{ - if (pci_dev->__aer_firmware_first_valid) - return pci_dev->__aer_firmware_first; - return 0; -} -#endif - struct device *pcie_port_find_device(struct pci_dev *dev, u32 service); #endif /* _PORTDRV_H_ */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 83ce1cdf5676..43f265830eca 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -420,8 +420,6 @@ struct pci_dev { * mappings to make sure they cannot access arbitrary memory. */ unsigned int untrusted:1; - unsigned int __aer_firmware_first_valid:1; - unsigned int __aer_firmware_first:1; unsigned int broken_intx_masking:1; /* INTx masking can't be used */ unsigned int io_window_1k:1; /* Intel bridge 1K I/O windows */ unsigned int irq_managed:1; From 123f985aea0d603466518f041670d195eb2a4111 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 26 May 2020 16:18:25 -0700 Subject: [PATCH 263/427] PCI/AER: Remove redundant pci_is_pcie() checks AER is a PCIe Extended Capability, so dev->aer_cap will only be set for PCIe devices. Remove redundant pci_is_pcie() checks. 
Link: https://lore.kernel.org/r/361c622eabe5b845b8092e0bec04a3a2c262cb38.1590534843.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 803273ba30db..ff41e44e56ee 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -139,9 +139,6 @@ static int enable_ecrc_checking(struct pci_dev *dev) int pos; u32 reg32; - if (!pci_is_pcie(dev)) - return -ENODEV; - pos = dev->aer_cap; if (!pos) return -ENODEV; @@ -167,9 +164,6 @@ static int disable_ecrc_checking(struct pci_dev *dev) int pos; u32 reg32; - if (!pci_is_pcie(dev)) - return -ENODEV; - pos = dev->aer_cap; if (!pos) return -ENODEV; @@ -315,9 +309,6 @@ int pci_aer_raw_clear_status(struct pci_dev *dev) u32 status; int port_type; - if (!pci_is_pcie(dev)) - return -ENODEV; - pos = dev->aer_cap; if (!pos) return -EIO; From af10cce7ad515aa819ee8a4ef6777c28b19ddfb7 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 26 May 2020 16:18:26 -0700 Subject: [PATCH 264/427] PCI/AER: Remove redundant dev->aer_cap checks pcie_aer_is_native() checks dev->aer_cap, so we can remove redundant dev->aer_cap checks in the callers.
Link: https://lore.kernel.org/r/d5ccc7a060ec9cdc234bdae7df8a0a4410f13f42.1590534843.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index ff41e44e56ee..61e8cb23e98b 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -253,13 +253,9 @@ void pci_aer_clear_device_status(struct pci_dev *dev) int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { - int pos; + int pos = dev->aer_cap; u32 status, sev; - pos = dev->aer_cap; - if (!pos) - return -EIO; - if (!pcie_aer_is_native(dev)) return -EIO; @@ -276,13 +272,9 @@ EXPORT_SYMBOL_GPL(pci_aer_clear_nonfatal_status); void pci_aer_clear_fatal_status(struct pci_dev *dev) { - int pos; + int pos = dev->aer_cap; u32 status, sev; - pos = dev->aer_cap; - if (!pos) - return; - if (!pcie_aer_is_native(dev)) return; From 07b2fbb565e2df7ccc41e5c977b19f5f1f9fe013 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 29 May 2020 17:56:09 -0500 Subject: [PATCH 265/427] PCI/AER: Use "aer" variable for capability offset Previously we used "pos" or "aer_pos" for the offset of the AER Capability. Use "aer" consistently and initialize it the same way everywhere. No functional change intended. 
Link: https://lore.kernel.org/r/20200529230915.GA479883@bjorn-Precision-5520 Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/pcie/aer.c | 179 +++++++++++++++++++---------------------- 1 file changed, 84 insertions(+), 95 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 61e8cb23e98b..3acf56683915 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -136,19 +136,18 @@ static const char * const ecrc_policy_str[] = { */ static int enable_ecrc_checking(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 reg32; - pos = dev->aer_cap; - if (!pos) + if (!aer) return -ENODEV; - pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, &reg32); if (reg32 & PCI_ERR_CAP_ECRC_GENC) reg32 |= PCI_ERR_CAP_ECRC_GENE; if (reg32 & PCI_ERR_CAP_ECRC_CHKC) reg32 |= PCI_ERR_CAP_ECRC_CHKE; - pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); + pci_write_config_dword(dev, aer + PCI_ERR_CAP, reg32); return 0; } @@ -161,16 +160,15 @@ static int enable_ecrc_checking(struct pci_dev *dev) */ static int disable_ecrc_checking(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 reg32; - pos = dev->aer_cap; - if (!pos) + if (!aer) return -ENODEV; - pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, &reg32); reg32 &= ~(PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE); - pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); + pci_write_config_dword(dev, aer + PCI_ERR_CAP, reg32); return 0; } @@ -253,18 +251,18 @@ void pci_aer_clear_device_status(struct pci_dev *dev) int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { - int pos = dev->aer_cap; + int aer = dev->aer_cap; u32 status, sev; if (!pcie_aer_is_native(dev)) return -EIO; /* Clear status bits for ERR_NONFATAL errors only */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER,
&sev); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, &sev); status &= ~sev; if (status) - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status); return 0; } @@ -272,18 +270,18 @@ EXPORT_SYMBOL_GPL(pci_aer_clear_nonfatal_status); void pci_aer_clear_fatal_status(struct pci_dev *dev) { - int pos = dev->aer_cap; + int aer = dev->aer_cap; u32 status, sev; if (!pcie_aer_is_native(dev)) return; /* Clear status bits for ERR_FATAL errors only */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, &sev); status &= sev; if (status) - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status); } /** @@ -297,25 +295,24 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev) */ int pci_aer_raw_clear_status(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 status; int port_type; - pos = dev->aer_cap; - if (!pos) + if (!aer) return -EIO; port_type = pci_pcie_type(dev); if (port_type == PCI_EXP_TYPE_ROOT_PORT) { - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status); - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, status); + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, &status); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, status); } - pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS, &status); - pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, status); + pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS, &status); + pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS, status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_write_config_dword(dev, pos + 
PCI_ERR_UNCOR_STATUS, status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status); return 0; } @@ -330,12 +327,11 @@ int pci_aer_clear_status(struct pci_dev *dev) void pci_save_aer_state(struct pci_dev *dev) { + int aer = dev->aer_cap; struct pci_cap_saved_state *save_state; u32 *cap; - int pos; - pos = dev->aer_cap; - if (!pos) + if (!aer) return; save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_ERR); @@ -343,22 +339,21 @@ void pci_save_aer_state(struct pci_dev *dev) return; cap = &save_state->cap.data[0]; - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, cap++); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, cap++); - pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, cap++); - pci_read_config_dword(dev, pos + PCI_ERR_CAP, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, cap++); if (pcie_cap_has_rtctl(dev)) - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, cap++); } void pci_restore_aer_state(struct pci_dev *dev) { + int aer = dev->aer_cap; struct pci_cap_saved_state *save_state; u32 *cap; - int pos; - pos = dev->aer_cap; - if (!pos) + if (!aer) return; save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_ERR); @@ -366,12 +361,12 @@ void pci_restore_aer_state(struct pci_dev *dev) return; cap = &save_state->cap.data[0]; - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, *cap++); - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, *cap++); - pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, *cap++); - pci_write_config_dword(dev, pos + PCI_ERR_CAP, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, *cap++); + 
pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_CAP, *cap++); if (pcie_cap_has_rtctl(dev)) - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, *cap++); } void pci_aer_init(struct pci_dev *dev) @@ -802,7 +797,7 @@ static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev) */ static bool is_error_source(struct pci_dev *dev, struct aer_err_info *e_info) { - int pos; + int aer = dev->aer_cap; u32 status, mask; u16 reg16; @@ -837,17 +832,16 @@ static bool is_error_source(struct pci_dev *dev, struct aer_err_info *e_info) if (!(reg16 & PCI_EXP_AER_FLAGS)) return false; - pos = dev->aer_cap; - if (!pos) + if (!aer) return false; /* Check if error is recorded */ if (e_info->severity == AER_CORRECTABLE) { - pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, &mask); + pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS, &status); + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask); } else { - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, &mask); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask); } if (status & ~mask) return true; @@ -918,16 +912,15 @@ static bool find_source_device(struct pci_dev *parent, */ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info) { - int pos; + int aer = dev->aer_cap; if (info->severity == AER_CORRECTABLE) { /* * Correctable error does not need software intervention. * No need to go through error recovery process. 
*/ - pos = dev->aer_cap; - if (pos) - pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, + if (aer) + pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS, info->status); pci_aer_clear_device_status(dev); } else if (info->severity == AER_NONFATAL) @@ -1018,22 +1011,21 @@ EXPORT_SYMBOL_GPL(aer_recover_queue); */ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) { - int pos, temp; + int aer = dev->aer_cap; + int temp; /* Must reset in this function */ info->status = 0; info->tlp_header_valid = 0; - pos = dev->aer_cap; - /* The device might not support AER */ - if (!pos) + if (!aer) return 0; if (info->severity == AER_CORRECTABLE) { - pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS, + pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS, &info->status); - pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &info->mask); if (!(info->status & ~info->mask)) return 0; @@ -1042,27 +1034,27 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) info->severity == AER_NONFATAL) { /* Link is still healthy for IO reads */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &info->status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &info->mask); if (!(info->status & ~info->mask)) return 0; /* Get First Error Pointer */ - pci_read_config_dword(dev, pos + PCI_ERR_CAP, &temp); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, &temp); info->first_error = PCI_ERR_CAP_FEP(temp); if (info->status & AER_LOG_TLP_MASKS) { info->tlp_header_valid = 1; pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG, &info->tlp.dw0); + aer + PCI_ERR_HEADER_LOG, &info->tlp.dw0); pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG + 4, &info->tlp.dw1); + aer + PCI_ERR_HEADER_LOG + 4, &info->tlp.dw1); pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG + 8, 
&info->tlp.dw2); + aer + PCI_ERR_HEADER_LOG + 8, &info->tlp.dw2); pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG + 12, &info->tlp.dw3); + aer + PCI_ERR_HEADER_LOG + 12, &info->tlp.dw3); } } @@ -1168,15 +1160,15 @@ static irqreturn_t aer_irq(int irq, void *context) struct pcie_device *pdev = (struct pcie_device *)context; struct aer_rpc *rpc = get_service_data(pdev); struct pci_dev *rp = rpc->rpd; + int aer = rp->aer_cap; struct aer_err_source e_src = {}; - int pos = rp->aer_cap; - pci_read_config_dword(rp, pos + PCI_ERR_ROOT_STATUS, &e_src.status); + pci_read_config_dword(rp, aer + PCI_ERR_ROOT_STATUS, &e_src.status); if (!(e_src.status & (PCI_ERR_ROOT_UNCOR_RCV|PCI_ERR_ROOT_COR_RCV))) return IRQ_NONE; - pci_read_config_dword(rp, pos + PCI_ERR_ROOT_ERR_SRC, &e_src.id); - pci_write_config_dword(rp, pos + PCI_ERR_ROOT_STATUS, e_src.status); + pci_read_config_dword(rp, aer + PCI_ERR_ROOT_ERR_SRC, &e_src.id); + pci_write_config_dword(rp, aer + PCI_ERR_ROOT_STATUS, e_src.status); if (!kfifo_put(&rpc->aer_fifo, e_src)) return IRQ_HANDLED; @@ -1228,7 +1220,7 @@ static void set_downstream_devices_error_reporting(struct pci_dev *dev, static void aer_enable_rootport(struct aer_rpc *rpc) { struct pci_dev *pdev = rpc->rpd; - int aer_pos; + int aer = pdev->aer_cap; u16 reg16; u32 reg32; @@ -1240,14 +1232,13 @@ static void aer_enable_rootport(struct aer_rpc *rpc) pcie_capability_clear_word(pdev, PCI_EXP_RTCTL, SYSTEM_ERROR_INTR_ON_MESG_MASK); - aer_pos = pdev->aer_cap; /* Clear error status */ - pci_read_config_dword(pdev, aer_pos + PCI_ERR_ROOT_STATUS, &reg32); - pci_write_config_dword(pdev, aer_pos + PCI_ERR_ROOT_STATUS, reg32); - pci_read_config_dword(pdev, aer_pos + PCI_ERR_COR_STATUS, &reg32); - pci_write_config_dword(pdev, aer_pos + PCI_ERR_COR_STATUS, reg32); - pci_read_config_dword(pdev, aer_pos + PCI_ERR_UNCOR_STATUS, &reg32); - pci_write_config_dword(pdev, aer_pos + PCI_ERR_UNCOR_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, &reg32); +
pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_COR_STATUS, &reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_COR_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_UNCOR_STATUS, &reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_UNCOR_STATUS, reg32); /* * Enable error reporting for the root port device and downstream port @@ -1256,9 +1247,9 @@ static void aer_enable_rootport(struct aer_rpc *rpc) set_downstream_devices_error_reporting(pdev, true); /* Enable Root Port's interrupt in response to error messages */ - pci_read_config_dword(pdev, aer_pos + PCI_ERR_ROOT_COMMAND, &reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, &reg32); reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(pdev, aer_pos + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, reg32); } /** @@ -1270,8 +1261,8 @@ static void aer_enable_rootport(struct aer_rpc *rpc) static void aer_disable_rootport(struct aer_rpc *rpc) { struct pci_dev *pdev = rpc->rpd; + int aer = pdev->aer_cap; u32 reg32; - int pos; /* * Disable error reporting for the root port device and downstream port @@ -1279,15 +1270,14 @@ static void aer_disable_rootport(struct aer_rpc *rpc) */ set_downstream_devices_error_reporting(pdev, false); - pos = pdev->aer_cap; /* Disable Root's interrupt in response to error messages */ - pci_read_config_dword(pdev, pos + PCI_ERR_ROOT_COMMAND, &reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, &reg32); reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(pdev, pos + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, reg32); /* Clear Root's error status reg */ - pci_read_config_dword(pdev, pos + PCI_ERR_ROOT_STATUS, &reg32); - pci_write_config_dword(pdev, pos + PCI_ERR_ROOT_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, &reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS,
reg32); } /** @@ -1344,28 +1334,27 @@ static int aer_probe(struct pcie_device *dev) */ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) { + int aer = dev->aer_cap; u32 reg32; - int pos; int rc; - pos = dev->aer_cap; /* Disable Root's interrupt in response to error messages */ - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, &reg32); + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, &reg32); reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, reg32); rc = pci_bus_error_reset(dev); pci_info(dev, "Root Port link has been reset\n"); /* Clear Root Error Status */ - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &reg32); - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, reg32); + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, &reg32); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, reg32); /* Enable Root Port's interrupt in response to error messages */ - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, &reg32); + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, &reg32); reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, reg32); return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; } From 9103aaf9b40c4c4e51c2a4631d221daeb7d12bad Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 9 May 2020 17:56:54 +0800 Subject: [PATCH 266/427] PCI/DPC: Print IRQ number used by port Print IRQ number used by DPC port, like AER/PME does. This makes it convenient to track the DPC interrupt counts of a given port in /proc/interrupts.
Link: https://lore.kernel.org/r/1589018214-52752-1-git-send-email-yangyicong@hisilicon.com Signed-off-by: Yicong Yang Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/dpc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 0993d51abf03..daa9a4153776 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -301,6 +301,7 @@ static int dpc_probe(struct pcie_device *dev) ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN; pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl); + pci_info(pdev, "enabled with IRQ %d\n", dev->irq); pci_info(pdev, "error containment capabilities: Int Msg #%d, RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n", cap & PCI_EXP_DPC_IRQ, FLAG(cap, PCI_EXP_DPC_CAP_RP_EXT), From 01a4dc0d8dee90d156c6af6a0d970f5a8767a90f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 May 2020 22:53:41 +0200 Subject: [PATCH 267/427] sh: sh4a: Bring back tmu3_device early device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1399c195ef50 ("sh: Switch to new style TMU device") converted tmu3_device platform device to new style of platform data but removed it from sh7786_early_devices array effectively removing last three timers and causing a warning: arch/sh/kernel/cpu/sh4a/setup-sh7786.c:243:31: warning: ‘tmu3_device’ defined but not used [-Wunused-variable] Fixes: 1399c195ef50 ("sh: Switch to new style TMU device") Signed-off-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Signed-off-by: Rich Felker --- arch/sh/kernel/cpu/sh4a/setup-sh7786.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c index 4b0db8259e3d..74620f30b19b 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c @@ -391,6 +391,7 @@ static struct platform_device *sh7786_early_devices[] 
__initdata = { &tmu0_device, &tmu1_device, &tmu2_device, + &tmu3_device, }; static struct platform_device *sh7786_devices[] __initdata = { From bd158322ba5f6190403e6aeb53c1e7b659f9ade8 Mon Sep 17 00:00:00 2001 From: Bin Meng Date: Sat, 2 May 2020 04:04:43 -0700 Subject: [PATCH 268/427] sh: Replace CONFIG_MTD_M25P80 with CONFIG_MTD_SPI_NOR in sh7757lcr_defconfig CONFIG_MTD_M25P80 was removed and replaced by CONFIG_MTD_SPI_NOR in commit b35b9a10362d ("mtd: spi-nor: Move m25p80 code in spi-nor.c") Signed-off-by: Bin Meng Signed-off-by: Rich Felker --- arch/sh/configs/sh7757lcr_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/configs/sh7757lcr_defconfig b/arch/sh/configs/sh7757lcr_defconfig index 9f2aed0b3bca..d0933a9b9799 100644 --- a/arch/sh/configs/sh7757lcr_defconfig +++ b/arch/sh/configs/sh7757lcr_defconfig @@ -36,7 +36,7 @@ CONFIG_IPV6=y # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_M25P80=y +CONFIG_MTD_SPI_NOR=y CONFIG_BLK_DEV_RAM=y CONFIG_SCSI=y CONFIG_BLK_DEV_SD=y From 2bbb0e3cc8d4cd51fce9b306e207dba668d39c5f Mon Sep 17 00:00:00 2001 From: Romain Naour Date: Sun, 15 Mar 2020 18:51:07 +0100 Subject: [PATCH 269/427] arch/sh: vmlinux.scr Since the patch [1], building the kernel using a toolchain built with Binutils 2.33.1 prevent booting a sh4 system under Qemu. Apply the patch provided by Alan Modra [2] that fix alignment of rodata. 
[1] https://sourceware.org/git/gitweb.cgi?p=binutils-gdb.git;h=ebd2263ba9a9124d93bbc0ece63d7e0fae89b40e [2] https://www.sourceware.org/ml/binutils/2019-12/msg00112.html Signed-off-by: Romain Naour Cc: Alan Modra Signed-off-by: Rich Felker --- arch/sh/boot/compressed/vmlinux.scr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/boot/compressed/vmlinux.scr b/arch/sh/boot/compressed/vmlinux.scr index 862d74808236..dd292b4b9082 100644 --- a/arch/sh/boot/compressed/vmlinux.scr +++ b/arch/sh/boot/compressed/vmlinux.scr @@ -1,6 +1,6 @@ SECTIONS { - .rodata..compressed : { + .rodata..compressed : ALIGN(8) { input_len = .; LONG(input_data_end - input_data) input_data = .; *(.data) From eface6c5d1a341f59fe6b264f6c798cb259df9b1 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 17 Feb 2020 17:54:55 +0100 Subject: [PATCH 270/427] sh: configs: Cleanup old Kconfig IO scheduler options CONFIG_IOSCHED_DEADLINE and CONFIG_IOSCHED_CFQ are gone since commit f382fb0bcef4 ("block: remove legacy IO schedulers"). The IOSCHED_DEADLINE was replaced by MQ_IOSCHED_DEADLINE and it will now be enabled by default (along with MQ_IOSCHED_KYBER). The BFQ_GROUP_IOSCHED is the only multiqueue scheduler which comes with group scheduling, so select it in configs previously choosing CFQ_GROUP_IOSCHED.
Signed-off-by: Krzysztof Kozlowski Signed-off-by: Rich Felker --- arch/sh/configs/apsh4ad0a_defconfig | 3 ++- arch/sh/configs/kfr2r09_defconfig | 2 -- arch/sh/configs/magicpanelr2_defconfig | 2 -- arch/sh/configs/polaris_defconfig | 1 - arch/sh/configs/r7780mp_defconfig | 2 -- arch/sh/configs/r7785rp_defconfig | 2 -- arch/sh/configs/rsk7201_defconfig | 2 -- arch/sh/configs/rsk7203_defconfig | 2 -- arch/sh/configs/rsk7264_defconfig | 2 -- arch/sh/configs/rsk7269_defconfig | 2 -- arch/sh/configs/sdk7786_defconfig | 3 ++- arch/sh/configs/se7206_defconfig | 2 -- arch/sh/configs/se7343_defconfig | 1 - arch/sh/configs/se7619_defconfig | 2 -- arch/sh/configs/se7705_defconfig | 2 -- arch/sh/configs/se7712_defconfig | 2 -- arch/sh/configs/se7721_defconfig | 2 -- arch/sh/configs/se7722_defconfig | 2 -- arch/sh/configs/se7780_defconfig | 1 - arch/sh/configs/sh7710voipgw_defconfig | 1 - arch/sh/configs/shmin_defconfig | 2 -- arch/sh/configs/ul2_defconfig | 2 -- 22 files changed, 4 insertions(+), 38 deletions(-) diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig index 6dd0da73ca5a..6abd9bd70106 100644 --- a/arch/sh/configs/apsh4ad0a_defconfig +++ b/arch/sh/configs/apsh4ad0a_defconfig @@ -20,7 +20,8 @@ CONFIG_PROFILING=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -CONFIG_CFQ_GROUP_IOSCHED=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_CPU_SUBTYPE_SH7786=y CONFIG_MEMORY_SIZE=0x10000000 CONFIG_HUGETLB_PAGE_SIZE_1MB=y diff --git a/arch/sh/configs/kfr2r09_defconfig b/arch/sh/configs/kfr2r09_defconfig index 1dc3f670c481..833404490cfe 100644 --- a/arch/sh/configs/kfr2r09_defconfig +++ b/arch/sh/configs/kfr2r09_defconfig @@ -10,8 +10,6 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7724=y CONFIG_MEMORY_SIZE=0x08000000 CONFIG_FLATMEM_MANUAL=y diff --git 
a/arch/sh/configs/magicpanelr2_defconfig b/arch/sh/configs/magicpanelr2_defconfig index 664c4dee6e6a..0989ed929540 100644 --- a/arch/sh/configs/magicpanelr2_defconfig +++ b/arch/sh/configs/magicpanelr2_defconfig @@ -14,8 +14,6 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7720=y CONFIG_MEMORY_START=0x0C000000 CONFIG_MEMORY_SIZE=0x03F00000 diff --git a/arch/sh/configs/polaris_defconfig b/arch/sh/configs/polaris_defconfig index e3a1d3d2694a..246408ec7462 100644 --- a/arch/sh/configs/polaris_defconfig +++ b/arch/sh/configs/polaris_defconfig @@ -12,7 +12,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODVERSIONS=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set CONFIG_CPU_SUBTYPE_SH7709=y CONFIG_MEMORY_START=0x0C000000 CONFIG_FLATMEM_MANUAL=y diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig index 0a18f8011c55..c97ec60cff27 100644 --- a/arch/sh/configs/r7780mp_defconfig +++ b/arch/sh/configs/r7780mp_defconfig @@ -12,8 +12,6 @@ CONFIG_OPROFILE=m CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7780=y CONFIG_MEMORY_SIZE=0x08000000 CONFIG_FLATMEM_MANUAL=y diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig index 7226ac5a1d44..55fce65eb454 100644 --- a/arch/sh/configs/r7785rp_defconfig +++ b/arch/sh/configs/r7785rp_defconfig @@ -15,8 +15,6 @@ CONFIG_KPROBES=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7785=y CONFIG_MEMORY_SIZE=0x08000000 CONFIG_HUGETLB_PAGE_SIZE_1MB=y diff --git a/arch/sh/configs/rsk7201_defconfig b/arch/sh/configs/rsk7201_defconfig index 9f4f474705b7..841809b5c2dc 100644 --- 
a/arch/sh/configs/rsk7201_defconfig +++ b/arch/sh/configs/rsk7201_defconfig @@ -15,8 +15,6 @@ CONFIG_PROFILING=y CONFIG_OPROFILE=y CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7201=y CONFIG_MEMORY_SIZE=0x01000000 CONFIG_FLATMEM_MANUAL=y diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig index 10a32bd4cf66..0055031664ad 100644 --- a/arch/sh/configs/rsk7203_defconfig +++ b/arch/sh/configs/rsk7203_defconfig @@ -16,8 +16,6 @@ CONFIG_PROFILING=y CONFIG_OPROFILE=y CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7203=y CONFIG_MEMORY_START=0x0c000000 CONFIG_MEMORY_SIZE=0x01000000 diff --git a/arch/sh/configs/rsk7264_defconfig b/arch/sh/configs/rsk7264_defconfig index 78643191c99e..f7b9c528c6df 100644 --- a/arch/sh/configs/rsk7264_defconfig +++ b/arch/sh/configs/rsk7264_defconfig @@ -17,8 +17,6 @@ CONFIG_MMAP_ALLOW_UNINITIALIZED=y CONFIG_PROFILING=y # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7264=y CONFIG_MEMORY_START=0x0c000000 CONFIG_FLATMEM_MANUAL=y diff --git a/arch/sh/configs/rsk7269_defconfig b/arch/sh/configs/rsk7269_defconfig index fb9fa7faf635..4bff14fb185d 100644 --- a/arch/sh/configs/rsk7269_defconfig +++ b/arch/sh/configs/rsk7269_defconfig @@ -4,8 +4,6 @@ CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set CONFIG_SLAB=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_SWAP_IO_SPACE=y CONFIG_CPU_SUBTYPE_SH7269=y CONFIG_MEMORY_START=0x0c000000 diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index 7fa116b436c3..61bec46ebd66 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -39,7 +39,8 @@ 
CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_CFQ_GROUP_IOSCHED=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_CPU_SUBTYPE_SH7786=y CONFIG_MEMORY_START=0x40000000 CONFIG_MEMORY_SIZE=0x20000000 diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig index a93402b3a319..21a43f14ffac 100644 --- a/arch/sh/configs/se7206_defconfig +++ b/arch/sh/configs/se7206_defconfig @@ -28,8 +28,6 @@ CONFIG_OPROFILE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7206=y CONFIG_MEMORY_START=0x0c000000 CONFIG_FLATMEM_MANUAL=y diff --git a/arch/sh/configs/se7343_defconfig b/arch/sh/configs/se7343_defconfig index 06d067c842cd..4e794e719a28 100644 --- a/arch/sh/configs/se7343_defconfig +++ b/arch/sh/configs/se7343_defconfig @@ -11,7 +11,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7343=y CONFIG_MEMORY_START=0x0c000000 CONFIG_MEMORY_SIZE=0x01000000 diff --git a/arch/sh/configs/se7619_defconfig b/arch/sh/configs/se7619_defconfig index f54722dbc8f5..3264415a5931 100644 --- a/arch/sh/configs/se7619_defconfig +++ b/arch/sh/configs/se7619_defconfig @@ -11,8 +11,6 @@ CONFIG_LOG_BUF_SHIFT=14 # CONFIG_VM_EVENT_COUNTERS is not set CONFIG_SLAB=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_MEMORY_START=0x0c000000 CONFIG_FLATMEM_MANUAL=y CONFIG_CPU_BIG_ENDIAN=y diff --git a/arch/sh/configs/se7705_defconfig b/arch/sh/configs/se7705_defconfig index ddfc69841955..4496b94b7d88 100644 --- a/arch/sh/configs/se7705_defconfig +++ b/arch/sh/configs/se7705_defconfig @@ -8,8 +8,6 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_SLAB=y CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set 
CONFIG_CPU_SUBTYPE_SH7705=y CONFIG_MEMORY_START=0x0c000000 CONFIG_MEMORY_SIZE=0x02000000 diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig index 9a527f978106..ee6d28ae08de 100644 --- a/arch/sh/configs/se7712_defconfig +++ b/arch/sh/configs/se7712_defconfig @@ -12,8 +12,6 @@ CONFIG_KALLSYMS_ALL=y CONFIG_SLAB=y CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7712=y CONFIG_MEMORY_START=0x0c000000 CONFIG_MEMORY_SIZE=0x02000000 diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig index 3b0e1eb6e874..bad921bc10f8 100644 --- a/arch/sh/configs/se7721_defconfig +++ b/arch/sh/configs/se7721_defconfig @@ -12,8 +12,6 @@ CONFIG_KALLSYMS_ALL=y CONFIG_SLAB=y CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7721=y CONFIG_MEMORY_START=0x0c000000 CONFIG_MEMORY_SIZE=0x02000000 diff --git a/arch/sh/configs/se7722_defconfig b/arch/sh/configs/se7722_defconfig index 88bf9e849008..09e455817447 100644 --- a/arch/sh/configs/se7722_defconfig +++ b/arch/sh/configs/se7722_defconfig @@ -8,8 +8,6 @@ CONFIG_PROFILING=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7722=y CONFIG_MEMORY_START=0x0c000000 CONFIG_NUMA=y diff --git a/arch/sh/configs/se7780_defconfig b/arch/sh/configs/se7780_defconfig index ec32c82646ed..dcd85b858ac8 100644 --- a/arch/sh/configs/se7780_defconfig +++ b/arch/sh/configs/se7780_defconfig @@ -9,7 +9,6 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7780=y CONFIG_MEMORY_SIZE=0x08000000 CONFIG_SH_7780_SOLUTION_ENGINE=y diff --git a/arch/sh/configs/sh7710voipgw_defconfig b/arch/sh/configs/sh7710voipgw_defconfig index 
c86f28442a80..08426913c0e3 100644 --- a/arch/sh/configs/sh7710voipgw_defconfig +++ b/arch/sh/configs/sh7710voipgw_defconfig @@ -11,7 +11,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7710=y CONFIG_MEMORY_START=0x0c000000 CONFIG_MEMORY_SIZE=0x00800000 diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig index d589cfdfb7eb..a27b129b93c5 100644 --- a/arch/sh/configs/shmin_defconfig +++ b/arch/sh/configs/shmin_defconfig @@ -12,8 +12,6 @@ CONFIG_LOG_BUF_SHIFT=14 # CONFIG_SHMEM is not set CONFIG_SLOB=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7706=y CONFIG_MEMORY_START=0x0c000000 CONFIG_MEMORY_SIZE=0x00800000 diff --git a/arch/sh/configs/ul2_defconfig b/arch/sh/configs/ul2_defconfig index dc2e3061130f..103b81ec1ffb 100644 --- a/arch/sh/configs/ul2_defconfig +++ b/arch/sh/configs/ul2_defconfig @@ -8,8 +8,6 @@ CONFIG_PROFILING=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_CPU_SUBTYPE_SH7366=y CONFIG_MEMORY_SIZE=0x01f00000 CONFIG_NUMA=y From 6410607b6eae1a56db4aba44ccc4eb5979cf60ae Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 20 Jan 2020 10:22:13 +0900 Subject: [PATCH 271/427] sh: Add missing DECLARE_EXPORT() for __ashiftrt_r4_xx __ashiftrt_r4_xx might be used from kernel module. We need DECLARE_EXPORT() for them, otherwise we will get compile error. This patch adds missing DECLARE_EXPORT() ERROR: "__ashiftrt_r4_25" [drivers/iio/pressure/bmp280.ko] undefined! ERROR: "__ashiftrt_r4_26" [drivers/iio/dac/ad5764.ko] undefined! ERROR: "__ashiftrt_r4_26" [drivers/iio/accel/mma7660.ko] undefined! ERROR: "__ashiftrt_r4_25" [drivers/iio/accel/dmard06.ko] undefined! ERROR: "__ashiftrt_r4_26" [drivers/iio/accel/bma220_spi.ko] undefined! 
ERROR: "__ashiftrt_r4_25" [drivers/crypto/hisilicon/sec/hisi_sec.ko] undefined! ERROR: "__ashiftrt_r4_26" [drivers/rtc/rtc-x1205.ko] undefined! ERROR: "__ashiftrt_r4_25" [drivers/rtc/rtc-pcf85063.ko] undefined! ERROR: "__ashiftrt_r4_25" [drivers/rtc/rtc-pcf2123.ko] undefined! ERROR: "__ashiftrt_r4_25" [drivers/input/tablet/gtco.ko] undefined! ERROR: "__ashiftrt_r4_26" [drivers/input/mouse/psmouse.ko] undefined! ERROR: "__ashiftrt_r4_28" [drivers/input/mouse/psmouse.ko] undefined! ERROR: "__ashiftrt_r4_28" [drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.ko] undefined! ERROR: "__ashiftrt_r4_28" [fs/udf/udf.ko] undefined! Signed-off-by: Kuninori Morimoto Signed-off-by: Rich Felker --- arch/sh/kernel/sh_ksyms_32.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/sh/kernel/sh_ksyms_32.c b/arch/sh/kernel/sh_ksyms_32.c index 282774472603..5858936cb431 100644 --- a/arch/sh/kernel/sh_ksyms_32.c +++ b/arch/sh/kernel/sh_ksyms_32.c @@ -38,6 +38,13 @@ DECLARE_EXPORT(__ashlsi3); DECLARE_EXPORT(__lshrsi3_r0); DECLARE_EXPORT(__ashrsi3_r0); DECLARE_EXPORT(__ashlsi3_r0); + +DECLARE_EXPORT(__ashiftrt_r4_0); +DECLARE_EXPORT(__ashiftrt_r4_1); +DECLARE_EXPORT(__ashiftrt_r4_2); +DECLARE_EXPORT(__ashiftrt_r4_3); +DECLARE_EXPORT(__ashiftrt_r4_4); +DECLARE_EXPORT(__ashiftrt_r4_5); DECLARE_EXPORT(__ashiftrt_r4_6); DECLARE_EXPORT(__ashiftrt_r4_7); DECLARE_EXPORT(__ashiftrt_r4_8); @@ -48,13 +55,23 @@ DECLARE_EXPORT(__ashiftrt_r4_12); DECLARE_EXPORT(__ashiftrt_r4_13); DECLARE_EXPORT(__ashiftrt_r4_14); DECLARE_EXPORT(__ashiftrt_r4_15); +DECLARE_EXPORT(__ashiftrt_r4_16); +DECLARE_EXPORT(__ashiftrt_r4_17); +DECLARE_EXPORT(__ashiftrt_r4_18); +DECLARE_EXPORT(__ashiftrt_r4_19); DECLARE_EXPORT(__ashiftrt_r4_20); DECLARE_EXPORT(__ashiftrt_r4_21); DECLARE_EXPORT(__ashiftrt_r4_22); DECLARE_EXPORT(__ashiftrt_r4_23); DECLARE_EXPORT(__ashiftrt_r4_24); +DECLARE_EXPORT(__ashiftrt_r4_25); +DECLARE_EXPORT(__ashiftrt_r4_26); DECLARE_EXPORT(__ashiftrt_r4_27); 
+DECLARE_EXPORT(__ashiftrt_r4_28); +DECLARE_EXPORT(__ashiftrt_r4_29); DECLARE_EXPORT(__ashiftrt_r4_30); +DECLARE_EXPORT(__ashiftrt_r4_31); +DECLARE_EXPORT(__ashiftrt_r4_32); DECLARE_EXPORT(__movstr); DECLARE_EXPORT(__movstrSI8); DECLARE_EXPORT(__movstrSI12); From 4580ba4ad2e6b8ddaada3db61d179d4dfac12047 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 20 Jan 2020 10:22:17 +0900 Subject: [PATCH 272/427] sh: Convert iounmap() macros to inline functions The iounmap() macro does nothing, but that results in unused variable warnings all over the place. This patch converts it to an inline function to avoid the warning. We will get this warning without this patch: ${LINUX}/drivers/thermal/broadcom/ns-thermal.c:78:21: \ warning: unused variable 'ns_thermal' [-Wunused-variable] struct ns_thermal *ns_thermal = platform_get_drvdata(pdev); ^~~~~~~~~~ Fixes: 98c90e5ea34e9 ("sh: remove __iounmap") Signed-off-by: Kuninori Morimoto Signed-off-by: Rich Felker --- arch/sh/include/asm/io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index 39c9ead489e5..b42228906eaf 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -328,7 +328,7 @@ __ioremap_mode(phys_addr_t offset, unsigned long size, pgprot_t prot) #else #define __ioremap(offset, size, prot) ((void __iomem *)(offset)) #define __ioremap_mode(offset, size, prot) ((void __iomem *)(offset)) -#define iounmap(addr) do { } while (0) +static inline void iounmap(void __iomem *addr) {} #endif /* CONFIG_MMU */ static inline void __iomem *ioremap(phys_addr_t offset, unsigned long size) From 3125ddc42487307b59e44c405a3282770475150d Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 20 Jan 2020 10:22:21 +0900 Subject: [PATCH 273/427] sh: Convert ins[bwl]/outs[bwl] macros to inline functions The ins[bwl]/outs[bwl] macros just call BUG(), but that results in unused variable warnings all over the place.
This patch convert macro to inline to avoid warning We will get this kind of warning without this patch ${LINUX}/drivers/iio/adc/ad7606_par.c:21:23: \ warning: unused variable 'st' [-Wunused-variable] struct ad7606_state *st = iio_priv(indio_dev); ^~ Signed-off-by: Kuninori Morimoto Signed-off-by: Rich Felker --- arch/sh/include/asm/io_noioport.h | 34 +++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/arch/sh/include/asm/io_noioport.h b/arch/sh/include/asm/io_noioport.h index 90d6109f1622..f7938fe0f911 100644 --- a/arch/sh/include/asm/io_noioport.h +++ b/arch/sh/include/asm/io_noioport.h @@ -53,12 +53,34 @@ static inline void ioport_unmap(void __iomem *addr) #define outw_p(x, addr) outw((x), (addr)) #define outl_p(x, addr) outl((x), (addr)) -#define insb(a, b, c) BUG() -#define insw(a, b, c) BUG() -#define insl(a, b, c) BUG() +static inline void insb(unsigned long port, void *dst, unsigned long count) +{ + BUG(); +} -#define outsb(a, b, c) BUG() -#define outsw(a, b, c) BUG() -#define outsl(a, b, c) BUG() +static inline void insw(unsigned long port, void *dst, unsigned long count) +{ + BUG(); +} + +static inline void insl(unsigned long port, void *dst, unsigned long count) +{ + BUG(); +} + +static inline void outsb(unsigned long port, const void *src, unsigned long count) +{ + BUG(); +} + +static inline void outsw(unsigned long port, const void *src, unsigned long count) +{ + BUG(); +} + +static inline void outsl(unsigned long port, const void *src, unsigned long count) +{ + BUG(); +} #endif /* __ASM_SH_IO_NOIOPORT_H */ From d1f56f318d234fc5db230af2f3e0088f689ab3c0 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 12 Dec 2019 11:38:43 +0900 Subject: [PATCH 274/427] sh: add missing EXPORT_SYMBOL() for __delay __delay() is used from kernel module. We need EXPORT_SYMBOL(), otherwise we will get compile error. ERROR: "__delay" [drivers/net/phy/mdio-cavium.ko] undefined! 
Signed-off-by: Kuninori Morimoto Signed-off-by: Rich Felker --- arch/sh/lib/delay.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c index dad8e6a54906..540e670dbafc 100644 --- a/arch/sh/lib/delay.c +++ b/arch/sh/lib/delay.c @@ -29,6 +29,7 @@ void __delay(unsigned long loops) : "0" (loops) : "t"); } +EXPORT_SYMBOL(__delay); inline void __const_udelay(unsigned long xloops) { From 37744feebc086908fd89760650f458ab19071750 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Apr 2020 11:37:12 +0200 Subject: [PATCH 275/427] sh: remove sh5 support sh5 never became a product and has probably never really worked. Remove it by recursively deleting all associated Kconfig options and all corresponding files. Reviewed-by: Geert Uytterhoeven Signed-off-by: Arnd Bergmann Signed-off-by: Rich Felker --- arch/sh/Kconfig | 62 +- arch/sh/Kconfig.cpu | 9 - arch/sh/Kconfig.debug | 13 +- arch/sh/Makefile | 29 +- arch/sh/boot/compressed/Makefile | 12 +- arch/sh/boot/compressed/misc.c | 8 - arch/sh/drivers/pci/Makefile | 1 - arch/sh/drivers/pci/ops-sh5.c | 65 - arch/sh/drivers/pci/pci-sh5.c | 217 --- arch/sh/drivers/pci/pci-sh5.h | 108 -- arch/sh/include/asm/barrier.h | 4 +- arch/sh/include/asm/bitops.h | 26 - arch/sh/include/asm/bl_bit.h | 11 +- arch/sh/include/asm/bl_bit_64.h | 37 - arch/sh/include/asm/bugs.h | 4 - arch/sh/include/asm/cache_insns.h | 12 +- arch/sh/include/asm/cache_insns_64.h | 20 - arch/sh/include/asm/checksum.h | 6 +- arch/sh/include/asm/elf.h | 23 - arch/sh/include/asm/extable.h | 4 - arch/sh/include/asm/fixmap.h | 4 - arch/sh/include/asm/io.h | 4 - arch/sh/include/asm/irq.h | 3 - arch/sh/include/asm/mmu_context.h | 12 - arch/sh/include/asm/mmu_context_64.h | 75 - arch/sh/include/asm/page.h | 21 +- arch/sh/include/asm/pgtable.h | 17 - arch/sh/include/asm/pgtable_64.h | 307 ---- arch/sh/include/asm/posix_types.h | 6 +- arch/sh/include/asm/processor.h | 14 +- arch/sh/include/asm/processor_64.h | 212 --- 
arch/sh/include/asm/ptrace_64.h | 14 - arch/sh/include/asm/string.h | 6 +- arch/sh/include/asm/string_64.h | 21 - arch/sh/include/asm/switch_to.h | 11 +- arch/sh/include/asm/switch_to_64.h | 32 - arch/sh/include/asm/syscall.h | 6 +- arch/sh/include/asm/syscall_64.h | 75 - arch/sh/include/asm/syscalls.h | 9 +- arch/sh/include/asm/syscalls_64.h | 18 - arch/sh/include/asm/thread_info.h | 4 +- arch/sh/include/asm/tlb.h | 6 +- arch/sh/include/asm/tlb_64.h | 68 - arch/sh/include/asm/traps.h | 4 - arch/sh/include/asm/traps_64.h | 35 - arch/sh/include/asm/types.h | 5 - arch/sh/include/asm/uaccess.h | 4 - arch/sh/include/asm/uaccess_64.h | 85 - arch/sh/include/asm/unistd.h | 6 +- arch/sh/include/asm/user.h | 7 - arch/sh/include/asm/vermagic.h | 4 - arch/sh/include/asm/vmlinux.lds.h | 8 - arch/sh/include/cpu-sh5/cpu/addrspace.h | 12 - arch/sh/include/cpu-sh5/cpu/cache.h | 94 - arch/sh/include/cpu-sh5/cpu/irq.h | 113 -- arch/sh/include/cpu-sh5/cpu/mmu_context.h | 22 - arch/sh/include/cpu-sh5/cpu/registers.h | 103 -- arch/sh/include/cpu-sh5/cpu/rtc.h | 9 - arch/sh/include/uapi/asm/posix_types.h | 8 +- arch/sh/include/uapi/asm/posix_types_64.h | 29 - arch/sh/include/uapi/asm/ptrace.h | 5 - arch/sh/include/uapi/asm/ptrace_64.h | 15 - arch/sh/include/uapi/asm/sigcontext.h | 13 - arch/sh/include/uapi/asm/stat.h | 61 - arch/sh/include/uapi/asm/swab.h | 10 - arch/sh/include/uapi/asm/unistd.h | 8 +- arch/sh/include/uapi/asm/unistd_64.h | 423 ----- arch/sh/kernel/Makefile | 16 +- arch/sh/kernel/cpu/Makefile | 1 - arch/sh/kernel/cpu/init.c | 2 +- arch/sh/kernel/cpu/irq/Makefile | 3 +- arch/sh/kernel/cpu/irq/intc-sh5.c | 194 -- arch/sh/kernel/cpu/proc.c | 1 - arch/sh/kernel/cpu/sh5/Makefile | 16 - arch/sh/kernel/cpu/sh5/clock-sh5.c | 76 - arch/sh/kernel/cpu/sh5/entry.S | 2000 --------------------- arch/sh/kernel/cpu/sh5/fpu.c | 106 -- arch/sh/kernel/cpu/sh5/probe.c | 72 - arch/sh/kernel/cpu/sh5/setup-sh5.c | 121 -- arch/sh/kernel/cpu/sh5/switchto.S | 195 -- 
arch/sh/kernel/cpu/sh5/unwind.c | 342 ---- arch/sh/kernel/head_64.S | 346 ---- arch/sh/kernel/irq_64.c | 48 - arch/sh/kernel/module.c | 9 - arch/sh/kernel/process.c | 2 - arch/sh/kernel/process_64.c | 461 ----- arch/sh/kernel/ptrace_64.c | 576 ------ arch/sh/kernel/reboot.c | 6 - arch/sh/kernel/sh_ksyms_64.c | 51 - arch/sh/kernel/signal_64.c | 567 ------ arch/sh/kernel/syscalls_64.S | 419 ----- arch/sh/kernel/traps_64.c | 814 --------- arch/sh/kernel/vmlinux.lds.S | 18 +- arch/sh/lib64/Makefile | 17 - arch/sh/lib64/copy_page.S | 89 - arch/sh/lib64/copy_user_memcpy.S | 218 --- arch/sh/lib64/memcpy.S | 202 --- arch/sh/lib64/memset.S | 92 - arch/sh/lib64/panic.c | 15 - arch/sh/lib64/sdivsi3.S | 136 -- arch/sh/lib64/strcpy.S | 98 - arch/sh/lib64/strlen.S | 34 - arch/sh/lib64/udelay.c | 49 - arch/sh/lib64/udivdi3.S | 121 -- arch/sh/lib64/udivsi3.S | 60 - arch/sh/mm/Kconfig | 16 +- arch/sh/mm/Makefile | 31 +- arch/sh/mm/cache-sh5.c | 621 ------- arch/sh/mm/cache.c | 6 - arch/sh/mm/extable_64.c | 84 - arch/sh/mm/tlb-sh5.c | 224 --- arch/sh/mm/tlbex_64.c | 166 -- arch/sh/mm/tlbflush_64.c | 172 -- drivers/rtc/Kconfig | 2 +- fs/Kconfig.binfmt | 2 +- scripts/headers_install.sh | 3 - tools/arch/sh/include/asm/barrier.h | 2 +- 117 files changed, 67 insertions(+), 11554 deletions(-) delete mode 100644 arch/sh/drivers/pci/ops-sh5.c delete mode 100644 arch/sh/drivers/pci/pci-sh5.c delete mode 100644 arch/sh/drivers/pci/pci-sh5.h delete mode 100644 arch/sh/include/asm/bl_bit_64.h delete mode 100644 arch/sh/include/asm/cache_insns_64.h delete mode 100644 arch/sh/include/asm/mmu_context_64.h delete mode 100644 arch/sh/include/asm/pgtable_64.h delete mode 100644 arch/sh/include/asm/processor_64.h delete mode 100644 arch/sh/include/asm/ptrace_64.h delete mode 100644 arch/sh/include/asm/string_64.h delete mode 100644 arch/sh/include/asm/switch_to_64.h delete mode 100644 arch/sh/include/asm/syscall_64.h delete mode 100644 arch/sh/include/asm/syscalls_64.h delete mode 100644 
arch/sh/include/asm/tlb_64.h delete mode 100644 arch/sh/include/asm/traps_64.h delete mode 100644 arch/sh/include/asm/uaccess_64.h delete mode 100644 arch/sh/include/cpu-sh5/cpu/addrspace.h delete mode 100644 arch/sh/include/cpu-sh5/cpu/cache.h delete mode 100644 arch/sh/include/cpu-sh5/cpu/irq.h delete mode 100644 arch/sh/include/cpu-sh5/cpu/mmu_context.h delete mode 100644 arch/sh/include/cpu-sh5/cpu/registers.h delete mode 100644 arch/sh/include/cpu-sh5/cpu/rtc.h delete mode 100644 arch/sh/include/uapi/asm/posix_types_64.h delete mode 100644 arch/sh/include/uapi/asm/ptrace_64.h delete mode 100644 arch/sh/include/uapi/asm/unistd_64.h delete mode 100644 arch/sh/kernel/cpu/irq/intc-sh5.c delete mode 100644 arch/sh/kernel/cpu/sh5/Makefile delete mode 100644 arch/sh/kernel/cpu/sh5/clock-sh5.c delete mode 100644 arch/sh/kernel/cpu/sh5/entry.S delete mode 100644 arch/sh/kernel/cpu/sh5/fpu.c delete mode 100644 arch/sh/kernel/cpu/sh5/probe.c delete mode 100644 arch/sh/kernel/cpu/sh5/setup-sh5.c delete mode 100644 arch/sh/kernel/cpu/sh5/switchto.S delete mode 100644 arch/sh/kernel/cpu/sh5/unwind.c delete mode 100644 arch/sh/kernel/head_64.S delete mode 100644 arch/sh/kernel/irq_64.c delete mode 100644 arch/sh/kernel/process_64.c delete mode 100644 arch/sh/kernel/ptrace_64.c delete mode 100644 arch/sh/kernel/sh_ksyms_64.c delete mode 100644 arch/sh/kernel/signal_64.c delete mode 100644 arch/sh/kernel/syscalls_64.S delete mode 100644 arch/sh/kernel/traps_64.c delete mode 100644 arch/sh/lib64/Makefile delete mode 100644 arch/sh/lib64/copy_page.S delete mode 100644 arch/sh/lib64/copy_user_memcpy.S delete mode 100644 arch/sh/lib64/memcpy.S delete mode 100644 arch/sh/lib64/memset.S delete mode 100644 arch/sh/lib64/panic.c delete mode 100644 arch/sh/lib64/sdivsi3.S delete mode 100644 arch/sh/lib64/strcpy.S delete mode 100644 arch/sh/lib64/strlen.S delete mode 100644 arch/sh/lib64/udelay.c delete mode 100644 arch/sh/lib64/udivdi3.S delete mode 100644 arch/sh/lib64/udivsi3.S 
delete mode 100644 arch/sh/mm/cache-sh5.c delete mode 100644 arch/sh/mm/extable_64.c delete mode 100644 arch/sh/mm/tlb-sh5.c delete mode 100644 arch/sh/mm/tlbex_64.c delete mode 100644 arch/sh/mm/tlbflush_64.c diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index b4f0e37b83eb..74403e80221c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -54,15 +54,6 @@ config SUPERH select HAVE_NMI select NEED_SG_DMA_LENGTH select ARCH_HAS_GIGANTIC_PAGE - - help - The SuperH is a RISC processor targeted for use in embedded systems - and consumer electronics; it was also used in the Sega Dreamcast - gaming console. The SuperH port has a home page at - . - -config SUPERH32 - def_bool "$(ARCH)" = "sh" select ARCH_32BIT_OFF_T select GUP_GET_PTE_LOW_HIGH if X2TLB select HAVE_KPROBES @@ -81,19 +72,15 @@ config SUPERH32 select ARCH_HIBERNATION_POSSIBLE if MMU select SPARSE_IRQ select HAVE_STACKPROTECTOR - -config SUPERH64 - def_bool "$(ARCH)" = "sh64" - select HAVE_EXIT_THREAD - select KALLSYMS + help + The SuperH is a RISC processor targeted for use in embedded systems + and consumer electronics; it was also used in the Sega Dreamcast + gaming console. The SuperH port has a home page at + . 
config GENERIC_BUG def_bool y - depends on BUG && SUPERH32 - -config GENERIC_CSUM - def_bool y - depends on SUPERH64 + depends on BUG config GENERIC_HWEIGHT def_bool y @@ -203,12 +190,6 @@ config CPU_SH4AL_DSP select CPU_SH4A select CPU_HAS_DSP -config CPU_SH5 - bool - select CPU_HAS_FPU - select SYS_SUPPORTS_SH_TMU - select SYS_SUPPORTS_HUGETLBFS if MMU - config CPU_SHX2 bool @@ -228,8 +209,6 @@ config CPU_HAS_PMU default y bool -if SUPERH32 - choice prompt "Processor sub-type selection" @@ -518,27 +497,6 @@ config CPU_SUBTYPE_SH7366 endchoice -endif - -if SUPERH64 - -choice - prompt "Processor sub-type selection" - -# SH-5 Processor Support - -config CPU_SUBTYPE_SH5_101 - bool "Support SH5-101 processor" - select CPU_SH5 - -config CPU_SUBTYPE_SH5_103 - bool "Support SH5-103 processor" - select CPU_SH5 - -endchoice - -endif - source "arch/sh/mm/Kconfig" source "arch/sh/Kconfig.cpu" @@ -592,7 +550,7 @@ source "kernel/Kconfig.hz" config KEXEC bool "kexec system call (EXPERIMENTAL)" - depends on SUPERH32 && MMU + depends on MMU select KEXEC_CORE help kexec is a system call that implements the ability to shutdown your @@ -610,7 +568,7 @@ config KEXEC config CRASH_DUMP bool "kernel crash dumps (EXPERIMENTAL)" - depends on SUPERH32 && BROKEN_ON_SMP + depends on BROKEN_ON_SMP help Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels @@ -624,7 +582,7 @@ config CRASH_DUMP config KEXEC_JUMP bool "kexec jump (EXPERIMENTAL)" - depends on SUPERH32 && KEXEC && HIBERNATION + depends on KEXEC && HIBERNATION help Jump between original kernel and kexeced kernel and invoke code via KEXEC @@ -701,7 +659,7 @@ config HOTPLUG_CPU config GUSA def_bool y - depends on !SMP && SUPERH32 + depends on !SMP help This enables support for gUSA (general UserSpace Atomicity). 
This is the default implementation for both UP and non-ll/sc diff --git a/arch/sh/Kconfig.cpu b/arch/sh/Kconfig.cpu index 4a4edc7e03d4..97ca35f2cd37 100644 --- a/arch/sh/Kconfig.cpu +++ b/arch/sh/Kconfig.cpu @@ -13,7 +13,6 @@ config CPU_LITTLE_ENDIAN config CPU_BIG_ENDIAN bool "Big Endian" - depends on !CPU_SH5 endchoice @@ -27,10 +26,6 @@ config SH_FPU This option must be set in order to enable the FPU. -config SH64_FPU_DENORM_FLUSH - bool "Flush floating point denorms to zero" - depends on SH_FPU && SUPERH64 - config SH_FPU_EMU def_bool n prompt "FPU emulation support" @@ -77,10 +72,6 @@ config SPECULATIVE_EXECUTION If unsure, say N. -config SH64_ID2815_WORKAROUND - bool "Include workaround for SH5-101 cut2 silicon defect ID2815" - depends on CPU_SUBTYPE_SH5_101 - config CPU_HAS_INTEVT bool diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug index 010b6c33bbba..28a43d63bde1 100644 --- a/arch/sh/Kconfig.debug +++ b/arch/sh/Kconfig.debug @@ -5,7 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT config SH_STANDARD_BIOS bool "Use LinuxSH standard BIOS" - depends on SUPERH32 help Say Y here if your target has the gdb-sh-stub package from www.m17n.org (or any conforming standard LinuxSH BIOS) @@ -19,7 +18,7 @@ config SH_STANDARD_BIOS config STACK_DEBUG bool "Check for stack overflows" - depends on DEBUG_KERNEL && SUPERH32 + depends on DEBUG_KERNEL help This option will cause messages to be printed if free stack space drops below a certain limit. Saying Y here will add overhead to @@ -38,7 +37,7 @@ config 4KSTACKS config IRQSTACKS bool "Use separate kernel stacks when processing interrupts" - depends on DEBUG_KERNEL && SUPERH32 && BROKEN + depends on DEBUG_KERNEL && BROKEN help If you say Y here the kernel will use separate kernel stacks for handling hard and soft interrupts. 
This can help avoid @@ -46,7 +45,7 @@ config IRQSTACKS config DUMP_CODE bool "Show disassembly of nearby code in register dumps" - depends on DEBUG_KERNEL && SUPERH32 + depends on DEBUG_KERNEL default y if DEBUG_BUGVERBOSE default n help @@ -59,7 +58,6 @@ config DUMP_CODE config DWARF_UNWINDER bool "Enable the DWARF unwinder for stacktraces" select FRAME_POINTER - depends on SUPERH32 default n help Enabling this option will make stacktraces more accurate, at @@ -77,11 +75,6 @@ config SH_NO_BSS_INIT For all other cases, say N. If this option seems perplexing, or you aren't sure, say N. -config SH64_SR_WATCH - bool "Debug: set SR.WATCH to enable hardware watchpoints and trace" - depends on SUPERH64 - config MCOUNT def_bool y - depends on SUPERH32 depends on STACK_DEBUG || FUNCTION_TRACER diff --git a/arch/sh/Makefile b/arch/sh/Makefile index b4a86f27e048..da9cf952f33c 100644 --- a/arch/sh/Makefile +++ b/arch/sh/Makefile @@ -11,7 +11,7 @@ # ifneq ($(SUBARCH),$(ARCH)) ifeq ($(CROSS_COMPILE),) - CROSS_COMPILE := $(call cc-cross-prefix, $(UTS_MACHINE)-linux- $(UTS_MACHINE)-linux-gnu- $(UTS_MACHINE)-unknown-linux-gnu-) + CROSS_COMPILE := $(call cc-cross-prefix, sh-linux- sh-linux-gnu- sh-unknown-linux-gnu-) endif endif @@ -29,12 +29,9 @@ isa-$(CONFIG_CPU_SH3) := sh3 isa-$(CONFIG_CPU_SH4) := sh4 isa-$(CONFIG_CPU_SH4A) := sh4a isa-$(CONFIG_CPU_SH4AL_DSP) := sh4al -isa-$(CONFIG_CPU_SH5) := shmedia -ifeq ($(CONFIG_SUPERH32),y) isa-$(CONFIG_SH_DSP) := $(isa-y)-dsp isa-y := $(isa-y)-up -endif cflags-$(CONFIG_CPU_SH2) := $(call cc-option,-m2,) cflags-$(CONFIG_CPU_J2) += $(call cc-option,-mj2,) @@ -47,7 +44,6 @@ cflags-$(CONFIG_CPU_SH4) := $(call cc-option,-m4,) \ cflags-$(CONFIG_CPU_SH4A) += $(call cc-option,-m4a,) \ $(call cc-option,-m4a-nofpu,) cflags-$(CONFIG_CPU_SH4AL_DSP) += $(call cc-option,-m4al,) -cflags-$(CONFIG_CPU_SH5) := $(call cc-option,-m5-32media-nofpu,) ifeq ($(cflags-y),) # @@ -88,7 +84,7 @@ OBJCOPYFLAGS := -O binary -R .note -R .note.gnu.build-id -R .comment \ 
-R .stab -R .stabstr -S # Give the various platforms the opportunity to set default image types -defaultimage-$(CONFIG_SUPERH32) := zImage +defaultimage-y := zImage defaultimage-$(CONFIG_SH_SH7785LCR) := uImage defaultimage-$(CONFIG_SH_RSK) := uImage defaultimage-$(CONFIG_SH_URQUELL) := uImage @@ -107,31 +103,22 @@ KBUILD_IMAGE := $(boot)/$(defaultimage-y) # Choosing incompatible machines durings configuration will result in # error messages during linking. # -ifdef CONFIG_SUPERH32 UTS_MACHINE := sh -BITS := 32 LDFLAGS_vmlinux += -e _stext -else -UTS_MACHINE := sh64 -BITS := 64 -LDFLAGS_vmlinux += --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \ - --defsym phys_stext_shmedia=phys_stext+1 \ - -e phys_stext_shmedia -endif ifdef CONFIG_CPU_LITTLE_ENDIAN -ld-bfd := elf32-$(UTS_MACHINE)-linux +ld-bfd := elf32-sh-linux LDFLAGS_vmlinux += --defsym jiffies=jiffies_64 --oformat $(ld-bfd) KBUILD_LDFLAGS += -EL else -ld-bfd := elf32-$(UTS_MACHINE)big-linux +ld-bfd := elf32-shbig-linux LDFLAGS_vmlinux += --defsym jiffies=jiffies_64+4 --oformat $(ld-bfd) KBUILD_LDFLAGS += -EB endif -export ld-bfd BITS +export ld-bfd -head-y := arch/sh/kernel/head_$(BITS).o +head-y := arch/sh/kernel/head_32.o core-y += arch/sh/kernel/ arch/sh/mm/ arch/sh/boards/ core-$(CONFIG_SH_FPU_EMU) += arch/sh/math-emu/ @@ -185,7 +172,6 @@ cpuincdir-$(CONFIG_CPU_SH2) += cpu-sh2 cpuincdir-$(CONFIG_CPU_SH3) += cpu-sh3 cpuincdir-$(CONFIG_CPU_SH4A) += cpu-sh4a cpuincdir-$(CONFIG_CPU_SH4) += cpu-sh4 -cpuincdir-$(CONFIG_CPU_SH5) += cpu-sh5 cpuincdir-y += cpu-common # Must be last drivers-y += arch/sh/drivers/ @@ -206,8 +192,7 @@ ifeq ($(CONFIG_DWARF_UNWINDER),y) KBUILD_CFLAGS += -fasynchronous-unwind-tables endif -libs-$(CONFIG_SUPERH32) := arch/sh/lib/ $(libs-y) -libs-$(CONFIG_SUPERH64) := arch/sh/lib64/ $(libs-y) +libs-y := arch/sh/lib/ $(libs-y) BOOT_TARGETS = uImage uImage.bz2 uImage.gz uImage.lzma uImage.xz uImage.lzo \ uImage.srec uImage.bin zImage vmlinux.bin vmlinux.srec \ diff --git 
a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile index f5e1bd779789..ad0e2403e56f 100644 --- a/arch/sh/boot/compressed/Makefile +++ b/arch/sh/boot/compressed/Makefile @@ -8,9 +8,9 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz \ vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo \ - head_$(BITS).o misc.o piggy.o + head_32.o misc.o piggy.o -OBJECTS = $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/cache.o +OBJECTS = $(obj)/head_32.o $(obj)/misc.o $(obj)/cache.o GCOV_PROFILE := n @@ -39,15 +39,11 @@ LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \ # # Pull in the necessary libgcc bits from the in-kernel implementation. # -lib1funcs-$(CONFIG_SUPERH32) := ashiftrt.S ashldi3.c ashrsi3.S ashlsi3.S \ - lshrsi3.S -lib1funcs-obj := \ +lib1funcs-y := ashiftrt.S ashldi3.c ashrsi3.S ashlsi3.S lshrsi3.S +lib1funcs-obj := \ $(addsuffix .o, $(basename $(addprefix $(obj)/, $(lib1funcs-y)))) lib1funcs-dir := $(srctree)/arch/$(SRCARCH)/lib -ifeq ($(BITS),64) - lib1funcs-dir := $(addsuffix $(BITS), $(lib1funcs-dir)) -endif KBUILD_CFLAGS += -I$(lib1funcs-dir) -DDISABLE_BRANCH_PROFILING diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c index e69ec12cbbe6..a03b6680a9d9 100644 --- a/arch/sh/boot/compressed/misc.c +++ b/arch/sh/boot/compressed/misc.c @@ -116,11 +116,7 @@ void ftrace_stub(void) { } -#ifdef CONFIG_SUPERH64 -#define stackalign 8 -#else #define stackalign 4 -#endif #define STACK_SIZE (4096) long __attribute__ ((aligned(stackalign))) user_stack[STACK_SIZE]; @@ -130,13 +126,9 @@ void decompress_kernel(void) { unsigned long output_addr; -#ifdef CONFIG_SUPERH64 - output_addr = (CONFIG_MEMORY_START + 0x2000); -#else output_addr = __pa((unsigned long)&_text+PAGE_SIZE); #if defined(CONFIG_29BIT) output_addr |= P2SEG; -#endif #endif output = (unsigned char *)output_addr; diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index 947bfe8bb0a7..a5c1e9066f83 100644 --- 
a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_CPU_SUBTYPE_SH7763) += pci-sh7780.o ops-sh4.o obj-$(CONFIG_CPU_SUBTYPE_SH7780) += pci-sh7780.o ops-sh4.o obj-$(CONFIG_CPU_SUBTYPE_SH7785) += pci-sh7780.o ops-sh4.o obj-$(CONFIG_CPU_SUBTYPE_SH7786) += pcie-sh7786.o ops-sh7786.o -obj-$(CONFIG_CPU_SH5) += pci-sh5.o ops-sh5.o obj-$(CONFIG_SH_DREAMCAST) += ops-dreamcast.o fixups-dreamcast.o \ pci-dreamcast.o diff --git a/arch/sh/drivers/pci/ops-sh5.c b/arch/sh/drivers/pci/ops-sh5.c deleted file mode 100644 index 9fbaf72949ab..000000000000 --- a/arch/sh/drivers/pci/ops-sh5.c +++ /dev/null @@ -1,65 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Support functions for the SH5 PCI hardware. - * - * Copyright (C) 2001 David J. Mckay (david.mckay@st.com) - * Copyright (C) 2003, 2004 Paul Mundt - * Copyright (C) 2004 Richard Curnow - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "pci-sh5.h" - -static int sh5pci_read(struct pci_bus *bus, unsigned int devfn, int where, - int size, u32 *val) -{ - SH5PCI_WRITE(PAR, CONFIG_CMD(bus, devfn, where)); - - switch (size) { - case 1: - *val = (u8)SH5PCI_READ_BYTE(PDR + (where & 3)); - break; - case 2: - *val = (u16)SH5PCI_READ_SHORT(PDR + (where & 2)); - break; - case 4: - *val = SH5PCI_READ(PDR); - break; - } - - return PCIBIOS_SUCCESSFUL; -} - -static int sh5pci_write(struct pci_bus *bus, unsigned int devfn, int where, - int size, u32 val) -{ - SH5PCI_WRITE(PAR, CONFIG_CMD(bus, devfn, where)); - - switch (size) { - case 1: - SH5PCI_WRITE_BYTE(PDR + (where & 3), (u8)val); - break; - case 2: - SH5PCI_WRITE_SHORT(PDR + (where & 2), (u16)val); - break; - case 4: - SH5PCI_WRITE(PDR, val); - break; - } - - return PCIBIOS_SUCCESSFUL; -} - -struct pci_ops sh5_pci_ops = { - .read = sh5pci_read, - .write = sh5pci_write, -}; diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c deleted file mode 
100644 index 03225d27770b..000000000000 --- a/arch/sh/drivers/pci/pci-sh5.c +++ /dev/null @@ -1,217 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 David J. Mckay (david.mckay@st.com) - * Copyright (C) 2003, 2004 Paul Mundt - * Copyright (C) 2004 Richard Curnow - * - * Support functions for the SH5 PCI hardware. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "pci-sh5.h" - -unsigned long pcicr_virt; -unsigned long PCI_IO_AREA; - -/* Rounds a number UP to the nearest power of two. Used for - * sizing the PCI window. - */ -static u32 __init r2p2(u32 num) -{ - int i = 31; - u32 tmp = num; - - if (num == 0) - return 0; - - do { - if (tmp & (1 << 31)) - break; - i--; - tmp <<= 1; - } while (i >= 0); - - tmp = 1 << i; - /* If the original number isn't a power of 2, round it up */ - if (tmp != num) - tmp <<= 1; - - return tmp; -} - -static irqreturn_t pcish5_err_irq(int irq, void *dev_id) -{ - struct pt_regs *regs = get_irq_regs(); - unsigned pci_int, pci_air, pci_cir, pci_aint; - - pci_int = SH5PCI_READ(INT); - pci_cir = SH5PCI_READ(CIR); - pci_air = SH5PCI_READ(AIR); - - if (pci_int) { - printk("PCI INTERRUPT (at %08llx)!\n", regs->pc); - printk("PCI INT -> 0x%x\n", pci_int & 0xffff); - printk("PCI AIR -> 0x%x\n", pci_air); - printk("PCI CIR -> 0x%x\n", pci_cir); - SH5PCI_WRITE(INT, ~0); - } - - pci_aint = SH5PCI_READ(AINT); - if (pci_aint) { - printk("PCI ARB INTERRUPT!\n"); - printk("PCI AINT -> 0x%x\n", pci_aint); - printk("PCI AIR -> 0x%x\n", pci_air); - printk("PCI CIR -> 0x%x\n", pci_cir); - SH5PCI_WRITE(AINT, ~0); - } - - return IRQ_HANDLED; -} - -static irqreturn_t pcish5_serr_irq(int irq, void *dev_id) -{ - printk("SERR IRQ\n"); - - return IRQ_NONE; -} - -static struct resource sh5_pci_resources[2]; - -static struct pci_channel sh5pci_controller = { - .pci_ops = &sh5_pci_ops, - .resources = sh5_pci_resources, - .nr_resources = 
ARRAY_SIZE(sh5_pci_resources), - .mem_offset = 0x00000000, - .io_offset = 0x00000000, -}; - -static int __init sh5pci_init(void) -{ - unsigned long memStart = __pa(memory_start); - unsigned long memSize = __pa(memory_end) - memStart; - u32 lsr0; - u32 uval; - - if (request_irq(IRQ_ERR, pcish5_err_irq, - 0, "PCI Error",NULL) < 0) { - printk(KERN_ERR "PCISH5: Cannot hook PCI_PERR interrupt\n"); - return -EINVAL; - } - - if (request_irq(IRQ_SERR, pcish5_serr_irq, - 0, "PCI SERR interrupt", NULL) < 0) { - printk(KERN_ERR "PCISH5: Cannot hook PCI_SERR interrupt\n"); - return -EINVAL; - } - - pcicr_virt = (unsigned long)ioremap(SH5PCI_ICR_BASE, 1024); - if (!pcicr_virt) { - panic("Unable to remap PCICR\n"); - } - - PCI_IO_AREA = (unsigned long)ioremap(SH5PCI_IO_BASE, 0x10000); - if (!PCI_IO_AREA) { - panic("Unable to remap PCIIO\n"); - } - - /* Clear snoop registers */ - SH5PCI_WRITE(CSCR0, 0); - SH5PCI_WRITE(CSCR1, 0); - - /* Switch off interrupts */ - SH5PCI_WRITE(INTM, 0); - SH5PCI_WRITE(AINTM, 0); - SH5PCI_WRITE(PINTM, 0); - - /* Set bus active, take it out of reset */ - uval = SH5PCI_READ(CR); - - /* Set command Register */ - SH5PCI_WRITE(CR, uval | CR_LOCK_MASK | CR_CFINT| CR_FTO | CR_PFE | - CR_PFCS | CR_BMAM); - - uval=SH5PCI_READ(CR); - - /* Allow it to be a master */ - /* NB - WE DISABLE I/O ACCESS to stop overlap */ - /* set WAIT bit to enable stepping, an attempt to improve stability */ - SH5PCI_WRITE_SHORT(CSR_CMD, - PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | - PCI_COMMAND_WAIT); - - /* - ** Set translation mapping memory in order to convert the address - ** used for the main bus, to the PCI internal address. - */ - SH5PCI_WRITE(MBR,0x40000000); - - /* Always set the max size 512M */ - SH5PCI_WRITE(MBMR, PCISH5_MEM_SIZCONV(512*1024*1024)); - - /* - ** I/O addresses are mapped at internal PCI specific address - ** as is described into the configuration bridge table. 
- ** These are changed to 0, to allow cards that have legacy - ** io such as vga to function correctly. We set the SH5 IOBAR to - ** 256K, which is a bit big as we can only have 64K of address space - */ - - SH5PCI_WRITE(IOBR,0x0); - - /* Set up a 256K window. Totally pointless waste of address space */ - SH5PCI_WRITE(IOBMR,0); - - /* The SH5 has a HUGE 256K I/O region, which breaks the PCI spec. - * Ideally, we would want to map the I/O region somewhere, but it - * is so big this is not that easy! - */ - SH5PCI_WRITE(CSR_IBAR0,~0); - /* Set memory size value */ - memSize = memory_end - memory_start; - - /* Now we set up the mbars so the PCI bus can see the memory of - * the machine */ - if (memSize < (1024 * 1024)) { - printk(KERN_ERR "PCISH5: Ridiculous memory size of 0x%lx?\n", - memSize); - return -EINVAL; - } - - /* Set LSR 0 */ - lsr0 = (memSize > (512 * 1024 * 1024)) ? 0x1ff00001 : - ((r2p2(memSize) - 0x100000) | 0x1); - SH5PCI_WRITE(LSR0, lsr0); - - /* Set MBAR 0 */ - SH5PCI_WRITE(CSR_MBAR0, memory_start); - SH5PCI_WRITE(LAR0, memory_start); - - SH5PCI_WRITE(CSR_MBAR1,0); - SH5PCI_WRITE(LAR1,0); - SH5PCI_WRITE(LSR1,0); - - /* Enable the PCI interrupts on the device */ - SH5PCI_WRITE(INTM, ~0); - SH5PCI_WRITE(AINTM, ~0); - SH5PCI_WRITE(PINTM, ~0); - - sh5_pci_resources[0].start = PCI_IO_AREA; - sh5_pci_resources[0].end = PCI_IO_AREA + 0x10000; - - sh5_pci_resources[1].start = memStart; - sh5_pci_resources[1].end = memStart + memSize; - - return register_pci_controller(&sh5pci_controller); -} -arch_initcall(sh5pci_init); diff --git a/arch/sh/drivers/pci/pci-sh5.h b/arch/sh/drivers/pci/pci-sh5.h deleted file mode 100644 index 91348af0ef6c..000000000000 --- a/arch/sh/drivers/pci/pci-sh5.h +++ /dev/null @@ -1,108 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright (C) 2001 David J. Mckay (david.mckay@st.com) - * - * Definitions for the SH5 PCI hardware. 
- */ -#ifndef __PCI_SH5_H -#define __PCI_SH5_H - -/* Product ID */ -#define PCISH5_PID 0x350d - -/* vendor ID */ -#define PCISH5_VID 0x1054 - -/* Configuration types */ -#define ST_TYPE0 0x00 /* Configuration cycle type 0 */ -#define ST_TYPE1 0x01 /* Configuration cycle type 1 */ - -/* VCR data */ -#define PCISH5_VCR_STATUS 0x00 -#define PCISH5_VCR_VERSION 0x08 - -/* -** ICR register offsets and bits -*/ -#define PCISH5_ICR_CR 0x100 /* PCI control register values */ -#define CR_PBAM (1<<12) -#define CR_PFCS (1<<11) -#define CR_FTO (1<<10) -#define CR_PFE (1<<9) -#define CR_TBS (1<<8) -#define CR_SPUE (1<<7) -#define CR_BMAM (1<<6) -#define CR_HOST (1<<5) -#define CR_CLKEN (1<<4) -#define CR_SOCS (1<<3) -#define CR_IOCS (1<<2) -#define CR_RSTCTL (1<<1) -#define CR_CFINT (1<<0) -#define CR_LOCK_MASK 0xa5000000 - -#define PCISH5_ICR_INT 0x114 /* Interrupt registert values */ -#define INT_MADIM (1<<2) - -#define PCISH5_ICR_LSR0 0X104 /* Local space register values */ -#define PCISH5_ICR_LSR1 0X108 /* Local space register values */ -#define PCISH5_ICR_LAR0 0x10c /* Local address register values */ -#define PCISH5_ICR_LAR1 0x110 /* Local address register values */ -#define PCISH5_ICR_INTM 0x118 /* Interrupt mask register values */ -#define PCISH5_ICR_AIR 0x11c /* Interrupt error address information register values */ -#define PCISH5_ICR_CIR 0x120 /* Interrupt error command information register values */ -#define PCISH5_ICR_AINT 0x130 /* Interrupt error arbiter interrupt register values */ -#define PCISH5_ICR_AINTM 0x134 /* Interrupt error arbiter interrupt mask register values */ -#define PCISH5_ICR_BMIR 0x138 /* Interrupt error info register of bus master values */ -#define PCISH5_ICR_PAR 0x1c0 /* Pio address register values */ -#define PCISH5_ICR_MBR 0x1c4 /* Memory space bank register values */ -#define PCISH5_ICR_IOBR 0x1c8 /* I/O space bank register values */ -#define PCISH5_ICR_PINT 0x1cc /* power management interrupt register values */ -#define PCISH5_ICR_PINTM 
0x1d0 /* power management interrupt mask register values */ -#define PCISH5_ICR_MBMR 0x1d8 /* memory space bank mask register values */ -#define PCISH5_ICR_IOBMR 0x1dc /* I/O space bank mask register values */ -#define PCISH5_ICR_CSCR0 0x210 /* PCI cache snoop control register 0 */ -#define PCISH5_ICR_CSCR1 0x214 /* PCI cache snoop control register 1 */ -#define PCISH5_ICR_PDR 0x220 /* Pio data register values */ - -/* These are configs space registers */ -#define PCISH5_ICR_CSR_VID 0x000 /* Vendor id */ -#define PCISH5_ICR_CSR_DID 0x002 /* Device id */ -#define PCISH5_ICR_CSR_CMD 0x004 /* Command register */ -#define PCISH5_ICR_CSR_STATUS 0x006 /* Stautus */ -#define PCISH5_ICR_CSR_IBAR0 0x010 /* I/O base address register */ -#define PCISH5_ICR_CSR_MBAR0 0x014 /* First Memory base address register */ -#define PCISH5_ICR_CSR_MBAR1 0x018 /* Second Memory base address register */ - -/* Base address of registers */ -#define SH5PCI_ICR_BASE (PHYS_PCI_BLOCK + 0x00040000) -#define SH5PCI_IO_BASE (PHYS_PCI_BLOCK + 0x00800000) -/* #define SH5PCI_VCR_BASE (P2SEG_PCICB_BLOCK + P2SEG) */ - -extern unsigned long pcicr_virt; -/* Register selection macro */ -#define PCISH5_ICR_REG(x) ( pcicr_virt + (PCISH5_ICR_##x)) -/* #define PCISH5_VCR_REG(x) ( SH5PCI_VCR_BASE (PCISH5_VCR_##x)) */ - -/* Write I/O functions */ -#define SH5PCI_WRITE(reg,val) __raw_writel((u32)(val),PCISH5_ICR_REG(reg)) -#define SH5PCI_WRITE_SHORT(reg,val) __raw_writew((u16)(val),PCISH5_ICR_REG(reg)) -#define SH5PCI_WRITE_BYTE(reg,val) __raw_writeb((u8)(val),PCISH5_ICR_REG(reg)) - -/* Read I/O functions */ -#define SH5PCI_READ(reg) __raw_readl(PCISH5_ICR_REG(reg)) -#define SH5PCI_READ_SHORT(reg) __raw_readw(PCISH5_ICR_REG(reg)) -#define SH5PCI_READ_BYTE(reg) __raw_readb(PCISH5_ICR_REG(reg)) - -/* Set PCI config bits */ -#define SET_CONFIG_BITS(bus,devfn,where) ((((bus) << 16) | ((devfn) << 8) | ((where) & ~3)) | 0x80000000) - -/* Set PCI command register */ -#define CONFIG_CMD(bus, devfn, where) 
SET_CONFIG_BITS(bus->number,devfn,where) - -/* Size converters */ -#define PCISH5_MEM_SIZCONV(x) (((x / 0x40000) - 1) << 18) -#define PCISH5_IO_SIZCONV(x) (((x / 0x40000) - 1) << 18) - -extern struct pci_ops sh5_pci_ops; - -#endif /* __PCI_SH5_H */ diff --git a/arch/sh/include/asm/barrier.h b/arch/sh/include/asm/barrier.h index 66faae19d254..0d58a0159aa6 100644 --- a/arch/sh/include/asm/barrier.h +++ b/arch/sh/include/asm/barrier.h @@ -6,7 +6,7 @@ #ifndef __ASM_SH_BARRIER_H #define __ASM_SH_BARRIER_H -#if defined(CONFIG_CPU_SH4A) || defined(CONFIG_CPU_SH5) +#if defined(CONFIG_CPU_SH4A) #include #endif @@ -24,7 +24,7 @@ * Historically we have only done this type of barrier for the MMUCR, but * it's also necessary for the CCR, so we make it generic here instead. */ -#if defined(CONFIG_CPU_SH4A) || defined(CONFIG_CPU_SH5) +#if defined(CONFIG_CPU_SH4A) #define mb() __asm__ __volatile__ ("synco": : :"memory") #define rmb() mb() #define wmb() mb() diff --git a/arch/sh/include/asm/bitops.h b/arch/sh/include/asm/bitops.h index 8c3578288db5..445dd14c448a 100644 --- a/arch/sh/include/asm/bitops.h +++ b/arch/sh/include/asm/bitops.h @@ -26,7 +26,6 @@ #include #endif -#ifdef CONFIG_SUPERH32 static inline unsigned long ffz(unsigned long word) { unsigned long result; @@ -60,31 +59,6 @@ static inline unsigned long __ffs(unsigned long word) : "t"); return result; } -#else -static inline unsigned long ffz(unsigned long word) -{ - unsigned long result, __d2, __d3; - - __asm__("gettr tr0, %2\n\t" - "pta $+32, tr0\n\t" - "andi %1, 1, %3\n\t" - "beq %3, r63, tr0\n\t" - "pta $+4, tr0\n" - "0:\n\t" - "shlri.l %1, 1, %1\n\t" - "addi %0, 1, %0\n\t" - "andi %1, 1, %3\n\t" - "beqi %3, 1, tr0\n" - "1:\n\t" - "ptabs %2, tr0\n\t" - : "=r" (result), "=r" (word), "=r" (__d2), "=r" (__d3) - : "0" (0L), "1" (word)); - - return result; -} - -#include -#endif #include #include diff --git a/arch/sh/include/asm/bl_bit.h b/arch/sh/include/asm/bl_bit.h index 7e3d81691ad5..5d04f2c62563 100644 --- 
a/arch/sh/include/asm/bl_bit.h +++ b/arch/sh/include/asm/bl_bit.h @@ -1,11 +1,2 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_BL_BIT_H -#define __ASM_SH_BL_BIT_H - -#ifdef CONFIG_SUPERH32 -# include -#else -# include -#endif - -#endif /* __ASM_SH_BL_BIT_H */ +#include diff --git a/arch/sh/include/asm/bl_bit_64.h b/arch/sh/include/asm/bl_bit_64.h deleted file mode 100644 index aac9780fe864..000000000000 --- a/arch/sh/include/asm/bl_bit_64.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 Paul Mundt - * Copyright (C) 2004 Richard Curnow - */ -#ifndef __ASM_SH_BL_BIT_64_H -#define __ASM_SH_BL_BIT_64_H - -#include - -#define SR_BL_LL 0x0000000010000000LL - -static inline void set_bl_bit(void) -{ - unsigned long long __dummy0, __dummy1 = SR_BL_LL; - - __asm__ __volatile__("getcon " __SR ", %0\n\t" - "or %0, %1, %0\n\t" - "putcon %0, " __SR "\n\t" - : "=&r" (__dummy0) - : "r" (__dummy1)); - -} - -static inline void clear_bl_bit(void) -{ - unsigned long long __dummy0, __dummy1 = ~SR_BL_LL; - - __asm__ __volatile__("getcon " __SR ", %0\n\t" - "and %0, %1, %0\n\t" - "putcon %0, " __SR "\n\t" - : "=&r" (__dummy0) - : "r" (__dummy1)); -} - -#endif /* __ASM_SH_BL_BIT_64_H */ diff --git a/arch/sh/include/asm/bugs.h b/arch/sh/include/asm/bugs.h index 030df56bfdb2..fe52abb69cea 100644 --- a/arch/sh/include/asm/bugs.h +++ b/arch/sh/include/asm/bugs.h @@ -53,10 +53,6 @@ static void __init check_bugs(void) *p++ = 's'; *p++ = 'p'; break; - case CPU_FAMILY_SH5: - *p++ = '6'; - *p++ = '4'; - break; case CPU_FAMILY_UNKNOWN: /* * Specifically use CPU_FAMILY_UNKNOWN rather than diff --git a/arch/sh/include/asm/cache_insns.h b/arch/sh/include/asm/cache_insns.h index c5a4acdc53f9..d7edd5297bd0 100644 --- a/arch/sh/include/asm/cache_insns.h +++ b/arch/sh/include/asm/cache_insns.h @@ -1,12 +1,2 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_CACHE_INSNS_H -#define 
__ASM_SH_CACHE_INSNS_H - - -#ifdef CONFIG_SUPERH32 -# include -#else -# include -#endif - -#endif /* __ASM_SH_CACHE_INSNS_H */ +#include diff --git a/arch/sh/include/asm/cache_insns_64.h b/arch/sh/include/asm/cache_insns_64.h deleted file mode 100644 index ed682b987b0d..000000000000 --- a/arch/sh/include/asm/cache_insns_64.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 Paul Mundt - * Copyright (C) 2004 Richard Curnow - */ -#ifndef __ASM_SH_CACHE_INSNS_64_H -#define __ASM_SH_CACHE_INSNS_64_H - -#define __icbi(addr) __asm__ __volatile__ ( "icbi %0, 0\n\t" : : "r" (addr)) -#define __ocbp(addr) __asm__ __volatile__ ( "ocbp %0, 0\n\t" : : "r" (addr)) -#define __ocbi(addr) __asm__ __volatile__ ( "ocbi %0, 0\n\t" : : "r" (addr)) -#define __ocbwb(addr) __asm__ __volatile__ ( "ocbwb %0, 0\n\t" : : "r" (addr)) - -static inline reg_size_t register_align(void *val) -{ - return (unsigned long long)(signed long long)(signed long)val; -} - -#endif /* __ASM_SH_CACHE_INSNS_64_H */ diff --git a/arch/sh/include/asm/checksum.h b/arch/sh/include/asm/checksum.h index a460a108969d..00e39dd0d146 100644 --- a/arch/sh/include/asm/checksum.h +++ b/arch/sh/include/asm/checksum.h @@ -1,6 +1,2 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_SUPERH32 -# include -#else -# include -#endif +#include diff --git a/arch/sh/include/asm/elf.h b/arch/sh/include/asm/elf.h index 5ec8db1ddc20..7661fb5d548a 100644 --- a/arch/sh/include/asm/elf.h +++ b/arch/sh/include/asm/elf.h @@ -133,28 +133,6 @@ typedef struct user_fpu_struct elf_fpregset_t; #define ELF_PLATFORM (utsname()->machine) -#ifdef __SH5__ -#define ELF_PLAT_INIT(_r, load_addr) \ - do { _r->regs[0]=0; _r->regs[1]=0; _r->regs[2]=0; _r->regs[3]=0; \ - _r->regs[4]=0; _r->regs[5]=0; _r->regs[6]=0; _r->regs[7]=0; \ - _r->regs[8]=0; _r->regs[9]=0; _r->regs[10]=0; _r->regs[11]=0; \ - _r->regs[12]=0; _r->regs[13]=0; _r->regs[14]=0; _r->regs[15]=0; \ - 
_r->regs[16]=0; _r->regs[17]=0; _r->regs[18]=0; _r->regs[19]=0; \ - _r->regs[20]=0; _r->regs[21]=0; _r->regs[22]=0; _r->regs[23]=0; \ - _r->regs[24]=0; _r->regs[25]=0; _r->regs[26]=0; _r->regs[27]=0; \ - _r->regs[28]=0; _r->regs[29]=0; _r->regs[30]=0; _r->regs[31]=0; \ - _r->regs[32]=0; _r->regs[33]=0; _r->regs[34]=0; _r->regs[35]=0; \ - _r->regs[36]=0; _r->regs[37]=0; _r->regs[38]=0; _r->regs[39]=0; \ - _r->regs[40]=0; _r->regs[41]=0; _r->regs[42]=0; _r->regs[43]=0; \ - _r->regs[44]=0; _r->regs[45]=0; _r->regs[46]=0; _r->regs[47]=0; \ - _r->regs[48]=0; _r->regs[49]=0; _r->regs[50]=0; _r->regs[51]=0; \ - _r->regs[52]=0; _r->regs[53]=0; _r->regs[54]=0; _r->regs[55]=0; \ - _r->regs[56]=0; _r->regs[57]=0; _r->regs[58]=0; _r->regs[59]=0; \ - _r->regs[60]=0; _r->regs[61]=0; _r->regs[62]=0; \ - _r->tregs[0]=0; _r->tregs[1]=0; _r->tregs[2]=0; _r->tregs[3]=0; \ - _r->tregs[4]=0; _r->tregs[5]=0; _r->tregs[6]=0; _r->tregs[7]=0; \ - _r->sr = SR_FD | SR_MMU; } while (0) -#else #define ELF_PLAT_INIT(_r, load_addr) \ do { _r->regs[0]=0; _r->regs[1]=0; _r->regs[2]=0; _r->regs[3]=0; \ _r->regs[4]=0; _r->regs[5]=0; _r->regs[6]=0; _r->regs[7]=0; \ @@ -182,7 +160,6 @@ do { \ _r->regs[14] = 0; \ _r->sr = SR_FD; \ } while (0) -#endif #define SET_PERSONALITY(ex) \ set_personality(PER_LINUX_32BIT | (current->personality & (~PER_MASK))) diff --git a/arch/sh/include/asm/extable.h b/arch/sh/include/asm/extable.h index ed46f8bebb9f..5658d2bae372 100644 --- a/arch/sh/include/asm/extable.h +++ b/arch/sh/include/asm/extable.h @@ -4,8 +4,4 @@ #include -#if defined(CONFIG_SUPERH64) && defined(CONFIG_MMU) -#define ARCH_HAS_SEARCH_EXTABLE -#endif - #endif diff --git a/arch/sh/include/asm/fixmap.h b/arch/sh/include/asm/fixmap.h index e30348c58073..f38adc189b83 100644 --- a/arch/sh/include/asm/fixmap.h +++ b/arch/sh/include/asm/fixmap.h @@ -83,11 +83,7 @@ extern void __clear_fixmap(enum fixed_addresses idx, pgprot_t flags); * the start of the fixmap, and leave one page empty * at the top of mem.. 
*/ -#ifdef CONFIG_SUPERH32 #define FIXADDR_TOP (P4SEG - PAGE_SIZE) -#else -#define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) -#endif #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index b42228906eaf..3924d91e0fa0 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -115,12 +115,8 @@ static inline void pfx##reads##bwlq(volatile void __iomem *mem, \ __BUILD_MEMORY_STRING(__raw_, b, u8) __BUILD_MEMORY_STRING(__raw_, w, u16) -#ifdef CONFIG_SUPERH32 void __raw_writesl(void __iomem *addr, const void *data, int longlen); void __raw_readsl(const void __iomem *addr, void *data, int longlen); -#else -__BUILD_MEMORY_STRING(__raw_, l, u32) -#endif __BUILD_MEMORY_STRING(__raw_, q, u64) diff --git a/arch/sh/include/asm/irq.h b/arch/sh/include/asm/irq.h index 8065a3222e19..6d44c32ef047 100644 --- a/arch/sh/include/asm/irq.h +++ b/arch/sh/include/asm/irq.h @@ -66,8 +66,5 @@ extern void irq_finish(unsigned int irq); #endif #include -#ifdef CONFIG_CPU_SH5 -#include -#endif #endif /* __ASM_SH_IRQ_H */ diff --git a/arch/sh/include/asm/mmu_context.h b/arch/sh/include/asm/mmu_context.h index 2d09650093c7..48e67d544d53 100644 --- a/arch/sh/include/asm/mmu_context.h +++ b/arch/sh/include/asm/mmu_context.h @@ -48,11 +48,7 @@ */ #define MMU_VPN_MASK 0xfffff000 -#if defined(CONFIG_SUPERH32) #include -#else -#include -#endif /* * Get MMU context if needed. @@ -74,14 +70,6 @@ static inline void get_mmu_context(struct mm_struct *mm, unsigned int cpu) */ local_flush_tlb_all(); -#ifdef CONFIG_SUPERH64 - /* - * The SH-5 cache uses the ASIDs, requiring both the I and D - * cache to be flushed when the ASID is exhausted. Weak. - */ - flush_cache_all(); -#endif - /* * Fix version; Note that we avoid version #0 * to distinguish NO_CONTEXT. 
diff --git a/arch/sh/include/asm/mmu_context_64.h b/arch/sh/include/asm/mmu_context_64.h deleted file mode 100644 index bacafe0b887d..000000000000 --- a/arch/sh/include/asm/mmu_context_64.h +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_MMU_CONTEXT_64_H -#define __ASM_SH_MMU_CONTEXT_64_H - -/* - * sh64-specific mmu_context interface. - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 - 2007 Paul Mundt - */ -#include -#include - -#define SR_ASID_MASK 0xffffffffff00ffffULL -#define SR_ASID_SHIFT 16 - -/* - * Destroy context related info for an mm_struct that is about - * to be put to rest. - */ -static inline void destroy_context(struct mm_struct *mm) -{ - /* Well, at least free TLB entries */ - flush_tlb_mm(mm); -} - -static inline unsigned long get_asid(void) -{ - unsigned long long sr; - - asm volatile ("getcon " __SR ", %0\n\t" - : "=r" (sr)); - - sr = (sr >> SR_ASID_SHIFT) & MMU_CONTEXT_ASID_MASK; - return (unsigned long) sr; -} - -/* Set ASID into SR */ -static inline void set_asid(unsigned long asid) -{ - unsigned long long sr, pc; - - asm volatile ("getcon " __SR ", %0" : "=r" (sr)); - - sr = (sr & SR_ASID_MASK) | (asid << SR_ASID_SHIFT); - - /* - * It is possible that this function may be inlined and so to avoid - * the assembler reporting duplicate symbols we make use of the - * gas trick of generating symbols using numerics and forward - * reference. 
- */ - asm volatile ("movi 1, %1\n\t" - "shlli %1, 28, %1\n\t" - "or %0, %1, %1\n\t" - "putcon %1, " __SR "\n\t" - "putcon %0, " __SSR "\n\t" - "movi 1f, %1\n\t" - "ori %1, 1 , %1\n\t" - "putcon %1, " __SPC "\n\t" - "rte\n" - "1:\n\t" - : "=r" (sr), "=r" (pc) : "0" (sr)); -} - -/* arch/sh/kernel/cpu/sh5/entry.S */ -extern unsigned long switch_and_save_asid(unsigned long new_asid); - -/* No spare register to twiddle, so use a software cache */ -extern pgd_t *mmu_pdtp_cache; - -#define set_TTB(pgd) (mmu_pdtp_cache = (pgd)) -#define get_TTB() (mmu_pdtp_cache) - -#endif /* __ASM_SH_MMU_CONTEXT_64_H */ diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h index ea8d68f58e39..eca5daa43b93 100644 --- a/arch/sh/include/asm/page.h +++ b/arch/sh/include/asm/page.h @@ -35,8 +35,6 @@ #define HPAGE_SHIFT 22 #elif defined(CONFIG_HUGETLB_PAGE_SIZE_64MB) #define HPAGE_SHIFT 26 -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512MB) -#define HPAGE_SHIFT 29 #endif #ifdef CONFIG_HUGETLB_PAGE @@ -82,18 +80,12 @@ typedef struct { unsigned long long pgd; } pgd_t; ((x).pte_low | ((unsigned long long)(x).pte_high << 32)) #define __pte(x) \ ({ pte_t __pte = {(x), ((unsigned long long)(x)) >> 32}; __pte; }) -#elif defined(CONFIG_SUPERH32) +#else typedef struct { unsigned long pte_low; } pte_t; typedef struct { unsigned long pgprot; } pgprot_t; typedef struct { unsigned long pgd; } pgd_t; #define pte_val(x) ((x).pte_low) #define __pte(x) ((pte_t) { (x) } ) -#else -typedef struct { unsigned long long pte_low; } pte_t; -typedef struct { unsigned long long pgprot; } pgprot_t; -typedef struct { unsigned long pgd; } pgd_t; -#define pte_val(x) ((x).pte_low) -#define __pte(x) ((pte_t) { (x) } ) #endif #define pgd_val(x) ((x).pgd) @@ -191,15 +183,4 @@ typedef struct page *pgtable_t; */ #define ARCH_DMA_MINALIGN L1_CACHE_BYTES -#ifdef CONFIG_SUPERH64 -/* - * While BYTES_PER_WORD == 4 on the current sh64 ABI, GCC will still - * happily generate {ld/st}.q pairs, requiring us to have 8-byte - * 
alignment to avoid traps. The kmalloc alignment is guaranteed by - * virtue of L1_CACHE_BYTES, requiring this to only be special cased - * for slab caches. - */ -#define ARCH_SLAB_MINALIGN 8 -#endif - #endif /* __ASM_SH_PAGE_H */ diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index cbd0f3c55a0c..02d936406c6e 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -76,18 +76,10 @@ static inline unsigned long phys_addr_mask(void) #define PTE_PHYS_MASK (phys_addr_mask() & PAGE_MASK) #define PTE_FLAGS_MASK (~(PTE_PHYS_MASK) << PAGE_SHIFT) -#ifdef CONFIG_SUPERH32 #define VMALLOC_START (P3SEG) -#else -#define VMALLOC_START (0xf0000000) -#endif #define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) -#if defined(CONFIG_SUPERH32) #include -#else -#include -#endif /* * SH-X and lower (legacy) SuperH parts (SH-3, SH-4, some SH-4A) can't do page @@ -159,15 +151,6 @@ static inline bool pte_access_permitted(pte_t pte, bool write) prot |= _PAGE_EXT(_PAGE_EXT_KERN_WRITE | _PAGE_EXT_USER_WRITE); return __pte_access_permitted(pte, prot); } -#elif defined(CONFIG_SUPERH64) -static inline bool pte_access_permitted(pte_t pte, bool write) -{ - u64 prot = _PAGE_PRESENT | _PAGE_USER | _PAGE_READ; - - if (write) - prot |= _PAGE_WRITE; - return __pte_access_permitted(pte, prot); -} #else static inline bool pte_access_permitted(pte_t pte, bool write) { diff --git a/arch/sh/include/asm/pgtable_64.h b/arch/sh/include/asm/pgtable_64.h deleted file mode 100644 index 1778bc5971e7..000000000000 --- a/arch/sh/include/asm/pgtable_64.h +++ /dev/null @@ -1,307 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_PGTABLE_64_H -#define __ASM_SH_PGTABLE_64_H - -/* - * include/asm-sh/pgtable_64.h - * - * This file contains the functions and defines necessary to modify and use - * the SuperH page table tree. 
- * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003, 2004 Paul Mundt - * Copyright (C) 2003, 2004 Richard Curnow - */ -#include -#include -#include - -/* - * Error outputs. - */ -#define pte_ERROR(e) \ - printk("%s:%d: bad pte %016Lx.\n", __FILE__, __LINE__, pte_val(e)) -#define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) - -/* - * Table setting routines. Used within arch/mm only. - */ -#define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) - -static __inline__ void set_pte(pte_t *pteptr, pte_t pteval) -{ - unsigned long long x = ((unsigned long long) pteval.pte_low); - unsigned long long *xp = (unsigned long long *) pteptr; - /* - * Sign-extend based on NPHYS. - */ - *(xp) = (x & NPHYS_SIGN) ? (x | NPHYS_MASK) : x; -} -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) - -/* - * PGD defines. Top level. - */ - -/* To find an entry in a generic PGD. */ -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) -#define __pgd_offset(address) pgd_index(address) -#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) - -/* To find an entry in a kernel PGD. */ -#define pgd_offset_k(address) pgd_offset(&init_mm, address) - -#define __pud_offset(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -#define __pmd_offset(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) - -/* - * PMD level access routines. Same notes as above. 
- */ -#define _PMD_EMPTY 0x0 -/* Either the PMD is empty or present, it's not paged out */ -#define pmd_present(pmd_entry) (pmd_val(pmd_entry) & _PAGE_PRESENT) -#define pmd_clear(pmd_entry_p) (set_pmd((pmd_entry_p), __pmd(_PMD_EMPTY))) -#define pmd_none(pmd_entry) (pmd_val((pmd_entry)) == _PMD_EMPTY) -#define pmd_bad(pmd_entry) ((pmd_val(pmd_entry) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) - -#define pmd_page_vaddr(pmd_entry) \ - ((unsigned long) __va(pmd_val(pmd_entry) & PAGE_MASK)) - -#define pmd_page(pmd) \ - (virt_to_page(pmd_val(pmd))) - -/* PMD to PTE dereferencing */ -#define pte_index(address) \ - ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) - -#define __pte_offset(address) pte_index(address) - -#define pte_offset_kernel(dir, addr) \ - ((pte_t *) ((pmd_val(*(dir))) & PAGE_MASK) + pte_index((addr))) - -#define pte_offset_map(dir,addr) pte_offset_kernel(dir, addr) -#define pte_unmap(pte) do { } while (0) - -#ifndef __ASSEMBLY__ -/* - * PTEL coherent flags. - * See Chapter 17 ST50 CPU Core Volume 1, Architecture. - */ -/* The bits that are required in the SH-5 TLB are placed in the h/w-defined - positions, to avoid expensive bit shuffling on every refill. The remaining - bits are used for s/w purposes and masked out on each refill. - - Note, the PTE slots are used to hold data of type swp_entry_t when a page is - swapped out. Only the _PAGE_PRESENT flag is significant when the page is - swapped out, and it must be placed so that it doesn't overlap either the - type or offset fields of swp_entry_t. For x86, offset is at [31:8] and type - at [6:1], with _PAGE_PRESENT at bit 0 for both pte_t and swp_entry_t. This - scheme doesn't map to SH-5 because bit [0] controls cacheability. So bit - [2] is used for _PAGE_PRESENT and the type field of swp_entry_t is split - into 2 pieces. That is handled by SWP_ENTRY and SWP_TYPE below. 
*/ -#define _PAGE_WT 0x001 /* CB0: if cacheable, 1->write-thru, 0->write-back */ -#define _PAGE_DEVICE 0x001 /* CB0: if uncacheable, 1->device (i.e. no write-combining or reordering at bus level) */ -#define _PAGE_CACHABLE 0x002 /* CB1: uncachable/cachable */ -#define _PAGE_PRESENT 0x004 /* software: page referenced */ -#define _PAGE_SIZE0 0x008 /* SZ0-bit : size of page */ -#define _PAGE_SIZE1 0x010 /* SZ1-bit : size of page */ -#define _PAGE_SHARED 0x020 /* software: reflects PTEH's SH */ -#define _PAGE_READ 0x040 /* PR0-bit : read access allowed */ -#define _PAGE_EXECUTE 0x080 /* PR1-bit : execute access allowed */ -#define _PAGE_WRITE 0x100 /* PR2-bit : write access allowed */ -#define _PAGE_USER 0x200 /* PR3-bit : user space access allowed */ -#define _PAGE_DIRTY 0x400 /* software: page accessed in write */ -#define _PAGE_ACCESSED 0x800 /* software: page referenced */ - -/* Wrapper for extended mode pgprot twiddling */ -#define _PAGE_EXT(x) ((unsigned long long)(x) << 32) - -/* - * We can use the sign-extended bits in the PTEL to get 32 bits of - * software flags. This works for now because no implementations uses - * anything above the PPN field. - */ -#define _PAGE_WIRED _PAGE_EXT(0x001) /* software: wire the tlb entry */ -#define _PAGE_SPECIAL _PAGE_EXT(0x002) - -#define _PAGE_CLEAR_FLAGS (_PAGE_PRESENT | _PAGE_SHARED | \ - _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_WIRED) - -/* Mask which drops software flags */ -#define _PAGE_FLAGS_HARDWARE_MASK (NEFF_MASK & ~(_PAGE_CLEAR_FLAGS)) - -/* - * HugeTLB support - */ -#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K) -#define _PAGE_SZHUGE (_PAGE_SIZE0) -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_1MB) -#define _PAGE_SZHUGE (_PAGE_SIZE1) -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512MB) -#define _PAGE_SZHUGE (_PAGE_SIZE0 | _PAGE_SIZE1) -#endif - -/* - * Stub out _PAGE_SZHUGE if we don't have a good definition for it, - * to make pte_mkhuge() happy. 
- */ -#ifndef _PAGE_SZHUGE -# define _PAGE_SZHUGE (0) -#endif - -/* - * Default flags for a Kernel page. - * This is fundametally also SHARED because the main use of this define - * (other than for PGD/PMD entries) is for the VMALLOC pool which is - * contextless. - * - * _PAGE_EXECUTE is required for modules - * - */ -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ - _PAGE_EXECUTE | \ - _PAGE_CACHABLE | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SHARED) - -/* Default flags for a User page */ -#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER) - -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SPECIAL) - -/* - * We have full permissions (Read/Write/Execute/Shared). - */ -#define _PAGE_COMMON (_PAGE_PRESENT | _PAGE_USER | \ - _PAGE_CACHABLE | _PAGE_ACCESSED) - -#define PAGE_NONE __pgprot(_PAGE_CACHABLE | _PAGE_ACCESSED) -#define PAGE_SHARED __pgprot(_PAGE_COMMON | _PAGE_READ | _PAGE_WRITE | \ - _PAGE_SHARED) -#define PAGE_EXECREAD __pgprot(_PAGE_COMMON | _PAGE_READ | _PAGE_EXECUTE) - -/* - * We need to include PAGE_EXECUTE in PAGE_COPY because it is the default - * protection mode for the stack. - */ -#define PAGE_COPY PAGE_EXECREAD - -#define PAGE_READONLY __pgprot(_PAGE_COMMON | _PAGE_READ) -#define PAGE_WRITEONLY __pgprot(_PAGE_COMMON | _PAGE_WRITE) -#define PAGE_RWX __pgprot(_PAGE_COMMON | _PAGE_READ | \ - _PAGE_WRITE | _PAGE_EXECUTE) -#define PAGE_KERNEL __pgprot(_KERNPG_TABLE) - -#define PAGE_KERNEL_NOCACHE \ - __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ - _PAGE_EXECUTE | _PAGE_ACCESSED | \ - _PAGE_DIRTY | _PAGE_SHARED) - -/* Make it a device mapping for maximum safety (e.g. for mapping device - registers into user-space via /dev/map). */ -#define pgprot_noncached(x) __pgprot(((x).pgprot & ~(_PAGE_CACHABLE)) | _PAGE_DEVICE) -#define pgprot_writecombine(prot) __pgprot(pgprot_val(prot) & ~_PAGE_CACHABLE) - -/* - * PTE level access routines. - * - * Note1: - * It's the tree walk leaf. 
This is physical address to be stored. - * - * Note 2: - * Regarding the choice of _PTE_EMPTY: - - We must choose a bit pattern that cannot be valid, whether or not the page - is present. bit[2]==1 => present, bit[2]==0 => swapped out. If swapped - out, bits [31:8], [6:3], [1:0] are under swapper control, so only bit[7] is - left for us to select. If we force bit[7]==0 when swapped out, we could use - the combination bit[7,2]=2'b10 to indicate an empty PTE. Alternatively, if - we force bit[7]==1 when swapped out, we can use all zeroes to indicate - empty. This is convenient, because the page tables get cleared to zero - when they are allocated. - - */ -#define _PTE_EMPTY 0x0 -#define pte_present(x) (pte_val(x) & _PAGE_PRESENT) -#define pte_clear(mm,addr,xp) (set_pte_at(mm, addr, xp, __pte(_PTE_EMPTY))) -#define pte_none(x) (pte_val(x) == _PTE_EMPTY) - -/* - * Some definitions to translate between mem_map, PTEs, and page - * addresses: - */ - -/* - * Given a PTE, return the index of the mem_map[] entry corresponding - * to the page frame the PTE. Get the absolute physical address, make - * a relative physical address and translate it to an index. - */ -#define pte_pagenr(x) (((unsigned long) (pte_val(x)) - \ - __MEMORY_START) >> PAGE_SHIFT) - -/* - * Given a PTE, return the "struct page *". - */ -#define pte_page(x) (mem_map + pte_pagenr(x)) - -/* - * Return number of (down rounded) MB corresponding to x pages. - */ -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) - - -/* - * The following have defined behavior only work if pte_present() is true. 
- */ -static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } -static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; } -static inline int pte_special(pte_t pte){ return pte_val(pte) & _PAGE_SPECIAL; } - -static inline pte_t pte_wrprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_WRITE)); return pte; } -static inline pte_t pte_mkclean(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_DIRTY)); return pte; } -static inline pte_t pte_mkold(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_ACCESSED)); return pte; } -static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_WRITE)); return pte; } -static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } -static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_SZHUGE)); return pte; } -static inline pte_t pte_mkspecial(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_SPECIAL)); return pte; } - -/* - * Conversion functions: convert a page and protection to a page entry. 
- * - * extern pte_t mk_pte(struct page *page, pgprot_t pgprot) - */ -#define mk_pte(page,pgprot) \ -({ \ - pte_t __pte; \ - \ - set_pte(&__pte, __pte((((page)-mem_map) << PAGE_SHIFT) | \ - __MEMORY_START | pgprot_val((pgprot)))); \ - __pte; \ -}) - -/* - * This takes a (absolute) physical page address that is used - * by the remapping functions - */ -#define mk_pte_phys(physpage, pgprot) \ -({ pte_t __pte; set_pte(&__pte, __pte(physpage | pgprot_val(pgprot))); __pte; }) - -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ set_pte(&pte, __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot))); return pte; } - -/* Encode and decode a swap entry */ -#define __swp_type(x) (((x).val & 3) + (((x).val >> 1) & 0x3c)) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) ((swp_entry_t) { ((offset << 8) + ((type & 0x3c) << 1) + (type & 3)) }) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) - -#endif /* !__ASSEMBLY__ */ - -#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) - -#endif /* __ASM_SH_PGTABLE_64_H */ diff --git a/arch/sh/include/asm/posix_types.h b/arch/sh/include/asm/posix_types.h index 0d670fd94fe7..f8982b757c33 100644 --- a/arch/sh/include/asm/posix_types.h +++ b/arch/sh/include/asm/posix_types.h @@ -1,6 +1,2 @@ /* SPDX-License-Identifier: GPL-2.0 */ -# ifdef CONFIG_SUPERH32 -# include -# else -# include -# endif +#include diff --git a/arch/sh/include/asm/processor.h b/arch/sh/include/asm/processor.h index 6fbf8c80e498..3820d698846e 100644 --- a/arch/sh/include/asm/processor.h +++ b/arch/sh/include/asm/processor.h @@ -39,9 +39,6 @@ enum cpu_type { /* SH4AL-DSP types */ CPU_SH7343, CPU_SH7722, CPU_SH7366, CPU_SH7372, - /* SH-5 types */ - CPU_SH5_101, CPU_SH5_103, - /* Unknown subtype */ CPU_SH_NONE }; @@ -53,7 +50,6 @@ enum cpu_family { CPU_FAMILY_SH4, 
CPU_FAMILY_SH4A, CPU_FAMILY_SH4AL_DSP, - CPU_FAMILY_SH5, CPU_FAMILY_UNKNOWN, }; @@ -167,18 +163,12 @@ int vsyscall_init(void); */ #ifdef CONFIG_CPU_SH2A extern unsigned int instruction_size(unsigned int insn); -#elif defined(CONFIG_SUPERH32) -#define instruction_size(insn) (2) #else -#define instruction_size(insn) (4) +#define instruction_size(insn) (2) #endif #endif /* __ASSEMBLY__ */ -#ifdef CONFIG_SUPERH32 -# include -#else -# include -#endif +#include #endif /* __ASM_SH_PROCESSOR_H */ diff --git a/arch/sh/include/asm/processor_64.h b/arch/sh/include/asm/processor_64.h deleted file mode 100644 index 53efc9f51ef1..000000000000 --- a/arch/sh/include/asm/processor_64.h +++ /dev/null @@ -1,212 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_PROCESSOR_64_H -#define __ASM_SH_PROCESSOR_64_H - -/* - * include/asm-sh/processor_64.h - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 Paul Mundt - * Copyright (C) 2004 Richard Curnow - */ -#ifndef __ASSEMBLY__ - -#include -#include -#include -#include - -#endif - -/* - * User space process size: 2GB - 4k. - */ -#define TASK_SIZE 0x7ffff000UL - -#define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX STACK_TOP - -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE / 3) - -/* - * Bit of SR register - * - * FD-bit: - * When it's set, it means the processor doesn't have right to use FPU, - * and it results exception when the floating operation is executed. 
- * - * IMASK-bit: - * Interrupt level mask - * - * STEP-bit: - * Single step bit - * - */ -#if defined(CONFIG_SH64_SR_WATCH) -#define SR_MMU 0x84000000 -#else -#define SR_MMU 0x80000000 -#endif - -#define SR_IMASK 0x000000f0 -#define SR_FD 0x00008000 -#define SR_SSTEP 0x08000000 - -#ifndef __ASSEMBLY__ - -/* - * FPU structure and data : require 8-byte alignment as we need to access it - with fld.p, fst.p - */ - -struct sh_fpu_hard_struct { - unsigned long fp_regs[64]; - unsigned int fpscr; - /* long status; * software status information */ -}; - -/* Dummy fpu emulator */ -struct sh_fpu_soft_struct { - unsigned long fp_regs[64]; - unsigned int fpscr; - unsigned char lookahead; - unsigned long entry_pc; -}; - -union thread_xstate { - struct sh_fpu_hard_struct hardfpu; - struct sh_fpu_soft_struct softfpu; - /* - * The structure definitions only produce 32 bit alignment, yet we need - * to access them using 64 bit load/store as well. - */ - unsigned long long alignment_dummy; -}; - -struct thread_struct { - unsigned long sp; - unsigned long pc; - - /* Various thread flags, see SH_THREAD_xxx */ - unsigned long flags; - - /* This stores the address of the pt_regs built during a context - switch, or of the register save area built for a kernel mode - exception. It is used for backtracing the stack of a sleeping task - or one that traps in kernel mode. */ - struct pt_regs *kregs; - /* This stores the address of the pt_regs constructed on entry from - user mode. It is a fixed value over the lifetime of a process, or - NULL for a kernel thread. */ - struct pt_regs *uregs; - - unsigned long address; - /* Hardware debugging registers may come here */ - - /* floating point info */ - union thread_xstate *xstate; - - /* - * fpu_counter contains the number of consecutive context switches - * that the FPU is used. If this is over a threshold, the lazy fpu - * saving becomes unlazy to save the trap. 
This is an unsigned char - * so that after 256 times the counter wraps and the behavior turns - * lazy again; this to deal with bursty apps that only use FPU for - * a short time - */ - unsigned char fpu_counter; -}; - -#define INIT_MMAP \ -{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL } - -#define INIT_THREAD { \ - .sp = sizeof(init_stack) + \ - (long) &init_stack, \ - .pc = 0, \ - .kregs = &fake_swapper_regs, \ - .uregs = NULL, \ - .address = 0, \ - .flags = 0, \ -} - -/* - * Do necessary setup to start up a newly executed thread. - */ -#define SR_USER (SR_MMU | SR_FD) - -#define start_thread(_regs, new_pc, new_sp) \ - _regs->sr = SR_USER; /* User mode. */ \ - _regs->pc = new_pc - 4; /* Compensate syscall exit */ \ - _regs->pc |= 1; /* Set SHmedia ! */ \ - _regs->regs[18] = 0; \ - _regs->regs[15] = new_sp - -/* Forward declaration, a strange C thing */ -struct task_struct; -struct mm_struct; - -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - -/* - * FPU lazy state save handling. - */ - -static inline void disable_fpu(void) -{ - unsigned long long __dummy; - - /* Set FD flag in SR */ - __asm__ __volatile__("getcon " __SR ", %0\n\t" - "or %0, %1, %0\n\t" - "putcon %0, " __SR "\n\t" - : "=&r" (__dummy) - : "r" (SR_FD)); -} - -static inline void enable_fpu(void) -{ - unsigned long long __dummy; - - /* Clear out FD flag in SR */ - __asm__ __volatile__("getcon " __SR ", %0\n\t" - "and %0, %1, %0\n\t" - "putcon %0, " __SR "\n\t" - : "=&r" (__dummy) - : "r" (~SR_FD)); -} - -/* Round to nearest, no exceptions on inexact, overflow, underflow, - zero-divide, invalid. Configure option for whether to flush denorms to - zero, or except if a denorm is encountered. 
*/ -#if defined(CONFIG_SH64_FPU_DENORM_FLUSH) -#define FPSCR_INIT 0x00040000 -#else -#define FPSCR_INIT 0x00000000 -#endif - -#ifdef CONFIG_SH_FPU -/* Initialise the FP state of a task */ -void fpinit(struct sh_fpu_hard_struct *fpregs); -#else -#define fpinit(fpregs) do { } while (0) -#endif - -extern struct task_struct *last_task_used_math; - -/* - * Return saved PC of a blocked thread. - */ -#define thread_saved_pc(tsk) (tsk->thread.pc) - -extern unsigned long get_wchan(struct task_struct *p); - -#define KSTK_EIP(tsk) ((tsk)->thread.pc) -#define KSTK_ESP(tsk) ((tsk)->thread.sp) - -#endif /* __ASSEMBLY__ */ -#endif /* __ASM_SH_PROCESSOR_64_H */ diff --git a/arch/sh/include/asm/ptrace_64.h b/arch/sh/include/asm/ptrace_64.h deleted file mode 100644 index 6ee08229b433..000000000000 --- a/arch/sh/include/asm/ptrace_64.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_PTRACE_64_H -#define __ASM_SH_PTRACE_64_H - -#include - - -#define MAX_REG_OFFSET offsetof(struct pt_regs, tregs[7]) -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->regs[3]; -} - -#endif /* __ASM_SH_PTRACE_64_H */ diff --git a/arch/sh/include/asm/string.h b/arch/sh/include/asm/string.h index 84fc5ed9c5b3..0f6331ec28ed 100644 --- a/arch/sh/include/asm/string.h +++ b/arch/sh/include/asm/string.h @@ -1,6 +1,2 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_SUPERH32 -# include -#else -# include -#endif +#include diff --git a/arch/sh/include/asm/string_64.h b/arch/sh/include/asm/string_64.h deleted file mode 100644 index d51d6150a4e2..000000000000 --- a/arch/sh/include/asm/string_64.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_STRING_64_H -#define __ASM_SH_STRING_64_H - -#ifdef __KERNEL__ - -#define __HAVE_ARCH_MEMSET -extern void *memset(void *__s, int __c, size_t __count); - -#define __HAVE_ARCH_MEMCPY -extern void *memcpy(void *dest, const void *src, size_t count); - -#define 
__HAVE_ARCH_STRLEN -extern size_t strlen(const char *); - -#define __HAVE_ARCH_STRCPY -extern char *strcpy(char *__dest, const char *__src); - -#endif /* __KERNEL__ */ - -#endif /* __ASM_SH_STRING_64_H */ diff --git a/arch/sh/include/asm/switch_to.h b/arch/sh/include/asm/switch_to.h index 9eec80ab5aa2..bd139bcdeec1 100644 --- a/arch/sh/include/asm/switch_to.h +++ b/arch/sh/include/asm/switch_to.h @@ -4,13 +4,4 @@ * Copyright (C) 2003 Paul Mundt * Copyright (C) 2004 Richard Curnow */ -#ifndef __ASM_SH_SWITCH_TO_H -#define __ASM_SH_SWITCH_TO_H - -#ifdef CONFIG_SUPERH32 -# include -#else -# include -#endif - -#endif /* __ASM_SH_SWITCH_TO_H */ +#include diff --git a/arch/sh/include/asm/switch_to_64.h b/arch/sh/include/asm/switch_to_64.h deleted file mode 100644 index 2dbf2311669f..000000000000 --- a/arch/sh/include/asm/switch_to_64.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 Paul Mundt - * Copyright (C) 2004 Richard Curnow - */ -#ifndef __ASM_SH_SWITCH_TO_64_H -#define __ASM_SH_SWITCH_TO_64_H - -struct thread_struct; -struct task_struct; - -/* - * switch_to() should switch tasks to task nr n, first - */ -struct task_struct *sh64_switch_to(struct task_struct *prev, - struct thread_struct *prev_thread, - struct task_struct *next, - struct thread_struct *next_thread); - -#define switch_to(prev,next,last) \ -do { \ - if (last_task_used_math != next) { \ - struct pt_regs *regs = next->thread.uregs; \ - if (regs) regs->sr |= SR_FD; \ - } \ - last = sh64_switch_to(prev, &prev->thread, next, \ - &next->thread); \ -} while (0) - - -#endif /* __ASM_SH_SWITCH_TO_64_H */ diff --git a/arch/sh/include/asm/syscall.h b/arch/sh/include/asm/syscall.h index 90ba00002626..570699eb0e58 100644 --- a/arch/sh/include/asm/syscall.h +++ b/arch/sh/include/asm/syscall.h @@ -4,10 +4,6 @@ extern const unsigned long sys_call_table[]; -#ifdef CONFIG_SUPERH32 -# include -#else -# include -#endif +#include 
#endif /* __ASM_SH_SYSCALL_H */ diff --git a/arch/sh/include/asm/syscall_64.h b/arch/sh/include/asm/syscall_64.h deleted file mode 100644 index 72efcbc76f91..000000000000 --- a/arch/sh/include/asm/syscall_64.h +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_SYSCALL_64_H -#define __ASM_SH_SYSCALL_64_H - -#include -#include -#include -#include - -/* The system call number is given by the user in R9 */ -static inline long syscall_get_nr(struct task_struct *task, - struct pt_regs *regs) -{ - return (regs->syscall_nr >= 0) ? regs->regs[9] : -1L; -} - -static inline void syscall_rollback(struct task_struct *task, - struct pt_regs *regs) -{ - /* - * XXX: This needs some thought. On SH we don't - * save away the original R9 value anywhere. - */ -} - -static inline long syscall_get_error(struct task_struct *task, - struct pt_regs *regs) -{ - return IS_ERR_VALUE(regs->regs[9]) ? regs->regs[9] : 0; -} - -static inline long syscall_get_return_value(struct task_struct *task, - struct pt_regs *regs) -{ - return regs->regs[9]; -} - -static inline void syscall_set_return_value(struct task_struct *task, - struct pt_regs *regs, - int error, long val) -{ - if (error) - regs->regs[9] = -error; - else - regs->regs[9] = val; -} - -static inline void syscall_get_arguments(struct task_struct *task, - struct pt_regs *regs, - unsigned long *args) -{ - memcpy(args, ®s->regs[2], 6 * sizeof(args[0])); -} - -static inline void syscall_set_arguments(struct task_struct *task, - struct pt_regs *regs, - const unsigned long *args) -{ - memcpy(®s->regs[2], args, 6 * sizeof(args[0])); -} - -static inline int syscall_get_arch(struct task_struct *task) -{ - int arch = AUDIT_ARCH_SH; - -#ifdef CONFIG_64BIT - arch |= __AUDIT_ARCH_64BIT; -#endif -#ifdef CONFIG_CPU_LITTLE_ENDIAN - arch |= __AUDIT_ARCH_LE; -#endif - - return arch; -} -#endif /* __ASM_SH_SYSCALL_64_H */ diff --git a/arch/sh/include/asm/syscalls.h b/arch/sh/include/asm/syscalls.h index 
995ef046232c..387105316d28 100644 --- a/arch/sh/include/asm/syscalls.h +++ b/arch/sh/include/asm/syscalls.h @@ -2,8 +2,6 @@ #ifndef __ASM_SH_SYSCALLS_H #define __ASM_SH_SYSCALLS_H -#ifdef __KERNEL__ - asmlinkage int old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, int fd, unsigned long off); @@ -11,11 +9,6 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); -#ifdef CONFIG_SUPERH32 -# include -#else -# include -#endif +#include -#endif /* __KERNEL__ */ #endif /* __ASM_SH_SYSCALLS_H */ diff --git a/arch/sh/include/asm/syscalls_64.h b/arch/sh/include/asm/syscalls_64.h deleted file mode 100644 index df42656cebea..000000000000 --- a/arch/sh/include/asm/syscalls_64.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_SYSCALLS_64_H -#define __ASM_SH_SYSCALLS_64_H - -#ifdef __KERNEL__ - -#include -#include -#include - -struct pt_regs; - -/* Misc syscall related bits */ -asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs); -asmlinkage void do_syscall_trace_leave(struct pt_regs *regs); - -#endif /* __KERNEL__ */ -#endif /* __ASM_SH_SYSCALLS_64_H */ diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index cf5c792bf70b..6404be69d5fa 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -70,9 +70,7 @@ register unsigned long current_stack_pointer asm("r15") __used; static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; -#if defined(CONFIG_SUPERH64) - __asm__ __volatile__ ("getcon cr17, %0" : "=r" (ti)); -#elif defined(CONFIG_CPU_HAS_SR_RB) +#if defined(CONFIG_CPU_HAS_SR_RB) __asm__ __volatile__ ("stc r7_bank, %0" : "=r" (ti)); #else unsigned long __dummy; diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index bc77f3dd4261..360f713d009b 100644 --- a/arch/sh/include/asm/tlb.h +++ 
b/arch/sh/include/asm/tlb.h @@ -2,10 +2,6 @@ #ifndef __ASM_SH_TLB_H #define __ASM_SH_TLB_H -#ifdef CONFIG_SUPERH64 -# include -#endif - #ifndef __ASSEMBLY__ #include @@ -14,7 +10,7 @@ #include -#if defined(CONFIG_CPU_SH4) || defined(CONFIG_SUPERH64) +#if defined(CONFIG_CPU_SH4) extern void tlb_wire_entry(struct vm_area_struct *, unsigned long, pte_t); extern void tlb_unwire_entry(void); #else diff --git a/arch/sh/include/asm/tlb_64.h b/arch/sh/include/asm/tlb_64.h deleted file mode 100644 index 59fa0a23dad7..000000000000 --- a/arch/sh/include/asm/tlb_64.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * include/asm-sh/tlb_64.h - * - * Copyright (C) 2003 Paul Mundt - */ -#ifndef __ASM_SH_TLB_64_H -#define __ASM_SH_TLB_64_H - -/* ITLB defines */ -#define ITLB_FIXED 0x00000000 /* First fixed ITLB, see head.S */ -#define ITLB_LAST_VAR_UNRESTRICTED 0x000003F0 /* Last ITLB */ - -/* DTLB defines */ -#define DTLB_FIXED 0x00800000 /* First fixed DTLB, see head.S */ -#define DTLB_LAST_VAR_UNRESTRICTED 0x008003F0 /* Last DTLB */ - -#ifndef __ASSEMBLY__ - -/** - * for_each_dtlb_entry - Iterate over free (non-wired) DTLB entries - * - * @tlb: TLB entry - */ -#define for_each_dtlb_entry(tlb) \ - for (tlb = cpu_data->dtlb.first; \ - tlb <= cpu_data->dtlb.last; \ - tlb += cpu_data->dtlb.step) - -/** - * for_each_itlb_entry - Iterate over free (non-wired) ITLB entries - * - * @tlb: TLB entry - */ -#define for_each_itlb_entry(tlb) \ - for (tlb = cpu_data->itlb.first; \ - tlb <= cpu_data->itlb.last; \ - tlb += cpu_data->itlb.step) - -/** - * __flush_tlb_slot - Flushes TLB slot @slot. - * - * @slot: Address of TLB slot. 
- */ -static inline void __flush_tlb_slot(unsigned long long slot) -{ - __asm__ __volatile__ ("putcfg %0, 0, r63\n" : : "r" (slot)); -} - -#ifdef CONFIG_MMU -/* arch/sh64/mm/tlb.c */ -int sh64_tlb_init(void); -unsigned long long sh64_next_free_dtlb_entry(void); -unsigned long long sh64_get_wired_dtlb_entry(void); -int sh64_put_wired_dtlb_entry(unsigned long long entry); -void sh64_setup_tlb_slot(unsigned long long config_addr, unsigned long eaddr, - unsigned long asid, unsigned long paddr); -void sh64_teardown_tlb_slot(unsigned long long config_addr); -#else -#define sh64_tlb_init() do { } while (0) -#define sh64_next_free_dtlb_entry() (0) -#define sh64_get_wired_dtlb_entry() (0) -#define sh64_put_wired_dtlb_entry(entry) do { } while (0) -#define sh64_setup_tlb_slot(conf, virt, asid, phys) do { } while (0) -#define sh64_teardown_tlb_slot(addr) do { } while (0) -#endif /* CONFIG_MMU */ -#endif /* __ASSEMBLY__ */ -#endif /* __ASM_SH_TLB_64_H */ diff --git a/arch/sh/include/asm/traps.h b/arch/sh/include/asm/traps.h index 8844ed0c0fde..ba831bc7e08f 100644 --- a/arch/sh/include/asm/traps.h +++ b/arch/sh/include/asm/traps.h @@ -4,11 +4,7 @@ #include -#ifdef CONFIG_SUPERH32 # include -#else -# include -#endif BUILD_TRAP_HANDLER(address_error); BUILD_TRAP_HANDLER(debug); diff --git a/arch/sh/include/asm/traps_64.h b/arch/sh/include/asm/traps_64.h deleted file mode 100644 index f28db6dfbe45..000000000000 --- a/arch/sh/include/asm/traps_64.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 Paul Mundt - * Copyright (C) 2004 Richard Curnow - */ -#ifndef __ASM_SH_TRAPS_64_H -#define __ASM_SH_TRAPS_64_H - -#include - -extern void phys_stext(void); - -#define lookup_exception_vector() \ -({ \ - unsigned long _vec; \ - \ - __asm__ __volatile__ ( \ - "getcon " __EXPEVT ", %0\n\t" \ - : "=r" (_vec) \ - ); \ - \ - _vec; \ -}) - -static inline void trigger_address_error(void) -{ - phys_stext(); 
-} - -#define BUILD_TRAP_HANDLER(name) \ -asmlinkage void name##_trap_handler(unsigned int vec, struct pt_regs *regs) -#define TRAP_HANDLER_DECL - -#endif /* __ASM_SH_TRAPS_64_H */ diff --git a/arch/sh/include/asm/types.h b/arch/sh/include/asm/types.h index df96c511bb6e..68eb24ad2013 100644 --- a/arch/sh/include/asm/types.h +++ b/arch/sh/include/asm/types.h @@ -9,13 +9,8 @@ */ #ifndef __ASSEMBLY__ -#ifdef CONFIG_SUPERH32 typedef u16 insn_size_t; typedef u32 reg_size_t; -#else -typedef u32 insn_size_t; -typedef u64 reg_size_t; -#endif #endif /* __ASSEMBLY__ */ #endif /* __ASM_SH_TYPES_H */ diff --git a/arch/sh/include/asm/uaccess.h b/arch/sh/include/asm/uaccess.h index 5fe751ad7582..73f3b48d4a34 100644 --- a/arch/sh/include/asm/uaccess.h +++ b/arch/sh/include/asm/uaccess.h @@ -96,11 +96,7 @@ struct __large_struct { unsigned long buf[100]; }; __pu_err; \ }) -#ifdef CONFIG_SUPERH32 # include -#else -# include -#endif extern long strncpy_from_user(char *dest, const char __user *src, long count); diff --git a/arch/sh/include/asm/uaccess_64.h b/arch/sh/include/asm/uaccess_64.h deleted file mode 100644 index 0c19d02dc566..000000000000 --- a/arch/sh/include/asm/uaccess_64.h +++ /dev/null @@ -1,85 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_UACCESS_64_H -#define __ASM_SH_UACCESS_64_H - -/* - * include/asm-sh/uaccess_64.h - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003, 2004 Paul Mundt - * - * User space memory access functions - * - * Copyright (C) 1999 Niibe Yutaka - * - * Based on: - * MIPS implementation version 1.15 by - * Copyright (C) 1996, 1997, 1998 by Ralf Baechle - * and i386 version. 
- */ - -#define __get_user_size(x,ptr,size,retval) \ -do { \ - retval = 0; \ - x = 0; \ - switch (size) { \ - case 1: \ - retval = __get_user_asm_b((void *)&x, \ - (long)ptr); \ - break; \ - case 2: \ - retval = __get_user_asm_w((void *)&x, \ - (long)ptr); \ - break; \ - case 4: \ - retval = __get_user_asm_l((void *)&x, \ - (long)ptr); \ - break; \ - case 8: \ - retval = __get_user_asm_q((void *)&x, \ - (long)ptr); \ - break; \ - default: \ - __get_user_unknown(); \ - break; \ - } \ -} while (0) - -extern long __get_user_asm_b(void *, long); -extern long __get_user_asm_w(void *, long); -extern long __get_user_asm_l(void *, long); -extern long __get_user_asm_q(void *, long); -extern void __get_user_unknown(void); - -#define __put_user_size(x,ptr,size,retval) \ -do { \ - retval = 0; \ - switch (size) { \ - case 1: \ - retval = __put_user_asm_b((void *)&x, \ - (__force long)ptr); \ - break; \ - case 2: \ - retval = __put_user_asm_w((void *)&x, \ - (__force long)ptr); \ - break; \ - case 4: \ - retval = __put_user_asm_l((void *)&x, \ - (__force long)ptr); \ - break; \ - case 8: \ - retval = __put_user_asm_q((void *)&x, \ - (__force long)ptr); \ - break; \ - default: \ - __put_user_unknown(); \ - } \ -} while (0) - -extern long __put_user_asm_b(void *, long); -extern long __put_user_asm_w(void *, long); -extern long __put_user_asm_l(void *, long); -extern long __put_user_asm_q(void *, long); -extern void __put_user_unknown(void); - -#endif /* __ASM_SH_UACCESS_64_H */ diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h index 9c7d9d9999c6..d6e126250136 100644 --- a/arch/sh/include/asm/unistd.h +++ b/arch/sh/include/asm/unistd.h @@ -1,9 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -# ifdef CONFIG_SUPERH32 -# include -# else -# include -# endif +#include #define NR_syscalls __NR_syscalls diff --git a/arch/sh/include/asm/user.h b/arch/sh/include/asm/user.h index e97f2efed527..7dfd3f6461e6 100644 --- a/arch/sh/include/asm/user.h +++ 
b/arch/sh/include/asm/user.h @@ -28,19 +28,12 @@ * to write an integer number of pages. */ -#if defined(__SH5__) || defined(CONFIG_CPU_SH5) -struct user_fpu_struct { - unsigned long fp_regs[32]; - unsigned int fpscr; -}; -#else struct user_fpu_struct { unsigned long fp_regs[16]; unsigned long xfp_regs[16]; unsigned long fpscr; unsigned long fpul; }; -#endif struct user { struct pt_regs regs; /* entire machine state */ diff --git a/arch/sh/include/asm/vermagic.h b/arch/sh/include/asm/vermagic.h index 13d8eaa9188e..5b2057c39170 100644 --- a/arch/sh/include/asm/vermagic.h +++ b/arch/sh/include/asm/vermagic.h @@ -10,8 +10,6 @@ # define MODULE_PROC_FAMILY "SH3LE " # elif defined CONFIG_CPU_SH4 # define MODULE_PROC_FAMILY "SH4LE " -# elif defined CONFIG_CPU_SH5 -# define MODULE_PROC_FAMILY "SH5LE " # else # error unknown processor family # endif @@ -22,8 +20,6 @@ # define MODULE_PROC_FAMILY "SH3BE " # elif defined CONFIG_CPU_SH4 # define MODULE_PROC_FAMILY "SH4BE " -# elif defined CONFIG_CPU_SH5 -# define MODULE_PROC_FAMILY "SH5BE " # else # error unknown processor family # endif diff --git a/arch/sh/include/asm/vmlinux.lds.h b/arch/sh/include/asm/vmlinux.lds.h index 992955685874..8d96c4f9b35b 100644 --- a/arch/sh/include/asm/vmlinux.lds.h +++ b/arch/sh/include/asm/vmlinux.lds.h @@ -15,12 +15,4 @@ #define DWARF_EH_FRAME #endif -#ifdef CONFIG_SUPERH64 -#define EXTRA_TEXT \ - *(.text64) \ - *(.text..SHmedia32) -#else -#define EXTRA_TEXT -#endif - #endif /* __ASM_SH_VMLINUX_LDS_H */ diff --git a/arch/sh/include/cpu-sh5/cpu/addrspace.h b/arch/sh/include/cpu-sh5/cpu/addrspace.h deleted file mode 100644 index 6dd1e72f31b2..000000000000 --- a/arch/sh/include/cpu-sh5/cpu/addrspace.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_CPU_SH5_ADDRSPACE_H -#define __ASM_SH_CPU_SH5_ADDRSPACE_H - -#define PHYS_PERIPHERAL_BLOCK 0x09000000 -#define PHYS_DMAC_BLOCK 0x0e000000 -#define PHYS_PCI_BLOCK 0x60000000 -#define PHYS_EMI_BLOCK 0xff000000 - 
-/* No segmentation.. */ - -#endif /* __ASM_SH_CPU_SH5_ADDRSPACE_H */ diff --git a/arch/sh/include/cpu-sh5/cpu/cache.h b/arch/sh/include/cpu-sh5/cpu/cache.h deleted file mode 100644 index ef49538f386f..000000000000 --- a/arch/sh/include/cpu-sh5/cpu/cache.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_CPU_SH5_CACHE_H -#define __ASM_SH_CPU_SH5_CACHE_H - -/* - * include/asm-sh/cpu-sh5/cache.h - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003, 2004 Paul Mundt - */ - -#define L1_CACHE_SHIFT 5 - -/* Valid and Dirty bits */ -#define SH_CACHE_VALID (1LL<<0) -#define SH_CACHE_UPDATED (1LL<<57) - -/* Unimplemented compat bits.. */ -#define SH_CACHE_COMBINED 0 -#define SH_CACHE_ASSOC 0 - -/* Cache flags */ -#define SH_CACHE_MODE_WT (1LL<<0) -#define SH_CACHE_MODE_WB (1LL<<1) - -/* - * Control Registers. - */ -#define ICCR_BASE 0x01600000 /* Instruction Cache Control Register */ -#define ICCR_REG0 0 /* Register 0 offset */ -#define ICCR_REG1 1 /* Register 1 offset */ -#define ICCR0 ICCR_BASE+ICCR_REG0 -#define ICCR1 ICCR_BASE+ICCR_REG1 - -#define ICCR0_OFF 0x0 /* Set ICACHE off */ -#define ICCR0_ON 0x1 /* Set ICACHE on */ -#define ICCR0_ICI 0x2 /* Invalidate all in IC */ - -#define ICCR1_NOLOCK 0x0 /* Set No Locking */ - -#define OCCR_BASE 0x01E00000 /* Operand Cache Control Register */ -#define OCCR_REG0 0 /* Register 0 offset */ -#define OCCR_REG1 1 /* Register 1 offset */ -#define OCCR0 OCCR_BASE+OCCR_REG0 -#define OCCR1 OCCR_BASE+OCCR_REG1 - -#define OCCR0_OFF 0x0 /* Set OCACHE off */ -#define OCCR0_ON 0x1 /* Set OCACHE on */ -#define OCCR0_OCI 0x2 /* Invalidate all in OC */ -#define OCCR0_WT 0x4 /* Set OCACHE in WT Mode */ -#define OCCR0_WB 0x0 /* Set OCACHE in WB Mode */ - -#define OCCR1_NOLOCK 0x0 /* Set No Locking */ - -/* - * SH-5 - * A bit of description here, for neff=32. 
- * - * |<--- tag (19 bits) --->| - * +-----------------------------+-----------------+------+----------+------+ - * | | | ways |set index |offset| - * +-----------------------------+-----------------+------+----------+------+ - * ^ 2 bits 8 bits 5 bits - * +- Bit 31 - * - * Cacheline size is based on offset: 5 bits = 32 bytes per line - * A cache line is identified by a tag + set but OCACHETAG/ICACHETAG - * have a broader space for registers. These are outlined by - * CACHE_?C_*_STEP below. - * - */ - -/* Instruction cache */ -#define CACHE_IC_ADDRESS_ARRAY 0x01000000 - -/* Operand Cache */ -#define CACHE_OC_ADDRESS_ARRAY 0x01800000 - -/* These declarations relate to cache 'synonyms' in the operand cache. A - 'synonym' occurs where effective address bits overlap between those used for - indexing the cache sets and those passed to the MMU for translation. In the - case of SH5-101 & SH5-103, only bit 12 is affected for 4k pages. */ - -#define CACHE_OC_N_SYNBITS 1 /* Number of synonym bits */ -#define CACHE_OC_SYN_SHIFT 12 -/* Mask to select synonym bit(s) */ -#define CACHE_OC_SYN_MASK (((1UL<). -** Assigns symbolic names to control & target registers. -*/ - -/* - * Define some useful aliases for control registers. 
- */ -#define SR cr0 -#define SSR cr1 -#define PSSR cr2 - /* cr3 UNDEFINED */ -#define INTEVT cr4 -#define EXPEVT cr5 -#define PEXPEVT cr6 -#define TRA cr7 -#define SPC cr8 -#define PSPC cr9 -#define RESVEC cr10 -#define VBR cr11 - /* cr12 UNDEFINED */ -#define TEA cr13 - /* cr14-cr15 UNDEFINED */ -#define DCR cr16 -#define KCR0 cr17 -#define KCR1 cr18 - /* cr19-cr31 UNDEFINED */ - /* cr32-cr61 RESERVED */ -#define CTC cr62 -#define USR cr63 - -/* - * ABI dependent registers (general purpose set) - */ -#define RET r2 -#define ARG1 r2 -#define ARG2 r3 -#define ARG3 r4 -#define ARG4 r5 -#define ARG5 r6 -#define ARG6 r7 -#define SP r15 -#define LINK r18 -#define ZERO r63 - -/* - * Status register defines: used only by assembly sources (and - * syntax independednt) - */ -#define SR_RESET_VAL 0x0000000050008000 -#define SR_HARMLESS 0x00000000500080f0 /* Write ignores for most */ -#define SR_ENABLE_FPU 0xffffffffffff7fff /* AND with this */ - -#if defined (CONFIG_SH64_SR_WATCH) -#define SR_ENABLE_MMU 0x0000000084000000 /* OR with this */ -#else -#define SR_ENABLE_MMU 0x0000000080000000 /* OR with this */ -#endif - -#define SR_UNBLOCK_EXC 0xffffffffefffffff /* AND with this */ -#define SR_BLOCK_EXC 0x0000000010000000 /* OR with this */ - -#else /* Not __ASSEMBLY__ syntax */ - -/* -** Stringify reg. 
name -*/ -#define __str(x) #x - -/* Stringify control register names for use in inline assembly */ -#define __SR __str(SR) -#define __SSR __str(SSR) -#define __PSSR __str(PSSR) -#define __INTEVT __str(INTEVT) -#define __EXPEVT __str(EXPEVT) -#define __PEXPEVT __str(PEXPEVT) -#define __TRA __str(TRA) -#define __SPC __str(SPC) -#define __PSPC __str(PSPC) -#define __RESVEC __str(RESVEC) -#define __VBR __str(VBR) -#define __TEA __str(TEA) -#define __DCR __str(DCR) -#define __KCR0 __str(KCR0) -#define __KCR1 __str(KCR1) -#define __CTC __str(CTC) -#define __USR __str(USR) - -#endif /* __ASSEMBLY__ */ -#endif /* __ASM_SH_CPU_SH5_REGISTERS_H */ diff --git a/arch/sh/include/cpu-sh5/cpu/rtc.h b/arch/sh/include/cpu-sh5/cpu/rtc.h deleted file mode 100644 index d7e25d435f4a..000000000000 --- a/arch/sh/include/cpu-sh5/cpu/rtc.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SH_CPU_SH5_RTC_H -#define __ASM_SH_CPU_SH5_RTC_H - -#define rtc_reg_size sizeof(u32) -#define RTC_BIT_INVERTED 0 /* The SH-5 RTC is surprisingly sane! 
*/ -#define RTC_DEF_CAPABILITIES RTC_CAP_4_DIGIT_YEAR - -#endif /* __ASM_SH_CPU_SH5_RTC_H */ diff --git a/arch/sh/include/uapi/asm/posix_types.h b/arch/sh/include/uapi/asm/posix_types.h index 2644fdd444e6..adc998a64c76 100644 --- a/arch/sh/include/uapi/asm/posix_types.h +++ b/arch/sh/include/uapi/asm/posix_types.h @@ -1,8 +1,2 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __KERNEL__ -# ifdef __SH5__ -# include -# else -# include -# endif -#endif /* __KERNEL__ */ +#include diff --git a/arch/sh/include/uapi/asm/posix_types_64.h b/arch/sh/include/uapi/asm/posix_types_64.h deleted file mode 100644 index 3a9128d4aee3..000000000000 --- a/arch/sh/include/uapi/asm/posix_types_64.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __ASM_SH_POSIX_TYPES_64_H -#define __ASM_SH_POSIX_TYPES_64_H - -typedef unsigned short __kernel_mode_t; -#define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_ipc_pid_t; -#define __kernel_ipc_pid_t __kernel_ipc_pid_t -typedef unsigned short __kernel_uid_t; -#define __kernel_uid_t __kernel_uid_t -typedef unsigned short __kernel_gid_t; -#define __kernel_gid_t __kernel_gid_t -typedef long unsigned int __kernel_size_t; -#define __kernel_size_t __kernel_size_t -typedef int __kernel_ssize_t; -#define __kernel_ssize_t __kernel_ssize_t -typedef int __kernel_ptrdiff_t; -#define __kernel_ptrdiff_t __kernel_ptrdiff_t - -typedef unsigned short __kernel_old_uid_t; -#define __kernel_old_uid_t __kernel_old_uid_t -typedef unsigned short __kernel_old_gid_t; -#define __kernel_old_gid_t __kernel_old_gid_t -typedef unsigned short __kernel_old_dev_t; -#define __kernel_old_dev_t __kernel_old_dev_t - -#include - -#endif /* __ASM_SH_POSIX_TYPES_64_H */ diff --git a/arch/sh/include/uapi/asm/ptrace.h b/arch/sh/include/uapi/asm/ptrace.h index 4ec9c2b65fdb..5c88e46b7773 100644 --- a/arch/sh/include/uapi/asm/ptrace.h +++ b/arch/sh/include/uapi/asm/ptrace.h @@ -25,11 +25,6 @@ 
#define PT_DATA_ADDR 248 /* &(struct user)->start_data */ #define PT_TEXT_LEN 252 -#if defined(__SH5__) || defined(CONFIG_CPU_SH5) -#include -#else #include -#endif - #endif /* _UAPI__ASM_SH_PTRACE_H */ diff --git a/arch/sh/include/uapi/asm/ptrace_64.h b/arch/sh/include/uapi/asm/ptrace_64.h deleted file mode 100644 index a6f84eba5277..000000000000 --- a/arch/sh/include/uapi/asm/ptrace_64.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI__ASM_SH_PTRACE_64_H -#define _UAPI__ASM_SH_PTRACE_64_H - -struct pt_regs { - unsigned long long pc; - unsigned long long sr; - long long syscall_nr; - unsigned long long regs[63]; - unsigned long long tregs[8]; - unsigned long long pad[2]; -}; - - -#endif /* _UAPI__ASM_SH_PTRACE_64_H */ diff --git a/arch/sh/include/uapi/asm/sigcontext.h b/arch/sh/include/uapi/asm/sigcontext.h index d2b7e4f033c0..a9cc8bad0f36 100644 --- a/arch/sh/include/uapi/asm/sigcontext.h +++ b/arch/sh/include/uapi/asm/sigcontext.h @@ -5,18 +5,6 @@ struct sigcontext { unsigned long oldmask; -#if defined(__SH5__) || defined(CONFIG_CPU_SH5) - /* CPU registers */ - unsigned long long sc_regs[63]; - unsigned long long sc_tregs[8]; - unsigned long long sc_pc; - unsigned long long sc_sr; - - /* FPU registers */ - unsigned long long sc_fpregs[32]; - unsigned int sc_fpscr; - unsigned int sc_fpvalid; -#else /* CPU registers */ unsigned long sc_regs[16]; unsigned long sc_pc; @@ -32,7 +20,6 @@ struct sigcontext { unsigned int sc_fpscr; unsigned int sc_fpul; unsigned int sc_ownedfp; -#endif }; #endif /* __ASM_SH_SIGCONTEXT_H */ diff --git a/arch/sh/include/uapi/asm/stat.h b/arch/sh/include/uapi/asm/stat.h index 659b87c7c25a..b0ca755ea08d 100644 --- a/arch/sh/include/uapi/asm/stat.h +++ b/arch/sh/include/uapi/asm/stat.h @@ -16,66 +16,6 @@ struct __old_kernel_stat { unsigned long st_ctime; }; -#if defined(__SH5__) || defined(CONFIG_CPU_SH5) -struct stat { - unsigned short st_dev; - unsigned short __pad1; - unsigned 
long st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; - unsigned short st_rdev; - unsigned short __pad2; - unsigned long st_size; - unsigned long st_blksize; - unsigned long st_blocks; - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - unsigned long __unused4; - unsigned long __unused5; -}; - -/* This matches struct stat64 in glibc2.1, hence the absolutely - * insane amounts of padding around dev_t's. - */ -struct stat64 { - unsigned short st_dev; - unsigned char __pad0[10]; - - unsigned long st_ino; - unsigned int st_mode; - unsigned int st_nlink; - - unsigned long st_uid; - unsigned long st_gid; - - unsigned short st_rdev; - unsigned char __pad3[10]; - - long long st_size; - unsigned long st_blksize; - - unsigned long st_blocks; /* Number 512-byte blocks allocated. */ - unsigned long __pad4; /* future possible st_blocks high bits */ - - unsigned long st_atime; - unsigned long st_atime_nsec; - - unsigned long st_mtime; - unsigned long st_mtime_nsec; - - unsigned long st_ctime; - unsigned long st_ctime_nsec; /* will be high 32 bits of ctime someday */ - - unsigned long __unused1; - unsigned long __unused2; -}; -#else struct stat { unsigned long st_dev; unsigned long st_ino; @@ -134,6 +74,5 @@ struct stat64 { }; #define STAT_HAVE_NSEC 1 -#endif #endif /* __ASM_SH_STAT_H */ diff --git a/arch/sh/include/uapi/asm/swab.h b/arch/sh/include/uapi/asm/swab.h index f0b02152745c..c727d381a30a 100644 --- a/arch/sh/include/uapi/asm/swab.h +++ b/arch/sh/include/uapi/asm/swab.h @@ -13,14 +13,9 @@ static inline __attribute_const__ __u32 __arch_swab32(__u32 x) { __asm__( -#ifdef __SH5__ - "byterev %1, %0\n\t" - "shari %0, 32, %0" -#else "swap.b %1, %0\n\t" "swap.w %0, %0\n\t" "swap.b %0, %0" -#endif : "=r" (x) : "r" (x)); @@ -31,12 +26,7 @@ static inline __attribute_const__ __u32 __arch_swab32(__u32 
x) static inline __attribute_const__ __u16 __arch_swab16(__u16 x) { __asm__( -#ifdef __SH5__ - "byterev %1, %0\n\t" - "shari %0, 32, %0" -#else "swap.b %1, %0" -#endif : "=r" (x) : "r" (x)); diff --git a/arch/sh/include/uapi/asm/unistd.h b/arch/sh/include/uapi/asm/unistd.h index 9e0b4e5e6da2..0f7c7772a2fb 100644 --- a/arch/sh/include/uapi/asm/unistd.h +++ b/arch/sh/include/uapi/asm/unistd.h @@ -1,8 +1,2 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __KERNEL__ -# ifdef __SH5__ -# include -# else -# include -# endif -#endif +#include diff --git a/arch/sh/include/uapi/asm/unistd_64.h b/arch/sh/include/uapi/asm/unistd_64.h deleted file mode 100644 index 75da54851f02..000000000000 --- a/arch/sh/include/uapi/asm/unistd_64.h +++ /dev/null @@ -1,423 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __ASM_SH_UNISTD_64_H -#define __ASM_SH_UNISTD_64_H - -/* - * include/asm-sh/unistd_64.h - * - * This file contains the system call numbers. - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 - 2007 Paul Mundt - * Copyright (C) 2004 Sean McGoogan - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. 
- */ -#define __NR_restart_syscall 0 -#define __NR_exit 1 -#define __NR_fork 2 -#define __NR_read 3 -#define __NR_write 4 -#define __NR_open 5 -#define __NR_close 6 -#define __NR_waitpid 7 -#define __NR_creat 8 -#define __NR_link 9 -#define __NR_unlink 10 -#define __NR_execve 11 -#define __NR_chdir 12 -#define __NR_time 13 -#define __NR_mknod 14 -#define __NR_chmod 15 -#define __NR_lchown 16 - /* 17 was sys_break */ -#define __NR_oldstat 18 -#define __NR_lseek 19 -#define __NR_getpid 20 -#define __NR_mount 21 -#define __NR_umount 22 -#define __NR_setuid 23 -#define __NR_getuid 24 -#define __NR_stime 25 -#define __NR_ptrace 26 -#define __NR_alarm 27 -#define __NR_oldfstat 28 -#define __NR_pause 29 -#define __NR_utime 30 - /* 31 was sys_stty */ - /* 32 was sys_gtty */ -#define __NR_access 33 -#define __NR_nice 34 - /* 35 was sys_ftime */ -#define __NR_sync 36 -#define __NR_kill 37 -#define __NR_rename 38 -#define __NR_mkdir 39 -#define __NR_rmdir 40 -#define __NR_dup 41 -#define __NR_pipe 42 -#define __NR_times 43 - /* 44 was sys_prof */ -#define __NR_brk 45 -#define __NR_setgid 46 -#define __NR_getgid 47 -#define __NR_signal 48 -#define __NR_geteuid 49 -#define __NR_getegid 50 -#define __NR_acct 51 -#define __NR_umount2 52 - /* 53 was sys_lock */ -#define __NR_ioctl 54 -#define __NR_fcntl 55 - /* 56 was sys_mpx */ -#define __NR_setpgid 57 - /* 58 was sys_ulimit */ - /* 59 was sys_olduname */ -#define __NR_umask 60 -#define __NR_chroot 61 -#define __NR_ustat 62 -#define __NR_dup2 63 -#define __NR_getppid 64 -#define __NR_getpgrp 65 -#define __NR_setsid 66 -#define __NR_sigaction 67 -#define __NR_sgetmask 68 -#define __NR_ssetmask 69 -#define __NR_setreuid 70 -#define __NR_setregid 71 -#define __NR_sigsuspend 72 -#define __NR_sigpending 73 -#define __NR_sethostname 74 -#define __NR_setrlimit 75 -#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */ -#define __NR_getrusage 77 -#define __NR_gettimeofday 78 -#define __NR_settimeofday 79 -#define 
__NR_getgroups 80 -#define __NR_setgroups 81 - /* 82 was sys_select */ -#define __NR_symlink 83 -#define __NR_oldlstat 84 -#define __NR_readlink 85 -#define __NR_uselib 86 -#define __NR_swapon 87 -#define __NR_reboot 88 -#define __NR_readdir 89 -#define __NR_mmap 90 -#define __NR_munmap 91 -#define __NR_truncate 92 -#define __NR_ftruncate 93 -#define __NR_fchmod 94 -#define __NR_fchown 95 -#define __NR_getpriority 96 -#define __NR_setpriority 97 - /* 98 was sys_profil */ -#define __NR_statfs 99 -#define __NR_fstatfs 100 - /* 101 was sys_ioperm */ -#define __NR_socketcall 102 /* old implementation of socket systemcall */ -#define __NR_syslog 103 -#define __NR_setitimer 104 -#define __NR_getitimer 105 -#define __NR_stat 106 -#define __NR_lstat 107 -#define __NR_fstat 108 -#define __NR_olduname 109 - /* 110 was sys_iopl */ -#define __NR_vhangup 111 - /* 112 was sys_idle */ - /* 113 was sys_vm86old */ -#define __NR_wait4 114 -#define __NR_swapoff 115 -#define __NR_sysinfo 116 -#define __NR_ipc 117 -#define __NR_fsync 118 -#define __NR_sigreturn 119 -#define __NR_clone 120 -#define __NR_setdomainname 121 -#define __NR_uname 122 -#define __NR_cacheflush 123 -#define __NR_adjtimex 124 -#define __NR_mprotect 125 -#define __NR_sigprocmask 126 - /* 127 was sys_create_module */ -#define __NR_init_module 128 -#define __NR_delete_module 129 - /* 130 was sys_get_kernel_syms */ -#define __NR_quotactl 131 -#define __NR_getpgid 132 -#define __NR_fchdir 133 -#define __NR_bdflush 134 -#define __NR_sysfs 135 -#define __NR_personality 136 - /* 137 was sys_afs_syscall */ -#define __NR_setfsuid 138 -#define __NR_setfsgid 139 -#define __NR__llseek 140 -#define __NR_getdents 141 -#define __NR__newselect 142 -#define __NR_flock 143 -#define __NR_msync 144 -#define __NR_readv 145 -#define __NR_writev 146 -#define __NR_getsid 147 -#define __NR_fdatasync 148 -#define __NR__sysctl 149 -#define __NR_mlock 150 -#define __NR_munlock 151 -#define __NR_mlockall 152 -#define __NR_munlockall 153 
-#define __NR_sched_setparam 154 -#define __NR_sched_getparam 155 -#define __NR_sched_setscheduler 156 -#define __NR_sched_getscheduler 157 -#define __NR_sched_yield 158 -#define __NR_sched_get_priority_max 159 -#define __NR_sched_get_priority_min 160 -#define __NR_sched_rr_get_interval 161 -#define __NR_nanosleep 162 -#define __NR_mremap 163 -#define __NR_setresuid 164 -#define __NR_getresuid 165 - /* 166 was sys_vm86 */ - /* 167 was sys_query_module */ -#define __NR_poll 168 -#define __NR_nfsservctl 169 -#define __NR_setresgid 170 -#define __NR_getresgid 171 -#define __NR_prctl 172 -#define __NR_rt_sigreturn 173 -#define __NR_rt_sigaction 174 -#define __NR_rt_sigprocmask 175 -#define __NR_rt_sigpending 176 -#define __NR_rt_sigtimedwait 177 -#define __NR_rt_sigqueueinfo 178 -#define __NR_rt_sigsuspend 179 -#define __NR_pread64 180 -#define __NR_pwrite64 181 -#define __NR_chown 182 -#define __NR_getcwd 183 -#define __NR_capget 184 -#define __NR_capset 185 -#define __NR_sigaltstack 186 -#define __NR_sendfile 187 - /* 188 reserved for getpmsg */ - /* 189 reserved for putpmsg */ -#define __NR_vfork 190 -#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */ -#define __NR_mmap2 192 -#define __NR_truncate64 193 -#define __NR_ftruncate64 194 -#define __NR_stat64 195 -#define __NR_lstat64 196 -#define __NR_fstat64 197 -#define __NR_lchown32 198 -#define __NR_getuid32 199 -#define __NR_getgid32 200 -#define __NR_geteuid32 201 -#define __NR_getegid32 202 -#define __NR_setreuid32 203 -#define __NR_setregid32 204 -#define __NR_getgroups32 205 -#define __NR_setgroups32 206 -#define __NR_fchown32 207 -#define __NR_setresuid32 208 -#define __NR_getresuid32 209 -#define __NR_setresgid32 210 -#define __NR_getresgid32 211 -#define __NR_chown32 212 -#define __NR_setuid32 213 -#define __NR_setgid32 214 -#define __NR_setfsuid32 215 -#define __NR_setfsgid32 216 -#define __NR_pivot_root 217 -#define __NR_mincore 218 -#define __NR_madvise 219 - -/* Non-multiplexed socket family */ 
-#define __NR_socket 220 -#define __NR_bind 221 -#define __NR_connect 222 -#define __NR_listen 223 -#define __NR_accept 224 -#define __NR_getsockname 225 -#define __NR_getpeername 226 -#define __NR_socketpair 227 -#define __NR_send 228 -#define __NR_sendto 229 -#define __NR_recv 230 -#define __NR_recvfrom 231 -#define __NR_shutdown 232 -#define __NR_setsockopt 233 -#define __NR_getsockopt 234 -#define __NR_sendmsg 235 -#define __NR_recvmsg 236 - -/* Non-multiplexed IPC family */ -#define __NR_semop 237 -#define __NR_semget 238 -#define __NR_semctl 239 -#define __NR_msgsnd 240 -#define __NR_msgrcv 241 -#define __NR_msgget 242 -#define __NR_msgctl 243 -#define __NR_shmat 244 -#define __NR_shmdt 245 -#define __NR_shmget 246 -#define __NR_shmctl 247 - -#define __NR_getdents64 248 -#define __NR_fcntl64 249 - /* 250 is reserved for tux */ - /* 251 is unused */ -#define __NR_gettid 252 -#define __NR_readahead 253 -#define __NR_setxattr 254 -#define __NR_lsetxattr 255 -#define __NR_fsetxattr 256 -#define __NR_getxattr 257 -#define __NR_lgetxattr 258 -#define __NR_fgetxattr 259 -#define __NR_listxattr 260 -#define __NR_llistxattr 261 -#define __NR_flistxattr 262 -#define __NR_removexattr 263 -#define __NR_lremovexattr 264 -#define __NR_fremovexattr 265 -#define __NR_tkill 266 -#define __NR_sendfile64 267 -#define __NR_futex 268 -#define __NR_sched_setaffinity 269 -#define __NR_sched_getaffinity 270 - /* 271 is reserved for set_thread_area */ - /* 272 is reserved for get_thread_area */ -#define __NR_io_setup 273 -#define __NR_io_destroy 274 -#define __NR_io_getevents 275 -#define __NR_io_submit 276 -#define __NR_io_cancel 277 -#define __NR_fadvise64 278 - /* 279 is unused */ -#define __NR_exit_group 280 - -#define __NR_lookup_dcookie 281 -#define __NR_epoll_create 282 -#define __NR_epoll_ctl 283 -#define __NR_epoll_wait 284 -#define __NR_remap_file_pages 285 -#define __NR_set_tid_address 286 -#define __NR_timer_create 287 -#define __NR_timer_settime (__NR_timer_create+1) 
-#define __NR_timer_gettime (__NR_timer_create+2) -#define __NR_timer_getoverrun (__NR_timer_create+3) -#define __NR_timer_delete (__NR_timer_create+4) -#define __NR_clock_settime (__NR_timer_create+5) -#define __NR_clock_gettime (__NR_timer_create+6) -#define __NR_clock_getres (__NR_timer_create+7) -#define __NR_clock_nanosleep (__NR_timer_create+8) -#define __NR_statfs64 296 -#define __NR_fstatfs64 297 -#define __NR_tgkill 298 -#define __NR_utimes 299 -#define __NR_fadvise64_64 300 - /* 301 is reserved for vserver */ - /* 302 is reserved for mbind */ - /* 303 is reserved for get_mempolicy */ - /* 304 is reserved for set_mempolicy */ -#define __NR_mq_open 305 -#define __NR_mq_unlink (__NR_mq_open+1) -#define __NR_mq_timedsend (__NR_mq_open+2) -#define __NR_mq_timedreceive (__NR_mq_open+3) -#define __NR_mq_notify (__NR_mq_open+4) -#define __NR_mq_getsetattr (__NR_mq_open+5) - /* 311 is reserved for kexec */ -#define __NR_waitid 312 -#define __NR_add_key 313 -#define __NR_request_key 314 -#define __NR_keyctl 315 -#define __NR_ioprio_set 316 -#define __NR_ioprio_get 317 -#define __NR_inotify_init 318 -#define __NR_inotify_add_watch 319 -#define __NR_inotify_rm_watch 320 - /* 321 is unused */ -#define __NR_migrate_pages 322 -#define __NR_openat 323 -#define __NR_mkdirat 324 -#define __NR_mknodat 325 -#define __NR_fchownat 326 -#define __NR_futimesat 327 -#define __NR_fstatat64 328 -#define __NR_unlinkat 329 -#define __NR_renameat 330 -#define __NR_linkat 331 -#define __NR_symlinkat 332 -#define __NR_readlinkat 333 -#define __NR_fchmodat 334 -#define __NR_faccessat 335 -#define __NR_pselect6 336 -#define __NR_ppoll 337 -#define __NR_unshare 338 -#define __NR_set_robust_list 339 -#define __NR_get_robust_list 340 -#define __NR_splice 341 -#define __NR_sync_file_range 342 -#define __NR_tee 343 -#define __NR_vmsplice 344 -#define __NR_move_pages 345 -#define __NR_getcpu 346 -#define __NR_epoll_pwait 347 -#define __NR_utimensat 348 -#define __NR_signalfd 349 -#define 
__NR_timerfd_create 350 -#define __NR_eventfd 351 -#define __NR_fallocate 352 -#define __NR_timerfd_settime 353 -#define __NR_timerfd_gettime 354 -#define __NR_signalfd4 355 -#define __NR_eventfd2 356 -#define __NR_epoll_create1 357 -#define __NR_dup3 358 -#define __NR_pipe2 359 -#define __NR_inotify_init1 360 -#define __NR_preadv 361 -#define __NR_pwritev 362 -#define __NR_rt_tgsigqueueinfo 363 -#define __NR_perf_event_open 364 -#define __NR_recvmmsg 365 -#define __NR_accept4 366 -#define __NR_fanotify_init 367 -#define __NR_fanotify_mark 368 -#define __NR_prlimit64 369 -#define __NR_name_to_handle_at 370 -#define __NR_open_by_handle_at 371 -#define __NR_clock_adjtime 372 -#define __NR_syncfs 373 -#define __NR_sendmmsg 374 -#define __NR_setns 375 -#define __NR_process_vm_readv 376 -#define __NR_process_vm_writev 377 -#define __NR_kcmp 378 -#define __NR_finit_module 379 -#define __NR_sched_getattr 380 -#define __NR_sched_setattr 381 -#define __NR_renameat2 382 -#define __NR_seccomp 383 -#define __NR_getrandom 384 -#define __NR_memfd_create 385 -#define __NR_bpf 386 -#define __NR_execveat 387 -#define __NR_userfaultfd 388 -#define __NR_membarrier 389 -#define __NR_mlock2 390 -#define __NR_copy_file_range 391 -#define __NR_preadv2 392 -#define __NR_pwritev2 393 - -#ifdef __KERNEL__ -#define __NR_syscalls 394 -#endif - -#endif /* __ASM_SH_UNISTD_64_H */ diff --git a/arch/sh/kernel/Makefile b/arch/sh/kernel/Makefile index 59673f8a3379..b0f5574b6228 100644 --- a/arch/sh/kernel/Makefile +++ b/arch/sh/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the Linux/SuperH kernel. 
# -extra-y := head_$(BITS).o vmlinux.lds +extra-y := head_32.o vmlinux.lds ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities @@ -13,26 +13,26 @@ endif CFLAGS_REMOVE_return_address.o = -pg obj-y := debugtraps.o dumpstack.o \ - idle.o io.o irq.o irq_$(BITS).o kdebugfs.o \ + idle.o io.o irq.o irq_32.o kdebugfs.o \ machvec.o nmi_debug.o process.o \ - process_$(BITS).o ptrace.o ptrace_$(BITS).o \ + process_32.o ptrace.o ptrace_32.o \ reboot.o return_address.o \ - setup.o signal_$(BITS).o sys_sh.o \ - syscalls_$(BITS).o time.o topology.o traps.o \ - traps_$(BITS).o unwinder.o + setup.o signal_32.o sys_sh.o \ + syscalls_32.o time.o topology.o traps.o \ + traps_32.o unwinder.o ifndef CONFIG_GENERIC_IOMAP obj-y += iomap.o obj-$(CONFIG_HAS_IOPORT_MAP) += ioport.o endif -obj-$(CONFIG_SUPERH32) += sys_sh32.o +obj-y += sys_sh32.o obj-y += cpu/ obj-$(CONFIG_VSYSCALL) += vsyscall/ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SH_STANDARD_BIOS) += sh_bios.o obj-$(CONFIG_KGDB) += kgdb.o -obj-$(CONFIG_MODULES) += sh_ksyms_$(BITS).o module.o +obj-$(CONFIG_MODULES) += sh_ksyms_32.o module.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/arch/sh/kernel/cpu/Makefile b/arch/sh/kernel/cpu/Makefile index f7c22ea98b0f..46118236bf04 100644 --- a/arch/sh/kernel/cpu/Makefile +++ b/arch/sh/kernel/cpu/Makefile @@ -7,7 +7,6 @@ obj-$(CONFIG_CPU_SH2) = sh2/ obj-$(CONFIG_CPU_SH2A) = sh2a/ obj-$(CONFIG_CPU_SH3) = sh3/ obj-$(CONFIG_CPU_SH4) = sh4/ -obj-$(CONFIG_CPU_SH5) = sh5/ # Special cases for family ancestry. 
diff --git a/arch/sh/kernel/cpu/init.c b/arch/sh/kernel/cpu/init.c index ce7291e12a30..1d008745877f 100644 --- a/arch/sh/kernel/cpu/init.c +++ b/arch/sh/kernel/cpu/init.c @@ -103,7 +103,7 @@ void __attribute__ ((weak)) l2_cache_init(void) /* * Generic first-level cache init */ -#if defined(CONFIG_SUPERH32) && !defined(CONFIG_CPU_J2) +#if !defined(CONFIG_CPU_J2) static void cache_init(void) { unsigned long ccr, flags; diff --git a/arch/sh/kernel/cpu/irq/Makefile b/arch/sh/kernel/cpu/irq/Makefile index 8b91cb96411b..e4578cde46ba 100644 --- a/arch/sh/kernel/cpu/irq/Makefile +++ b/arch/sh/kernel/cpu/irq/Makefile @@ -2,6 +2,5 @@ # # Makefile for the Linux/SuperH CPU-specific IRQ handlers. # -obj-$(CONFIG_SUPERH32) += imask.o -obj-$(CONFIG_CPU_SH5) += intc-sh5.o +obj-y += imask.o obj-$(CONFIG_CPU_HAS_IPR_IRQ) += ipr.o diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c deleted file mode 100644 index 1b3050facda8..000000000000 --- a/arch/sh/kernel/cpu/irq/intc-sh5.c +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/sh/kernel/cpu/irq/intc-sh5.c - * - * Interrupt Controller support for SH5 INTC. - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 Paul Mundt - * - * Per-interrupt selective. IRLM=0 (Fixed priority) is not - * supported being useless without a cascaded interrupt - * controller. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Maybe the generic Peripheral block could move to a more - * generic include file. INTC Block will be defined here - * and only here to make INTC self-contained in a single - * file. 
- */ -#define INTC_BLOCK_OFFSET 0x01000000 - -/* Base */ -#define INTC_BASE PHYS_PERIPHERAL_BLOCK + \ - INTC_BLOCK_OFFSET - -/* Address */ -#define INTC_ICR_SET (intc_virt + 0x0) -#define INTC_ICR_CLEAR (intc_virt + 0x8) -#define INTC_INTPRI_0 (intc_virt + 0x10) -#define INTC_INTSRC_0 (intc_virt + 0x50) -#define INTC_INTSRC_1 (intc_virt + 0x58) -#define INTC_INTREQ_0 (intc_virt + 0x60) -#define INTC_INTREQ_1 (intc_virt + 0x68) -#define INTC_INTENB_0 (intc_virt + 0x70) -#define INTC_INTENB_1 (intc_virt + 0x78) -#define INTC_INTDSB_0 (intc_virt + 0x80) -#define INTC_INTDSB_1 (intc_virt + 0x88) - -#define INTC_ICR_IRLM 0x1 -#define INTC_INTPRI_PREGS 8 /* 8 Priority Registers */ -#define INTC_INTPRI_PPREG 8 /* 8 Priorities per Register */ - - -/* - * Mapper between the vector ordinal and the IRQ number - * passed to kernel/device drivers. - */ -int intc_evt_to_irq[(0xE20/0x20)+1] = { - -1, -1, -1, -1, -1, -1, -1, -1, /* 0x000 - 0x0E0 */ - -1, -1, -1, -1, -1, -1, -1, -1, /* 0x100 - 0x1E0 */ - 0, 0, 0, 0, 0, 1, 0, 0, /* 0x200 - 0x2E0 */ - 2, 0, 0, 3, 0, 0, 0, -1, /* 0x300 - 0x3E0 */ - 32, 33, 34, 35, 36, 37, 38, -1, /* 0x400 - 0x4E0 */ - -1, -1, -1, 63, -1, -1, -1, -1, /* 0x500 - 0x5E0 */ - -1, -1, 18, 19, 20, 21, 22, -1, /* 0x600 - 0x6E0 */ - 39, 40, 41, 42, -1, -1, -1, -1, /* 0x700 - 0x7E0 */ - 4, 5, 6, 7, -1, -1, -1, -1, /* 0x800 - 0x8E0 */ - -1, -1, -1, -1, -1, -1, -1, -1, /* 0x900 - 0x9E0 */ - 12, 13, 14, 15, 16, 17, -1, -1, /* 0xA00 - 0xAE0 */ - -1, -1, -1, -1, -1, -1, -1, -1, /* 0xB00 - 0xBE0 */ - -1, -1, -1, -1, -1, -1, -1, -1, /* 0xC00 - 0xCE0 */ - -1, -1, -1, -1, -1, -1, -1, -1, /* 0xD00 - 0xDE0 */ - -1, -1 /* 0xE00 - 0xE20 */ -}; - -static unsigned long intc_virt; -static int irlm; /* IRL mode */ - -static void enable_intc_irq(struct irq_data *data) -{ - unsigned int irq = data->irq; - unsigned long reg; - unsigned long bitmask; - - if ((irq <= IRQ_IRL3) && (irlm == NO_PRIORITY)) - printk("Trying to use straight IRL0-3 with an encoding platform.\n"); - - if 
(irq < 32) { - reg = INTC_INTENB_0; - bitmask = 1 << irq; - } else { - reg = INTC_INTENB_1; - bitmask = 1 << (irq - 32); - } - - __raw_writel(bitmask, reg); -} - -static void disable_intc_irq(struct irq_data *data) -{ - unsigned int irq = data->irq; - unsigned long reg; - unsigned long bitmask; - - if (irq < 32) { - reg = INTC_INTDSB_0; - bitmask = 1 << irq; - } else { - reg = INTC_INTDSB_1; - bitmask = 1 << (irq - 32); - } - - __raw_writel(bitmask, reg); -} - -static struct irq_chip intc_irq_type = { - .name = "INTC", - .irq_enable = enable_intc_irq, - .irq_disable = disable_intc_irq, -}; - -void __init plat_irq_setup(void) -{ - unsigned long long __dummy0, __dummy1=~0x00000000100000f0; - unsigned long reg; - int i; - - intc_virt = (unsigned long)ioremap(INTC_BASE, 1024); - if (!intc_virt) { - panic("Unable to remap INTC\n"); - } - - - /* Set default: per-line enable/disable, priority driven ack/eoi */ - for (i = 0; i < NR_INTC_IRQS; i++) - irq_set_chip_and_handler(i, &intc_irq_type, handle_level_irq); - - - /* Disable all interrupts and set all priorities to 0 to avoid trouble */ - __raw_writel(-1, INTC_INTDSB_0); - __raw_writel(-1, INTC_INTDSB_1); - - for (reg = INTC_INTPRI_0, i = 0; i < INTC_INTPRI_PREGS; i++, reg += 8) - __raw_writel( NO_PRIORITY, reg); - - -#ifdef CONFIG_SH_CAYMAN - { - unsigned long data; - - /* Set IRLM */ - /* If all the priorities are set to 'no priority', then - * assume we are using encoded mode. - */ - irlm = platform_int_priority[IRQ_IRL0] + - platform_int_priority[IRQ_IRL1] + - platform_int_priority[IRQ_IRL2] + - platform_int_priority[IRQ_IRL3]; - if (irlm == NO_PRIORITY) { - /* IRLM = 0 */ - reg = INTC_ICR_CLEAR; - i = IRQ_INTA; - printk("Trying to use encoded IRL0-3. 
IRLs unsupported.\n"); - } else { - /* IRLM = 1 */ - reg = INTC_ICR_SET; - i = IRQ_IRL0; - } - __raw_writel(INTC_ICR_IRLM, reg); - - /* Set interrupt priorities according to platform description */ - for (data = 0, reg = INTC_INTPRI_0; i < NR_INTC_IRQS; i++) { - data |= platform_int_priority[i] << - ((i % INTC_INTPRI_PPREG) * 4); - if ((i % INTC_INTPRI_PPREG) == (INTC_INTPRI_PPREG - 1)) { - /* Upon the 7th, set Priority Register */ - __raw_writel(data, reg); - data = 0; - reg += 8; - } - } - } -#endif - - /* - * And now let interrupts come in. - * sti() is not enough, we need to - * lower priority, too. - */ - __asm__ __volatile__("getcon " __SR ", %0\n\t" - "and %0, %1, %0\n\t" - "putcon %0, " __SR "\n\t" - : "=&r" (__dummy0) - : "r" (__dummy1)); -} diff --git a/arch/sh/kernel/cpu/proc.c b/arch/sh/kernel/cpu/proc.c index 85961b4f9c69..a306bcd6b341 100644 --- a/arch/sh/kernel/cpu/proc.c +++ b/arch/sh/kernel/cpu/proc.c @@ -24,7 +24,6 @@ static const char *cpu_name[] = { [CPU_SH7343] = "SH7343", [CPU_SH7785] = "SH7785", [CPU_SH7786] = "SH7786", [CPU_SH7757] = "SH7757", [CPU_SH7722] = "SH7722", [CPU_SHX3] = "SH-X3", - [CPU_SH5_101] = "SH5-101", [CPU_SH5_103] = "SH5-103", [CPU_MXG] = "MX-G", [CPU_SH7723] = "SH7723", [CPU_SH7366] = "SH7366", [CPU_SH7724] = "SH7724", [CPU_SH7372] = "SH7372", [CPU_SH7734] = "SH7734", diff --git a/arch/sh/kernel/cpu/sh5/Makefile b/arch/sh/kernel/cpu/sh5/Makefile deleted file mode 100644 index 97d23ec3005f..000000000000 --- a/arch/sh/kernel/cpu/sh5/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for the Linux/SuperH SH-5 backends. 
-# -obj-y := entry.o probe.o switchto.o - -obj-$(CONFIG_SH_FPU) += fpu.o -obj-$(CONFIG_KALLSYMS) += unwind.o - -# CPU subtype setup -obj-$(CONFIG_CPU_SH5) += setup-sh5.o - -# Primary on-chip clocks (common) -clock-$(CONFIG_CPU_SH5) := clock-sh5.o - -obj-y += $(clock-y) diff --git a/arch/sh/kernel/cpu/sh5/clock-sh5.c b/arch/sh/kernel/cpu/sh5/clock-sh5.c deleted file mode 100644 index dee6be2c2344..000000000000 --- a/arch/sh/kernel/cpu/sh5/clock-sh5.c +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/sh/kernel/cpu/sh5/clock-sh5.c - * - * SH-5 support for the clock framework - * - * Copyright (C) 2008 Paul Mundt - */ -#include -#include -#include -#include - -static int ifc_table[] = { 2, 4, 6, 8, 10, 12, 16, 24 }; - -/* Clock, Power and Reset Controller */ -#define CPRC_BLOCK_OFF 0x01010000 -#define CPRC_BASE (PHYS_PERIPHERAL_BLOCK + CPRC_BLOCK_OFF) - -static unsigned long cprc_base; - -static void master_clk_init(struct clk *clk) -{ - int idx = (__raw_readl(cprc_base + 0x00) >> 6) & 0x0007; - clk->rate *= ifc_table[idx]; -} - -static struct sh_clk_ops sh5_master_clk_ops = { - .init = master_clk_init, -}; - -static unsigned long module_clk_recalc(struct clk *clk) -{ - int idx = (__raw_readw(cprc_base) >> 12) & 0x0007; - return clk->parent->rate / ifc_table[idx]; -} - -static struct sh_clk_ops sh5_module_clk_ops = { - .recalc = module_clk_recalc, -}; - -static unsigned long bus_clk_recalc(struct clk *clk) -{ - int idx = (__raw_readw(cprc_base) >> 3) & 0x0007; - return clk->parent->rate / ifc_table[idx]; -} - -static struct sh_clk_ops sh5_bus_clk_ops = { - .recalc = bus_clk_recalc, -}; - -static unsigned long cpu_clk_recalc(struct clk *clk) -{ - int idx = (__raw_readw(cprc_base) & 0x0007); - return clk->parent->rate / ifc_table[idx]; -} - -static struct sh_clk_ops sh5_cpu_clk_ops = { - .recalc = cpu_clk_recalc, -}; - -static struct sh_clk_ops *sh5_clk_ops[] = { - &sh5_master_clk_ops, - &sh5_module_clk_ops, - &sh5_bus_clk_ops, - 
&sh5_cpu_clk_ops, -}; - -void __init arch_init_clk_ops(struct sh_clk_ops **ops, int idx) -{ - cprc_base = (unsigned long)ioremap(CPRC_BASE, 1024); - BUG_ON(!cprc_base); - - if (idx < ARRAY_SIZE(sh5_clk_ops)) - *ops = sh5_clk_ops[idx]; -} diff --git a/arch/sh/kernel/cpu/sh5/entry.S b/arch/sh/kernel/cpu/sh5/entry.S deleted file mode 100644 index 81c8b64b977f..000000000000 --- a/arch/sh/kernel/cpu/sh5/entry.S +++ /dev/null @@ -1,2000 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * arch/sh/kernel/cpu/sh5/entry.S - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2004 - 2008 Paul Mundt - * Copyright (C) 2003, 2004 Richard Curnow - */ -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * SR fields. - */ -#define SR_ASID_MASK 0x00ff0000 -#define SR_FD_MASK 0x00008000 -#define SR_SS 0x08000000 -#define SR_BL 0x10000000 -#define SR_MD 0x40000000 - -/* - * Event code. - */ -#define EVENT_INTERRUPT 0 -#define EVENT_FAULT_TLB 1 -#define EVENT_FAULT_NOT_TLB 2 -#define EVENT_DEBUG 3 - -/* EXPEVT values */ -#define RESET_CAUSE 0x20 -#define DEBUGSS_CAUSE 0x980 - -/* - * Frame layout. Quad index. 
- */ -#define FRAME_T(x) FRAME_TBASE+(x*8) -#define FRAME_R(x) FRAME_RBASE+(x*8) -#define FRAME_S(x) FRAME_SBASE+(x*8) -#define FSPC 0 -#define FSSR 1 -#define FSYSCALL_ID 2 - -/* Arrange the save frame to be a multiple of 32 bytes long */ -#define FRAME_SBASE 0 -#define FRAME_RBASE (FRAME_SBASE+(3*8)) /* SYSCALL_ID - SSR - SPC */ -#define FRAME_TBASE (FRAME_RBASE+(63*8)) /* r0 - r62 */ -#define FRAME_PBASE (FRAME_TBASE+(8*8)) /* tr0 -tr7 */ -#define FRAME_SIZE (FRAME_PBASE+(2*8)) /* pad0-pad1 */ - -#define FP_FRAME_SIZE FP_FRAME_BASE+(33*8) /* dr0 - dr31 + fpscr */ -#define FP_FRAME_BASE 0 - -#define SAVED_R2 0*8 -#define SAVED_R3 1*8 -#define SAVED_R4 2*8 -#define SAVED_R5 3*8 -#define SAVED_R18 4*8 -#define SAVED_R6 5*8 -#define SAVED_TR0 6*8 - -/* These are the registers saved in the TLB path that aren't saved in the first - level of the normal one. */ -#define TLB_SAVED_R25 7*8 -#define TLB_SAVED_TR1 8*8 -#define TLB_SAVED_TR2 9*8 -#define TLB_SAVED_TR3 10*8 -#define TLB_SAVED_TR4 11*8 -/* Save R0/R1 : PT-migrating compiler currently dishounours -ffixed-r0 and -ffixed-r1 causing - breakage otherwise. */ -#define TLB_SAVED_R0 12*8 -#define TLB_SAVED_R1 13*8 - -#define CLI() \ - getcon SR, r6; \ - ori r6, 0xf0, r6; \ - putcon r6, SR; - -#define STI() \ - getcon SR, r6; \ - andi r6, ~0xf0, r6; \ - putcon r6, SR; - -#ifdef CONFIG_PREEMPTION -# define preempt_stop() CLI() -#else -# define preempt_stop() -# define resume_kernel restore_all -#endif - - .section .data, "aw" - -#define FAST_TLBMISS_STACK_CACHELINES 4 -#define FAST_TLBMISS_STACK_QUADWORDS (4*FAST_TLBMISS_STACK_CACHELINES) - -/* Register back-up area for all exceptions */ - .balign 32 - /* Allow for 16 quadwords to be pushed by fast tlbmiss handling - * register saves etc. 
*/ - .fill FAST_TLBMISS_STACK_QUADWORDS, 8, 0x0 -/* This is 32 byte aligned by construction */ -/* Register back-up area for all exceptions */ -reg_save_area: - .quad 0 - .quad 0 - .quad 0 - .quad 0 - - .quad 0 - .quad 0 - .quad 0 - .quad 0 - - .quad 0 - .quad 0 - .quad 0 - .quad 0 - - .quad 0 - .quad 0 - -/* Save area for RESVEC exceptions. We cannot use reg_save_area because of - * reentrancy. Note this area may be accessed via physical address. - * Align so this fits a whole single cache line, for ease of purging. - */ - .balign 32,0,32 -resvec_save_area: - .quad 0 - .quad 0 - .quad 0 - .quad 0 - .quad 0 - .balign 32,0,32 - -/* Jump table of 3rd level handlers */ -trap_jtable: - .long do_exception_error /* 0x000 */ - .long do_exception_error /* 0x020 */ -#ifdef CONFIG_MMU - .long tlb_miss_load /* 0x040 */ - .long tlb_miss_store /* 0x060 */ -#else - .long do_exception_error - .long do_exception_error -#endif - ! ARTIFICIAL pseudo-EXPEVT setting - .long do_debug_interrupt /* 0x080 */ -#ifdef CONFIG_MMU - .long tlb_miss_load /* 0x0A0 */ - .long tlb_miss_store /* 0x0C0 */ -#else - .long do_exception_error - .long do_exception_error -#endif - .long do_address_error_load /* 0x0E0 */ - .long do_address_error_store /* 0x100 */ -#ifdef CONFIG_SH_FPU - .long do_fpu_error /* 0x120 */ -#else - .long do_exception_error /* 0x120 */ -#endif - .long do_exception_error /* 0x140 */ - .long system_call /* 0x160 */ - .long do_reserved_inst /* 0x180 */ - .long do_illegal_slot_inst /* 0x1A0 */ - .long do_exception_error /* 0x1C0 - NMI */ - .long do_exception_error /* 0x1E0 */ - .rept 15 - .long do_IRQ /* 0x200 - 0x3C0 */ - .endr - .long do_exception_error /* 0x3E0 */ - .rept 32 - .long do_IRQ /* 0x400 - 0x7E0 */ - .endr - .long fpu_error_or_IRQA /* 0x800 */ - .long fpu_error_or_IRQB /* 0x820 */ - .long do_IRQ /* 0x840 */ - .long do_IRQ /* 0x860 */ - .rept 6 - .long do_exception_error /* 0x880 - 0x920 */ - .endr - .long breakpoint_trap_handler /* 0x940 */ - .long do_exception_error /* 
0x960 */ - .long do_single_step /* 0x980 */ - - .rept 3 - .long do_exception_error /* 0x9A0 - 0x9E0 */ - .endr - .long do_IRQ /* 0xA00 */ - .long do_IRQ /* 0xA20 */ -#ifdef CONFIG_MMU - .long itlb_miss_or_IRQ /* 0xA40 */ -#else - .long do_IRQ -#endif - .long do_IRQ /* 0xA60 */ - .long do_IRQ /* 0xA80 */ -#ifdef CONFIG_MMU - .long itlb_miss_or_IRQ /* 0xAA0 */ -#else - .long do_IRQ -#endif - .long do_exception_error /* 0xAC0 */ - .long do_address_error_exec /* 0xAE0 */ - .rept 8 - .long do_exception_error /* 0xB00 - 0xBE0 */ - .endr - .rept 18 - .long do_IRQ /* 0xC00 - 0xE20 */ - .endr - - .section .text64, "ax" - -/* - * --- Exception/Interrupt/Event Handling Section - */ - -/* - * VBR and RESVEC blocks. - * - * First level handler for VBR-based exceptions. - * - * To avoid waste of space, align to the maximum text block size. - * This is assumed to be at most 128 bytes or 32 instructions. - * DO NOT EXCEED 32 instructions on the first level handlers ! - * - * Also note that RESVEC is contained within the VBR block - * where the room left (1KB - TEXT_SIZE) allows placing - * the RESVEC block (at most 512B + TEXT_SIZE). - * - * So first (and only) level handler for RESVEC-based exceptions. - * - * Where the fault/interrupt is handled (not_a_tlb_miss, tlb_miss - * and interrupt) we are a lot tight with register space until - * saving onto the stack frame, which is done in handle_exception(). - * - */ - -#define TEXT_SIZE 128 -#define BLOCK_SIZE 1664 /* Dynamic check, 13*128 */ - - .balign TEXT_SIZE -LVBR_block: - .space 256, 0 /* Power-on class handler, */ - /* not required here */ -not_a_tlb_miss: - synco /* TAKum03020 (but probably a good idea anyway.) 
*/ - /* Save original stack pointer into KCR1 */ - putcon SP, KCR1 - - /* Save other original registers into reg_save_area */ - movi reg_save_area, SP - st.q SP, SAVED_R2, r2 - st.q SP, SAVED_R3, r3 - st.q SP, SAVED_R4, r4 - st.q SP, SAVED_R5, r5 - st.q SP, SAVED_R6, r6 - st.q SP, SAVED_R18, r18 - gettr tr0, r3 - st.q SP, SAVED_TR0, r3 - - /* Set args for Non-debug, Not a TLB miss class handler */ - getcon EXPEVT, r2 - movi ret_from_exception, r3 - ori r3, 1, r3 - movi EVENT_FAULT_NOT_TLB, r4 - or SP, ZERO, r5 - getcon KCR1, SP - pta handle_exception, tr0 - blink tr0, ZERO - - .balign 256 - ! VBR+0x200 - nop - .balign 256 - ! VBR+0x300 - nop - .balign 256 - /* - * Instead of the natural .balign 1024 place RESVEC here - * respecting the final 1KB alignment. - */ - .balign TEXT_SIZE - /* - * Instead of '.space 1024-TEXT_SIZE' place the RESVEC - * block making sure the final alignment is correct. - */ -#ifdef CONFIG_MMU -tlb_miss: - synco /* TAKum03020 (but probably a good idea anyway.) */ - putcon SP, KCR1 - movi reg_save_area, SP - /* SP is guaranteed 32-byte aligned. 
*/ - st.q SP, TLB_SAVED_R0 , r0 - st.q SP, TLB_SAVED_R1 , r1 - st.q SP, SAVED_R2 , r2 - st.q SP, SAVED_R3 , r3 - st.q SP, SAVED_R4 , r4 - st.q SP, SAVED_R5 , r5 - st.q SP, SAVED_R6 , r6 - st.q SP, SAVED_R18, r18 - - /* Save R25 for safety; as/ld may want to use it to achieve the call to - * the code in mm/tlbmiss.c */ - st.q SP, TLB_SAVED_R25, r25 - gettr tr0, r2 - gettr tr1, r3 - gettr tr2, r4 - gettr tr3, r5 - gettr tr4, r18 - st.q SP, SAVED_TR0 , r2 - st.q SP, TLB_SAVED_TR1 , r3 - st.q SP, TLB_SAVED_TR2 , r4 - st.q SP, TLB_SAVED_TR3 , r5 - st.q SP, TLB_SAVED_TR4 , r18 - - pt do_fast_page_fault, tr0 - getcon SSR, r2 - getcon EXPEVT, r3 - getcon TEA, r4 - shlri r2, 30, r2 - andi r2, 1, r2 /* r2 = SSR.MD */ - blink tr0, LINK - - pt fixup_to_invoke_general_handler, tr1 - - /* If the fast path handler fixed the fault, just drop through quickly - to the restore code right away to return to the excepting context. - */ - bnei/u r2, 0, tr1 - -fast_tlb_miss_restore: - ld.q SP, SAVED_TR0, r2 - ld.q SP, TLB_SAVED_TR1, r3 - ld.q SP, TLB_SAVED_TR2, r4 - - ld.q SP, TLB_SAVED_TR3, r5 - ld.q SP, TLB_SAVED_TR4, r18 - - ptabs r2, tr0 - ptabs r3, tr1 - ptabs r4, tr2 - ptabs r5, tr3 - ptabs r18, tr4 - - ld.q SP, TLB_SAVED_R0, r0 - ld.q SP, TLB_SAVED_R1, r1 - ld.q SP, SAVED_R2, r2 - ld.q SP, SAVED_R3, r3 - ld.q SP, SAVED_R4, r4 - ld.q SP, SAVED_R5, r5 - ld.q SP, SAVED_R6, r6 - ld.q SP, SAVED_R18, r18 - ld.q SP, TLB_SAVED_R25, r25 - - getcon KCR1, SP - rte - nop /* for safety, in case the code is run on sh5-101 cut1.x */ - -fixup_to_invoke_general_handler: - - /* OK, new method. Restore stuff that's not expected to get saved into - the 'first-level' reg save area, then just fall through to setting - up the registers and calling the second-level handler. */ - - /* 2nd level expects r2,3,4,5,6,18,tr0 to be saved. So we must restore - r25,tr1-4 and save r6 to get into the right state. 
*/ - - ld.q SP, TLB_SAVED_TR1, r3 - ld.q SP, TLB_SAVED_TR2, r4 - ld.q SP, TLB_SAVED_TR3, r5 - ld.q SP, TLB_SAVED_TR4, r18 - ld.q SP, TLB_SAVED_R25, r25 - - ld.q SP, TLB_SAVED_R0, r0 - ld.q SP, TLB_SAVED_R1, r1 - - ptabs/u r3, tr1 - ptabs/u r4, tr2 - ptabs/u r5, tr3 - ptabs/u r18, tr4 - - /* Set args for Non-debug, TLB miss class handler */ - getcon EXPEVT, r2 - movi ret_from_exception, r3 - ori r3, 1, r3 - movi EVENT_FAULT_TLB, r4 - or SP, ZERO, r5 - getcon KCR1, SP - pta handle_exception, tr0 - blink tr0, ZERO -#else /* CONFIG_MMU */ - .balign 256 -#endif - -/* NB TAKE GREAT CARE HERE TO ENSURE THAT THE INTERRUPT CODE - DOES END UP AT VBR+0x600 */ - nop - nop - nop - nop - nop - nop - - .balign 256 - /* VBR + 0x600 */ - -interrupt: - synco /* TAKum03020 (but probably a good idea anyway.) */ - /* Save original stack pointer into KCR1 */ - putcon SP, KCR1 - - /* Save other original registers into reg_save_area */ - movi reg_save_area, SP - st.q SP, SAVED_R2, r2 - st.q SP, SAVED_R3, r3 - st.q SP, SAVED_R4, r4 - st.q SP, SAVED_R5, r5 - st.q SP, SAVED_R6, r6 - st.q SP, SAVED_R18, r18 - gettr tr0, r3 - st.q SP, SAVED_TR0, r3 - - /* Set args for interrupt class handler */ - getcon INTEVT, r2 - movi ret_from_irq, r3 - ori r3, 1, r3 - movi EVENT_INTERRUPT, r4 - or SP, ZERO, r5 - getcon KCR1, SP - pta handle_exception, tr0 - blink tr0, ZERO - .balign TEXT_SIZE /* let's waste the bare minimum */ - -LVBR_block_end: /* Marker. Used for total checking */ - - .balign 256 -LRESVEC_block: - /* Panic handler. Called with MMU off. Possible causes/actions: - * - Reset: Jump to program start. - * - Single Step: Turn off Single Step & return. - * - Others: Call panic handler, passing PC as arg. - * (this may need to be extended...) - */ -reset_or_panic: - synco /* TAKum03020 (but probably a good idea anyway.) 
*/ - putcon SP, DCR - /* First save r0-1 and tr0, as we need to use these */ - movi resvec_save_area-CONFIG_PAGE_OFFSET, SP - st.q SP, 0, r0 - st.q SP, 8, r1 - gettr tr0, r0 - st.q SP, 32, r0 - - /* Check cause */ - getcon EXPEVT, r0 - movi RESET_CAUSE, r1 - sub r1, r0, r1 /* r1=0 if reset */ - movi _stext-CONFIG_PAGE_OFFSET, r0 - ori r0, 1, r0 - ptabs r0, tr0 - beqi r1, 0, tr0 /* Jump to start address if reset */ - - getcon EXPEVT, r0 - movi DEBUGSS_CAUSE, r1 - sub r1, r0, r1 /* r1=0 if single step */ - pta single_step_panic, tr0 - beqi r1, 0, tr0 /* jump if single step */ - - /* Now jump to where we save the registers. */ - movi panic_stash_regs-CONFIG_PAGE_OFFSET, r1 - ptabs r1, tr0 - blink tr0, r63 - -single_step_panic: - /* We are in a handler with Single Step set. We need to resume the - * handler, by turning on MMU & turning off Single Step. */ - getcon SSR, r0 - movi SR_MMU, r1 - or r0, r1, r0 - movi ~SR_SS, r1 - and r0, r1, r0 - putcon r0, SSR - /* Restore EXPEVT, as the rte won't do this */ - getcon PEXPEVT, r0 - putcon r0, EXPEVT - /* Restore regs */ - ld.q SP, 32, r0 - ptabs r0, tr0 - ld.q SP, 0, r0 - ld.q SP, 8, r1 - getcon DCR, SP - synco - rte - - - .balign 256 -debug_exception: - synco /* TAKum03020 (but probably a good idea anyway.) */ - /* - * Single step/software_break_point first level handler. - * Called with MMU off, so the first thing we do is enable it - * by doing an rte with appropriate SSR. - */ - putcon SP, DCR - /* Save SSR & SPC, together with R0 & R1, as we need to use 2 regs. */ - movi resvec_save_area-CONFIG_PAGE_OFFSET, SP - - /* With the MMU off, we are bypassing the cache, so purge any - * data that will be made stale by the following stores. 
- */ - ocbp SP, 0 - synco - - st.q SP, 0, r0 - st.q SP, 8, r1 - getcon SPC, r0 - st.q SP, 16, r0 - getcon SSR, r0 - st.q SP, 24, r0 - - /* Enable MMU, block exceptions, set priv mode, disable single step */ - movi SR_MMU | SR_BL | SR_MD, r1 - or r0, r1, r0 - movi ~SR_SS, r1 - and r0, r1, r0 - putcon r0, SSR - /* Force control to debug_exception_2 when rte is executed */ - movi debug_exeception_2, r0 - ori r0, 1, r0 /* force SHmedia, just in case */ - putcon r0, SPC - getcon DCR, SP - synco - rte -debug_exeception_2: - /* Restore saved regs */ - putcon SP, KCR1 - movi resvec_save_area, SP - ld.q SP, 24, r0 - putcon r0, SSR - ld.q SP, 16, r0 - putcon r0, SPC - ld.q SP, 0, r0 - ld.q SP, 8, r1 - - /* Save other original registers into reg_save_area */ - movi reg_save_area, SP - st.q SP, SAVED_R2, r2 - st.q SP, SAVED_R3, r3 - st.q SP, SAVED_R4, r4 - st.q SP, SAVED_R5, r5 - st.q SP, SAVED_R6, r6 - st.q SP, SAVED_R18, r18 - gettr tr0, r3 - st.q SP, SAVED_TR0, r3 - - /* Set args for debug class handler */ - getcon EXPEVT, r2 - movi ret_from_exception, r3 - ori r3, 1, r3 - movi EVENT_DEBUG, r4 - or SP, ZERO, r5 - getcon KCR1, SP - pta handle_exception, tr0 - blink tr0, ZERO - - .balign 256 -debug_interrupt: - /* !!! WE COME HERE IN REAL MODE !!! */ - /* Hook-up debug interrupt to allow various debugging options to be - * hooked into its handler. */ - /* Save original stack pointer into KCR1 */ - synco - putcon SP, KCR1 - movi resvec_save_area-CONFIG_PAGE_OFFSET, SP - ocbp SP, 0 - ocbp SP, 32 - synco - - /* Save other original registers into reg_save_area thru real addresses */ - st.q SP, SAVED_R2, r2 - st.q SP, SAVED_R3, r3 - st.q SP, SAVED_R4, r4 - st.q SP, SAVED_R5, r5 - st.q SP, SAVED_R6, r6 - st.q SP, SAVED_R18, r18 - gettr tr0, r3 - st.q SP, SAVED_TR0, r3 - - /* move (spc,ssr)->(pspc,pssr). The rte will shift - them back again, so that they look like the originals - as far as the real handler code is concerned. 
*/ - getcon spc, r6 - putcon r6, pspc - getcon ssr, r6 - putcon r6, pssr - - ! construct useful SR for handle_exception - movi 3, r6 - shlli r6, 30, r6 - getcon sr, r18 - or r18, r6, r6 - putcon r6, ssr - - ! SSR is now the current SR with the MD and MMU bits set - ! i.e. the rte will switch back to priv mode and put - ! the mmu back on - - ! construct spc - movi handle_exception, r18 - ori r18, 1, r18 ! for safety (do we need this?) - putcon r18, spc - - /* Set args for Non-debug, Not a TLB miss class handler */ - - ! EXPEVT==0x80 is unused, so 'steal' this value to put the - ! debug interrupt handler in the vectoring table - movi 0x80, r2 - movi ret_from_exception, r3 - ori r3, 1, r3 - movi EVENT_FAULT_NOT_TLB, r4 - - or SP, ZERO, r5 - movi CONFIG_PAGE_OFFSET, r6 - add r6, r5, r5 - getcon KCR1, SP - - synco ! for safety - rte ! -> handle_exception, switch back to priv mode again - -LRESVEC_block_end: /* Marker. Unused. */ - - .balign TEXT_SIZE - -/* - * Second level handler for VBR-based exceptions. Pre-handler. - * In common to all stack-frame sensitive handlers. - * - * Inputs: - * (KCR0) Current [current task union] - * (KCR1) Original SP - * (r2) INTEVT/EXPEVT - * (r3) appropriate return address - * (r4) Event (0 = interrupt, 1 = TLB miss fault, 2 = Not TLB miss fault, 3=debug) - * (r5) Pointer to reg_save_area - * (SP) Original SP - * - * Available registers: - * (r6) - * (r18) - * (tr0) - * - */ -handle_exception: - /* Common 2nd level handler. */ - - /* First thing we need an appropriate stack pointer */ - getcon SSR, r6 - shlri r6, 30, r6 - andi r6, 1, r6 - pta stack_ok, tr0 - bne r6, ZERO, tr0 /* Original stack pointer is fine */ - - /* Set stack pointer for user fault */ - getcon KCR0, SP - movi THREAD_SIZE, r6 /* Point to the end */ - add SP, r6, SP - -stack_ok: - -/* DEBUG : check for underflow/overflow of the kernel stack */ - pta no_underflow, tr0 - getcon KCR0, r6 - movi 1024, r18 - add r6, r18, r6 - bge SP, r6, tr0 ! ? 
below 1k from bottom of stack : danger zone - -/* Just panic to cause a crash. */ -bad_sp: - ld.b r63, 0, r6 - nop - -no_underflow: - pta bad_sp, tr0 - getcon kcr0, r6 - movi THREAD_SIZE, r18 - add r18, r6, r6 - bgt SP, r6, tr0 ! sp above the stack - - /* Make some room for the BASIC frame. */ - movi -(FRAME_SIZE), r6 - add SP, r6, SP - -/* Could do this with no stalling if we had another spare register, but the - code below will be OK. */ - ld.q r5, SAVED_R2, r6 - ld.q r5, SAVED_R3, r18 - st.q SP, FRAME_R(2), r6 - ld.q r5, SAVED_R4, r6 - st.q SP, FRAME_R(3), r18 - ld.q r5, SAVED_R5, r18 - st.q SP, FRAME_R(4), r6 - ld.q r5, SAVED_R6, r6 - st.q SP, FRAME_R(5), r18 - ld.q r5, SAVED_R18, r18 - st.q SP, FRAME_R(6), r6 - ld.q r5, SAVED_TR0, r6 - st.q SP, FRAME_R(18), r18 - st.q SP, FRAME_T(0), r6 - - /* Keep old SP around */ - getcon KCR1, r6 - - /* Save the rest of the general purpose registers */ - st.q SP, FRAME_R(0), r0 - st.q SP, FRAME_R(1), r1 - st.q SP, FRAME_R(7), r7 - st.q SP, FRAME_R(8), r8 - st.q SP, FRAME_R(9), r9 - st.q SP, FRAME_R(10), r10 - st.q SP, FRAME_R(11), r11 - st.q SP, FRAME_R(12), r12 - st.q SP, FRAME_R(13), r13 - st.q SP, FRAME_R(14), r14 - - /* SP is somewhere else */ - st.q SP, FRAME_R(15), r6 - - st.q SP, FRAME_R(16), r16 - st.q SP, FRAME_R(17), r17 - /* r18 is saved earlier. 
*/ - st.q SP, FRAME_R(19), r19 - st.q SP, FRAME_R(20), r20 - st.q SP, FRAME_R(21), r21 - st.q SP, FRAME_R(22), r22 - st.q SP, FRAME_R(23), r23 - st.q SP, FRAME_R(24), r24 - st.q SP, FRAME_R(25), r25 - st.q SP, FRAME_R(26), r26 - st.q SP, FRAME_R(27), r27 - st.q SP, FRAME_R(28), r28 - st.q SP, FRAME_R(29), r29 - st.q SP, FRAME_R(30), r30 - st.q SP, FRAME_R(31), r31 - st.q SP, FRAME_R(32), r32 - st.q SP, FRAME_R(33), r33 - st.q SP, FRAME_R(34), r34 - st.q SP, FRAME_R(35), r35 - st.q SP, FRAME_R(36), r36 - st.q SP, FRAME_R(37), r37 - st.q SP, FRAME_R(38), r38 - st.q SP, FRAME_R(39), r39 - st.q SP, FRAME_R(40), r40 - st.q SP, FRAME_R(41), r41 - st.q SP, FRAME_R(42), r42 - st.q SP, FRAME_R(43), r43 - st.q SP, FRAME_R(44), r44 - st.q SP, FRAME_R(45), r45 - st.q SP, FRAME_R(46), r46 - st.q SP, FRAME_R(47), r47 - st.q SP, FRAME_R(48), r48 - st.q SP, FRAME_R(49), r49 - st.q SP, FRAME_R(50), r50 - st.q SP, FRAME_R(51), r51 - st.q SP, FRAME_R(52), r52 - st.q SP, FRAME_R(53), r53 - st.q SP, FRAME_R(54), r54 - st.q SP, FRAME_R(55), r55 - st.q SP, FRAME_R(56), r56 - st.q SP, FRAME_R(57), r57 - st.q SP, FRAME_R(58), r58 - st.q SP, FRAME_R(59), r59 - st.q SP, FRAME_R(60), r60 - st.q SP, FRAME_R(61), r61 - st.q SP, FRAME_R(62), r62 - - /* - * Save the S* registers. - */ - getcon SSR, r61 - st.q SP, FRAME_S(FSSR), r61 - getcon SPC, r62 - st.q SP, FRAME_S(FSPC), r62 - movi -1, r62 /* Reset syscall_nr */ - st.q SP, FRAME_S(FSYSCALL_ID), r62 - - /* Save the rest of the target registers */ - gettr tr1, r6 - st.q SP, FRAME_T(1), r6 - gettr tr2, r6 - st.q SP, FRAME_T(2), r6 - gettr tr3, r6 - st.q SP, FRAME_T(3), r6 - gettr tr4, r6 - st.q SP, FRAME_T(4), r6 - gettr tr5, r6 - st.q SP, FRAME_T(5), r6 - gettr tr6, r6 - st.q SP, FRAME_T(6), r6 - gettr tr7, r6 - st.q SP, FRAME_T(7), r6 - - ! setup FP so that unwinder can wind back through nested kernel mode - ! 
exceptions - add SP, ZERO, r14 - - /* For syscall and debug race condition, get TRA now */ - getcon TRA, r5 - - /* We are in a safe position to turn SR.BL off, but set IMASK=0xf - * Also set FD, to catch FPU usage in the kernel. - * - * benedict.gaster@superh.com 29/07/2002 - * - * On all SH5-101 revisions it is unsafe to raise the IMASK and at the - * same time change BL from 1->0, as any pending interrupt of a level - * higher than he previous value of IMASK will leak through and be - * taken unexpectedly. - * - * To avoid this we raise the IMASK and then issue another PUTCON to - * enable interrupts. - */ - getcon SR, r6 - movi SR_IMASK | SR_FD, r7 - or r6, r7, r6 - putcon r6, SR - movi SR_UNBLOCK_EXC, r7 - and r6, r7, r6 - putcon r6, SR - - - /* Now call the appropriate 3rd level handler */ - or r3, ZERO, LINK - movi trap_jtable, r3 - shlri r2, 3, r2 - ldx.l r2, r3, r3 - shlri r2, 2, r2 - ptabs r3, tr0 - or SP, ZERO, r3 - blink tr0, ZERO - -/* - * Second level handler for VBR-based exceptions. Post-handlers. - * - * Post-handlers for interrupts (ret_from_irq), exceptions - * (ret_from_exception) and common reentrance doors (restore_all - * to get back to the original context, ret_from_syscall loop to - * check kernel exiting). - * - * ret_with_reschedule and work_notifysig are an inner lables of - * the ret_from_syscall loop. - * - * In common to all stack-frame sensitive handlers. 
- * - * Inputs: - * (SP) struct pt_regs *, original register's frame pointer (basic) - * - */ - .global ret_from_irq -ret_from_irq: - ld.q SP, FRAME_S(FSSR), r6 - shlri r6, 30, r6 - andi r6, 1, r6 - pta resume_kernel, tr0 - bne r6, ZERO, tr0 /* no further checks */ - STI() - pta ret_with_reschedule, tr0 - blink tr0, ZERO /* Do not check softirqs */ - - .global ret_from_exception -ret_from_exception: - preempt_stop() - - ld.q SP, FRAME_S(FSSR), r6 - shlri r6, 30, r6 - andi r6, 1, r6 - pta resume_kernel, tr0 - bne r6, ZERO, tr0 /* no further checks */ - - /* Check softirqs */ - -#ifdef CONFIG_PREEMPTION - pta ret_from_syscall, tr0 - blink tr0, ZERO - -resume_kernel: - CLI() - - pta restore_all, tr0 - - getcon KCR0, r6 - ld.l r6, TI_PRE_COUNT, r7 - beq/u r7, ZERO, tr0 - -need_resched: - ld.l r6, TI_FLAGS, r7 - movi (1 << TIF_NEED_RESCHED), r8 - and r8, r7, r8 - bne r8, ZERO, tr0 - - getcon SR, r7 - andi r7, 0xf0, r7 - bne r7, ZERO, tr0 - - movi preempt_schedule_irq, r7 - ori r7, 1, r7 - ptabs r7, tr1 - blink tr1, LINK - - pta need_resched, tr1 - blink tr1, ZERO -#endif - - .global ret_from_syscall -ret_from_syscall: - -ret_with_reschedule: - getcon KCR0, r6 ! r6 contains current_thread_info - ld.l r6, TI_FLAGS, r7 ! 
r7 contains current_thread_info->flags - - movi _TIF_NEED_RESCHED, r8 - and r8, r7, r8 - pta work_resched, tr0 - bne r8, ZERO, tr0 - - pta restore_all, tr1 - - movi (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME), r8 - and r8, r7, r8 - pta work_notifysig, tr0 - bne r8, ZERO, tr0 - - blink tr1, ZERO - -work_resched: - pta ret_from_syscall, tr0 - gettr tr0, LINK - movi schedule, r6 - ptabs r6, tr0 - blink tr0, ZERO /* Call schedule(), return on top */ - -work_notifysig: - gettr tr1, LINK - - movi do_notify_resume, r6 - ptabs r6, tr0 - or SP, ZERO, r2 - or r7, ZERO, r3 - blink tr0, LINK /* Call do_notify_resume(regs, current_thread_info->flags), return here */ - -restore_all: - /* Do prefetches */ - - ld.q SP, FRAME_T(0), r6 - ld.q SP, FRAME_T(1), r7 - ld.q SP, FRAME_T(2), r8 - ld.q SP, FRAME_T(3), r9 - ptabs r6, tr0 - ptabs r7, tr1 - ptabs r8, tr2 - ptabs r9, tr3 - ld.q SP, FRAME_T(4), r6 - ld.q SP, FRAME_T(5), r7 - ld.q SP, FRAME_T(6), r8 - ld.q SP, FRAME_T(7), r9 - ptabs r6, tr4 - ptabs r7, tr5 - ptabs r8, tr6 - ptabs r9, tr7 - - ld.q SP, FRAME_R(0), r0 - ld.q SP, FRAME_R(1), r1 - ld.q SP, FRAME_R(2), r2 - ld.q SP, FRAME_R(3), r3 - ld.q SP, FRAME_R(4), r4 - ld.q SP, FRAME_R(5), r5 - ld.q SP, FRAME_R(6), r6 - ld.q SP, FRAME_R(7), r7 - ld.q SP, FRAME_R(8), r8 - ld.q SP, FRAME_R(9), r9 - ld.q SP, FRAME_R(10), r10 - ld.q SP, FRAME_R(11), r11 - ld.q SP, FRAME_R(12), r12 - ld.q SP, FRAME_R(13), r13 - ld.q SP, FRAME_R(14), r14 - - ld.q SP, FRAME_R(16), r16 - ld.q SP, FRAME_R(17), r17 - ld.q SP, FRAME_R(18), r18 - ld.q SP, FRAME_R(19), r19 - ld.q SP, FRAME_R(20), r20 - ld.q SP, FRAME_R(21), r21 - ld.q SP, FRAME_R(22), r22 - ld.q SP, FRAME_R(23), r23 - ld.q SP, FRAME_R(24), r24 - ld.q SP, FRAME_R(25), r25 - ld.q SP, FRAME_R(26), r26 - ld.q SP, FRAME_R(27), r27 - ld.q SP, FRAME_R(28), r28 - ld.q SP, FRAME_R(29), r29 - ld.q SP, FRAME_R(30), r30 - ld.q SP, FRAME_R(31), r31 - ld.q SP, FRAME_R(32), r32 - ld.q SP, FRAME_R(33), r33 - ld.q SP, FRAME_R(34), r34 - ld.q SP, FRAME_R(35), r35 - 
ld.q SP, FRAME_R(36), r36 - ld.q SP, FRAME_R(37), r37 - ld.q SP, FRAME_R(38), r38 - ld.q SP, FRAME_R(39), r39 - ld.q SP, FRAME_R(40), r40 - ld.q SP, FRAME_R(41), r41 - ld.q SP, FRAME_R(42), r42 - ld.q SP, FRAME_R(43), r43 - ld.q SP, FRAME_R(44), r44 - ld.q SP, FRAME_R(45), r45 - ld.q SP, FRAME_R(46), r46 - ld.q SP, FRAME_R(47), r47 - ld.q SP, FRAME_R(48), r48 - ld.q SP, FRAME_R(49), r49 - ld.q SP, FRAME_R(50), r50 - ld.q SP, FRAME_R(51), r51 - ld.q SP, FRAME_R(52), r52 - ld.q SP, FRAME_R(53), r53 - ld.q SP, FRAME_R(54), r54 - ld.q SP, FRAME_R(55), r55 - ld.q SP, FRAME_R(56), r56 - ld.q SP, FRAME_R(57), r57 - ld.q SP, FRAME_R(58), r58 - - getcon SR, r59 - movi SR_BLOCK_EXC, r60 - or r59, r60, r59 - putcon r59, SR /* SR.BL = 1, keep nesting out */ - ld.q SP, FRAME_S(FSSR), r61 - ld.q SP, FRAME_S(FSPC), r62 - movi SR_ASID_MASK, r60 - and r59, r60, r59 - andc r61, r60, r61 /* Clear out older ASID */ - or r59, r61, r61 /* Retain current ASID */ - putcon r61, SSR - putcon r62, SPC - - /* Ignore FSYSCALL_ID */ - - ld.q SP, FRAME_R(59), r59 - ld.q SP, FRAME_R(60), r60 - ld.q SP, FRAME_R(61), r61 - ld.q SP, FRAME_R(62), r62 - - /* Last touch */ - ld.q SP, FRAME_R(15), SP - rte - nop - -/* - * Third level handlers for VBR-based exceptions. Adapting args to - * and/or deflecting to fourth level handlers. - * - * Fourth level handlers interface. - * Most are C-coded handlers directly pointed by the trap_jtable. - * (Third = Fourth level) - * Inputs: - * (r2) fault/interrupt code, entry number (e.g. NMI = 14, - * IRL0-3 (0000) = 16, RTLBMISS = 2, SYSCALL = 11, etc ...) - * (r3) struct pt_regs *, original register's frame pointer - * (r4) Event (0 = interrupt, 1 = TLB miss fault, 2 = Not TLB miss fault) - * (r5) TRA control register (for syscall/debug benefit only) - * (LINK) return address - * (SP) = r3 - * - * Kernel TLB fault handlers will get a slightly different interface. 
- * (r2) struct pt_regs *, original register's frame pointer - * (r3) page fault error code (see asm/thread_info.h) - * (r4) Effective Address of fault - * (LINK) return address - * (SP) = r2 - * - * fpu_error_or_IRQ? is a helper to deflect to the right cause. - * - */ -#ifdef CONFIG_MMU -tlb_miss_load: - or SP, ZERO, r2 - or ZERO, ZERO, r3 /* Read */ - getcon TEA, r4 - pta call_do_page_fault, tr0 - beq ZERO, ZERO, tr0 - -tlb_miss_store: - or SP, ZERO, r2 - movi FAULT_CODE_WRITE, r3 /* Write */ - getcon TEA, r4 - pta call_do_page_fault, tr0 - beq ZERO, ZERO, tr0 - -itlb_miss_or_IRQ: - pta its_IRQ, tr0 - beqi/u r4, EVENT_INTERRUPT, tr0 - - /* ITLB miss */ - or SP, ZERO, r2 - movi FAULT_CODE_ITLB, r3 - getcon TEA, r4 - /* Fall through */ - -call_do_page_fault: - movi do_page_fault, r6 - ptabs r6, tr0 - blink tr0, ZERO -#endif /* CONFIG_MMU */ - -fpu_error_or_IRQA: - pta its_IRQ, tr0 - beqi/l r4, EVENT_INTERRUPT, tr0 -#ifdef CONFIG_SH_FPU - movi fpu_state_restore_trap_handler, r6 -#else - movi do_exception_error, r6 -#endif - ptabs r6, tr0 - blink tr0, ZERO - -fpu_error_or_IRQB: - pta its_IRQ, tr0 - beqi/l r4, EVENT_INTERRUPT, tr0 -#ifdef CONFIG_SH_FPU - movi fpu_state_restore_trap_handler, r6 -#else - movi do_exception_error, r6 -#endif - ptabs r6, tr0 - blink tr0, ZERO - -its_IRQ: - movi do_IRQ, r6 - ptabs r6, tr0 - blink tr0, ZERO - -/* - * system_call/unknown_trap third level handler: - * - * Inputs: - * (r2) fault/interrupt code, entry number (TRAP = 11) - * (r3) struct pt_regs *, original register's frame pointer - * (r4) Not used. Event (0=interrupt, 1=TLB miss fault, 2=Not TLB miss fault) - * (r5) TRA Control Reg (0x00xyzzzz: x=1 SYSCALL, y = #args, z=nr) - * (SP) = r3 - * (LINK) return address: ret_from_exception - * (*r3) Syscall parms: SC#, arg0, arg1, ..., arg5 in order (Saved r2/r7) - * - * Outputs: - * (*r3) Syscall reply (Saved r2) - * (LINK) In case of syscall only it can be scrapped. - * Common second level post handler will be ret_from_syscall. 
- * Common (non-trace) exit point to that is syscall_ret (saving - * result to r2). Common bad exit point is syscall_bad (returning - * ENOSYS then saved to r2). - * - */ - -unknown_trap: - /* Unknown Trap or User Trace */ - movi do_unknown_trapa, r6 - ptabs r6, tr0 - ld.q r3, FRAME_R(9), r2 /* r2 = #arg << 16 | syscall # */ - andi r2, 0x1ff, r2 /* r2 = syscall # */ - blink tr0, LINK - - pta syscall_ret, tr0 - blink tr0, ZERO - - /* New syscall implementation*/ -system_call: - pta unknown_trap, tr0 - or r5, ZERO, r4 /* TRA (=r5) -> r4 */ - shlri r4, 20, r4 - bnei r4, 1, tr0 /* unknown_trap if not 0x1yzzzz */ - - /* It's a system call */ - st.q r3, FRAME_S(FSYSCALL_ID), r5 /* ID (0x1yzzzz) -> stack */ - andi r5, 0x1ff, r5 /* syscall # -> r5 */ - - STI() - - pta syscall_allowed, tr0 - movi NR_syscalls - 1, r4 /* Last valid */ - bgeu/l r4, r5, tr0 - -syscall_bad: - /* Return ENOSYS ! */ - movi -(ENOSYS), r2 /* Fall-through */ - - .global syscall_ret -syscall_ret: - st.q SP, FRAME_R(9), r2 /* Expecting SP back to BASIC frame */ - ld.q SP, FRAME_S(FSPC), r2 - addi r2, 4, r2 /* Move PC, being pre-execution event */ - st.q SP, FRAME_S(FSPC), r2 - pta ret_from_syscall, tr0 - blink tr0, ZERO - - -/* A different return path for ret_from_fork, because we now need - * to call schedule_tail with the later kernels. 
Because prev is - * loaded into r2 by switch_to() means we can just call it straight away - */ - -.global ret_from_fork -ret_from_fork: - - movi schedule_tail,r5 - ori r5, 1, r5 - ptabs r5, tr0 - blink tr0, LINK - - ld.q SP, FRAME_S(FSPC), r2 - addi r2, 4, r2 /* Move PC, being pre-execution event */ - st.q SP, FRAME_S(FSPC), r2 - pta ret_from_syscall, tr0 - blink tr0, ZERO - -.global ret_from_kernel_thread -ret_from_kernel_thread: - - movi schedule_tail,r5 - ori r5, 1, r5 - ptabs r5, tr0 - blink tr0, LINK - - ld.q SP, FRAME_R(2), r2 - ld.q SP, FRAME_R(3), r3 - ptabs r3, tr0 - blink tr0, LINK - - ld.q SP, FRAME_S(FSPC), r2 - addi r2, 4, r2 /* Move PC, being pre-execution event */ - st.q SP, FRAME_S(FSPC), r2 - pta ret_from_syscall, tr0 - blink tr0, ZERO - -syscall_allowed: - /* Use LINK to deflect the exit point, default is syscall_ret */ - pta syscall_ret, tr0 - gettr tr0, LINK - pta syscall_notrace, tr0 - - getcon KCR0, r2 - ld.l r2, TI_FLAGS, r4 - movi _TIF_WORK_SYSCALL_MASK, r6 - and r6, r4, r6 - beq/l r6, ZERO, tr0 - - /* Trace it by calling syscall_trace before and after */ - movi do_syscall_trace_enter, r4 - or SP, ZERO, r2 - ptabs r4, tr0 - blink tr0, LINK - - /* Save the retval */ - st.q SP, FRAME_R(2), r2 - - /* Reload syscall number as r5 is trashed by do_syscall_trace_enter */ - ld.q SP, FRAME_S(FSYSCALL_ID), r5 - andi r5, 0x1ff, r5 - - pta syscall_ret_trace, tr0 - gettr tr0, LINK - -syscall_notrace: - /* Now point to the appropriate 4th level syscall handler */ - movi sys_call_table, r4 - shlli r5, 2, r5 - ldx.l r4, r5, r5 - ptabs r5, tr0 - - /* Prepare original args */ - ld.q SP, FRAME_R(2), r2 - ld.q SP, FRAME_R(3), r3 - ld.q SP, FRAME_R(4), r4 - ld.q SP, FRAME_R(5), r5 - ld.q SP, FRAME_R(6), r6 - ld.q SP, FRAME_R(7), r7 - - /* And now the trick for those syscalls requiring regs * ! 
*/ - or SP, ZERO, r8 - - /* Call it */ - blink tr0, ZERO /* LINK is already properly set */ - -syscall_ret_trace: - /* We get back here only if under trace */ - st.q SP, FRAME_R(9), r2 /* Save return value */ - - movi do_syscall_trace_leave, LINK - or SP, ZERO, r2 - ptabs LINK, tr0 - blink tr0, LINK - - /* This needs to be done after any syscall tracing */ - ld.q SP, FRAME_S(FSPC), r2 - addi r2, 4, r2 /* Move PC, being pre-execution event */ - st.q SP, FRAME_S(FSPC), r2 - - pta ret_from_syscall, tr0 - blink tr0, ZERO /* Resume normal return sequence */ - -/* - * --- Switch to running under a particular ASID and return the previous ASID value - * --- The caller is assumed to have done a cli before calling this. - * - * Input r2 : new ASID - * Output r2 : old ASID - */ - - .global switch_and_save_asid -switch_and_save_asid: - getcon sr, r0 - movi 255, r4 - shlli r4, 16, r4 /* r4 = mask to select ASID */ - and r0, r4, r3 /* r3 = shifted old ASID */ - andi r2, 255, r2 /* mask down new ASID */ - shlli r2, 16, r2 /* align new ASID against SR.ASID */ - andc r0, r4, r0 /* efface old ASID from SR */ - or r0, r2, r0 /* insert the new ASID */ - putcon r0, ssr - movi 1f, r0 - putcon r0, spc - rte - nop -1: - ptabs LINK, tr0 - shlri r3, 16, r2 /* r2 = old ASID */ - blink tr0, r63 - - .global route_to_panic_handler -route_to_panic_handler: - /* Switch to real mode, goto panic_handler, don't return. Useful for - last-chance debugging, e.g. if no output wants to go to the console. 
- */ - - movi panic_handler - CONFIG_PAGE_OFFSET, r1 - ptabs r1, tr0 - pta 1f, tr1 - gettr tr1, r0 - putcon r0, spc - getcon sr, r0 - movi 1, r1 - shlli r1, 31, r1 - andc r0, r1, r0 - putcon r0, ssr - rte - nop -1: /* Now in real mode */ - blink tr0, r63 - nop - - .global peek_real_address_q -peek_real_address_q: - /* Two args: - r2 : real mode address to peek - r2(out) : result quadword - - This is provided as a cheapskate way of manipulating device - registers for debugging (to avoid the need to ioremap the debug - module, and to avoid the need to ioremap the watchpoint - controller in a way that identity maps sufficient bits to avoid the - SH5-101 cut2 silicon defect). - - This code is not performance critical - */ - - add.l r2, r63, r2 /* sign extend address */ - getcon sr, r0 /* r0 = saved original SR */ - movi 1, r1 - shlli r1, 28, r1 - or r0, r1, r1 /* r0 with block bit set */ - putcon r1, sr /* now in critical section */ - movi 1, r36 - shlli r36, 31, r36 - andc r1, r36, r1 /* turn sr.mmu off in real mode section */ - - putcon r1, ssr - movi .peek0 - CONFIG_PAGE_OFFSET, r36 /* real mode target address */ - movi 1f, r37 /* virtual mode return addr */ - putcon r36, spc - - synco - rte - nop - -.peek0: /* come here in real mode, don't touch caches!! - still in critical section (sr.bl==1) */ - putcon r0, ssr - putcon r37, spc - /* Here's the actual peek. If the address is bad, all bets are now off - * what will happen (handlers invoked in real-mode = bad news) */ - ld.q r2, 0, r2 - synco - rte /* Back to virtual mode */ - nop - -1: - ptabs LINK, tr0 - blink tr0, r63 - - .global poke_real_address_q -poke_real_address_q: - /* Two args: - r2 : real mode address to poke - r3 : quadword value to write. 
- - This is provided as a cheapskate way of manipulating device - registers for debugging (to avoid the need to ioremap the debug - module, and to avoid the need to ioremap the watchpoint - controller in a way that identity maps sufficient bits to avoid the - SH5-101 cut2 silicon defect). - - This code is not performance critical - */ - - add.l r2, r63, r2 /* sign extend address */ - getcon sr, r0 /* r0 = saved original SR */ - movi 1, r1 - shlli r1, 28, r1 - or r0, r1, r1 /* r0 with block bit set */ - putcon r1, sr /* now in critical section */ - movi 1, r36 - shlli r36, 31, r36 - andc r1, r36, r1 /* turn sr.mmu off in real mode section */ - - putcon r1, ssr - movi .poke0-CONFIG_PAGE_OFFSET, r36 /* real mode target address */ - movi 1f, r37 /* virtual mode return addr */ - putcon r36, spc - - synco - rte - nop - -.poke0: /* come here in real mode, don't touch caches!! - still in critical section (sr.bl==1) */ - putcon r0, ssr - putcon r37, spc - /* Here's the actual poke. If the address is bad, all bets are now off - * what will happen (handlers invoked in real-mode = bad news) */ - st.q r2, 0, r3 - synco - rte /* Back to virtual mode */ - nop - -1: - ptabs LINK, tr0 - blink tr0, r63 - -#ifdef CONFIG_MMU -/* - * --- User Access Handling Section - */ - -/* - * User Access support. It all moved to non inlined Assembler - * functions in here. - * - * __kernel_size_t __copy_user(void *__to, const void *__from, - * __kernel_size_t __n) - * - * Inputs: - * (r2) target address - * (r3) source address - * (r4) size in bytes - * - * Ouputs: - * (*r2) target data - * (r2) non-copied bytes - * - * If a fault occurs on the user pointer, bail out early and return the - * number of bytes not copied in r2. - * Strategy : for large blocks, call a real memcpy function which can - * move >1 byte at a time using unaligned ld/st instructions, and can - * manipulate the cache using prefetch + alloco to improve the speed - * further. 
If a fault occurs in that function, just revert to the - * byte-by-byte approach used for small blocks; this is rare so the - * performance hit for that case does not matter. - * - * For small blocks it's not worth the overhead of setting up and calling - * the memcpy routine; do the copy a byte at a time. - * - */ - .global __copy_user -__copy_user: - pta __copy_user_byte_by_byte, tr1 - movi 16, r0 ! this value is a best guess, should tune it by benchmarking - bge/u r0, r4, tr1 - pta copy_user_memcpy, tr0 - addi SP, -32, SP - /* Save arguments in case we have to fix-up unhandled page fault */ - st.q SP, 0, r2 - st.q SP, 8, r3 - st.q SP, 16, r4 - st.q SP, 24, r35 ! r35 is callee-save - /* Save LINK in a register to reduce RTS time later (otherwise - ld SP,*,LINK;ptabs LINK;trn;blink trn,r63 becomes a critical path) */ - ori LINK, 0, r35 - blink tr0, LINK - - /* Copy completed normally if we get back here */ - ptabs r35, tr0 - ld.q SP, 24, r35 - /* don't restore r2-r4, pointless */ - /* set result=r2 to zero as the copy must have succeeded. */ - or r63, r63, r2 - addi SP, 32, SP - blink tr0, r63 ! RTS - - .global __copy_user_fixup -__copy_user_fixup: - /* Restore stack frame */ - ori r35, 0, LINK - ld.q SP, 24, r35 - ld.q SP, 16, r4 - ld.q SP, 8, r3 - ld.q SP, 0, r2 - addi SP, 32, SP - /* Fall through to original code, in the 'same' state we entered with */ - -/* The slow byte-by-byte method is used if the fast copy traps due to a bad - user address. In that rare case, the speed drop can be tolerated. 
*/ -__copy_user_byte_by_byte: - pta ___copy_user_exit, tr1 - pta ___copy_user1, tr0 - beq/u r4, r63, tr1 /* early exit for zero length copy */ - sub r2, r3, r0 - addi r0, -1, r0 - -___copy_user1: - ld.b r3, 0, r5 /* Fault address 1 */ - - /* Could rewrite this to use just 1 add, but the second comes 'free' - due to load latency */ - addi r3, 1, r3 - addi r4, -1, r4 /* No real fixup required */ -___copy_user2: - stx.b r3, r0, r5 /* Fault address 2 */ - bne r4, ZERO, tr0 - -___copy_user_exit: - or r4, ZERO, r2 - ptabs LINK, tr0 - blink tr0, ZERO - -/* - * __kernel_size_t __clear_user(void *addr, __kernel_size_t size) - * - * Inputs: - * (r2) target address - * (r3) size in bytes - * - * Ouputs: - * (*r2) zero-ed target data - * (r2) non-zero-ed bytes - */ - .global __clear_user -__clear_user: - pta ___clear_user_exit, tr1 - pta ___clear_user1, tr0 - beq/u r3, r63, tr1 - -___clear_user1: - st.b r2, 0, ZERO /* Fault address */ - addi r2, 1, r2 - addi r3, -1, r3 /* No real fixup required */ - bne r3, ZERO, tr0 - -___clear_user_exit: - or r3, ZERO, r2 - ptabs LINK, tr0 - blink tr0, ZERO - -#endif /* CONFIG_MMU */ - -/* - * extern long __get_user_asm_?(void *val, long addr) - * - * Inputs: - * (r2) dest address - * (r3) source address (in User Space) - * - * Ouputs: - * (r2) -EFAULT (faulting) - * 0 (not faulting) - */ - .global __get_user_asm_b -__get_user_asm_b: - or r2, ZERO, r4 - movi -(EFAULT), r2 /* r2 = reply, no real fixup */ - -___get_user_asm_b1: - ld.b r3, 0, r5 /* r5 = data */ - st.b r4, 0, r5 - or ZERO, ZERO, r2 - -___get_user_asm_b_exit: - ptabs LINK, tr0 - blink tr0, ZERO - - - .global __get_user_asm_w -__get_user_asm_w: - or r2, ZERO, r4 - movi -(EFAULT), r2 /* r2 = reply, no real fixup */ - -___get_user_asm_w1: - ld.w r3, 0, r5 /* r5 = data */ - st.w r4, 0, r5 - or ZERO, ZERO, r2 - -___get_user_asm_w_exit: - ptabs LINK, tr0 - blink tr0, ZERO - - - .global __get_user_asm_l -__get_user_asm_l: - or r2, ZERO, r4 - movi -(EFAULT), r2 /* r2 = reply, no real 
fixup */ - -___get_user_asm_l1: - ld.l r3, 0, r5 /* r5 = data */ - st.l r4, 0, r5 - or ZERO, ZERO, r2 - -___get_user_asm_l_exit: - ptabs LINK, tr0 - blink tr0, ZERO - - - .global __get_user_asm_q -__get_user_asm_q: - or r2, ZERO, r4 - movi -(EFAULT), r2 /* r2 = reply, no real fixup */ - -___get_user_asm_q1: - ld.q r3, 0, r5 /* r5 = data */ - st.q r4, 0, r5 - or ZERO, ZERO, r2 - -___get_user_asm_q_exit: - ptabs LINK, tr0 - blink tr0, ZERO - -/* - * extern long __put_user_asm_?(void *pval, long addr) - * - * Inputs: - * (r2) kernel pointer to value - * (r3) dest address (in User Space) - * - * Ouputs: - * (r2) -EFAULT (faulting) - * 0 (not faulting) - */ - .global __put_user_asm_b -__put_user_asm_b: - ld.b r2, 0, r4 /* r4 = data */ - movi -(EFAULT), r2 /* r2 = reply, no real fixup */ - -___put_user_asm_b1: - st.b r3, 0, r4 - or ZERO, ZERO, r2 - -___put_user_asm_b_exit: - ptabs LINK, tr0 - blink tr0, ZERO - - - .global __put_user_asm_w -__put_user_asm_w: - ld.w r2, 0, r4 /* r4 = data */ - movi -(EFAULT), r2 /* r2 = reply, no real fixup */ - -___put_user_asm_w1: - st.w r3, 0, r4 - or ZERO, ZERO, r2 - -___put_user_asm_w_exit: - ptabs LINK, tr0 - blink tr0, ZERO - - - .global __put_user_asm_l -__put_user_asm_l: - ld.l r2, 0, r4 /* r4 = data */ - movi -(EFAULT), r2 /* r2 = reply, no real fixup */ - -___put_user_asm_l1: - st.l r3, 0, r4 - or ZERO, ZERO, r2 - -___put_user_asm_l_exit: - ptabs LINK, tr0 - blink tr0, ZERO - - - .global __put_user_asm_q -__put_user_asm_q: - ld.q r2, 0, r4 /* r4 = data */ - movi -(EFAULT), r2 /* r2 = reply, no real fixup */ - -___put_user_asm_q1: - st.q r3, 0, r4 - or ZERO, ZERO, r2 - -___put_user_asm_q_exit: - ptabs LINK, tr0 - blink tr0, ZERO - -panic_stash_regs: - /* The idea is : when we get an unhandled panic, we dump the registers - to a known memory location, the just sit in a tight loop. 
- This allows the human to look at the memory region through the GDB - session (assuming the debug module's SHwy initiator isn't locked up - or anything), to hopefully analyze the cause of the panic. */ - - /* On entry, former r15 (SP) is in DCR - former r0 is at resvec_saved_area + 0 - former r1 is at resvec_saved_area + 8 - former tr0 is at resvec_saved_area + 32 - DCR is the only register whose value is lost altogether. - */ - - movi 0xffffffff80000000, r0 ! phy of dump area - ld.q SP, 0x000, r1 ! former r0 - st.q r0, 0x000, r1 - ld.q SP, 0x008, r1 ! former r1 - st.q r0, 0x008, r1 - st.q r0, 0x010, r2 - st.q r0, 0x018, r3 - st.q r0, 0x020, r4 - st.q r0, 0x028, r5 - st.q r0, 0x030, r6 - st.q r0, 0x038, r7 - st.q r0, 0x040, r8 - st.q r0, 0x048, r9 - st.q r0, 0x050, r10 - st.q r0, 0x058, r11 - st.q r0, 0x060, r12 - st.q r0, 0x068, r13 - st.q r0, 0x070, r14 - getcon dcr, r14 - st.q r0, 0x078, r14 - st.q r0, 0x080, r16 - st.q r0, 0x088, r17 - st.q r0, 0x090, r18 - st.q r0, 0x098, r19 - st.q r0, 0x0a0, r20 - st.q r0, 0x0a8, r21 - st.q r0, 0x0b0, r22 - st.q r0, 0x0b8, r23 - st.q r0, 0x0c0, r24 - st.q r0, 0x0c8, r25 - st.q r0, 0x0d0, r26 - st.q r0, 0x0d8, r27 - st.q r0, 0x0e0, r28 - st.q r0, 0x0e8, r29 - st.q r0, 0x0f0, r30 - st.q r0, 0x0f8, r31 - st.q r0, 0x100, r32 - st.q r0, 0x108, r33 - st.q r0, 0x110, r34 - st.q r0, 0x118, r35 - st.q r0, 0x120, r36 - st.q r0, 0x128, r37 - st.q r0, 0x130, r38 - st.q r0, 0x138, r39 - st.q r0, 0x140, r40 - st.q r0, 0x148, r41 - st.q r0, 0x150, r42 - st.q r0, 0x158, r43 - st.q r0, 0x160, r44 - st.q r0, 0x168, r45 - st.q r0, 0x170, r46 - st.q r0, 0x178, r47 - st.q r0, 0x180, r48 - st.q r0, 0x188, r49 - st.q r0, 0x190, r50 - st.q r0, 0x198, r51 - st.q r0, 0x1a0, r52 - st.q r0, 0x1a8, r53 - st.q r0, 0x1b0, r54 - st.q r0, 0x1b8, r55 - st.q r0, 0x1c0, r56 - st.q r0, 0x1c8, r57 - st.q r0, 0x1d0, r58 - st.q r0, 0x1d8, r59 - st.q r0, 0x1e0, r60 - st.q r0, 0x1e8, r61 - st.q r0, 0x1f0, r62 - st.q r0, 0x1f8, r63 ! 
bogus, but for consistency's sake... - - ld.q SP, 0x020, r1 ! former tr0 - st.q r0, 0x200, r1 - gettr tr1, r1 - st.q r0, 0x208, r1 - gettr tr2, r1 - st.q r0, 0x210, r1 - gettr tr3, r1 - st.q r0, 0x218, r1 - gettr tr4, r1 - st.q r0, 0x220, r1 - gettr tr5, r1 - st.q r0, 0x228, r1 - gettr tr6, r1 - st.q r0, 0x230, r1 - gettr tr7, r1 - st.q r0, 0x238, r1 - - getcon sr, r1 - getcon ssr, r2 - getcon pssr, r3 - getcon spc, r4 - getcon pspc, r5 - getcon intevt, r6 - getcon expevt, r7 - getcon pexpevt, r8 - getcon tra, r9 - getcon tea, r10 - getcon kcr0, r11 - getcon kcr1, r12 - getcon vbr, r13 - getcon resvec, r14 - - st.q r0, 0x240, r1 - st.q r0, 0x248, r2 - st.q r0, 0x250, r3 - st.q r0, 0x258, r4 - st.q r0, 0x260, r5 - st.q r0, 0x268, r6 - st.q r0, 0x270, r7 - st.q r0, 0x278, r8 - st.q r0, 0x280, r9 - st.q r0, 0x288, r10 - st.q r0, 0x290, r11 - st.q r0, 0x298, r12 - st.q r0, 0x2a0, r13 - st.q r0, 0x2a8, r14 - - getcon SPC,r2 - getcon SSR,r3 - getcon EXPEVT,r4 - /* Prepare to jump to C - physical address */ - movi panic_handler-CONFIG_PAGE_OFFSET, r1 - ori r1, 1, r1 - ptabs r1, tr0 - getcon DCR, SP - blink tr0, ZERO - nop - nop - nop - nop - - - - -/* - * --- Signal Handling Section - */ - -/* - * extern long long _sa_default_rt_restorer - * extern long long _sa_default_restorer - * - * or, better, - * - * extern void _sa_default_rt_restorer(void) - * extern void _sa_default_restorer(void) - * - * Code prototypes to do a sys_rt_sigreturn() or sys_sysreturn() - * from user space. Copied into user space by signal management. - * Both must be quad aligned and 2 quad long (4 instructions). - * - */ - .balign 8 - .global sa_default_rt_restorer -sa_default_rt_restorer: - movi 0x10, r9 - shori __NR_rt_sigreturn, r9 - trapa r9 - nop - - .balign 8 - .global sa_default_restorer -sa_default_restorer: - movi 0x10, r9 - shori __NR_sigreturn, r9 - trapa r9 - nop - -/* - * --- __ex_table Section - */ - -/* - * User Access Exception Table. 
- */ - .section __ex_table, "a" - - .global asm_uaccess_start /* Just a marker */ -asm_uaccess_start: - -#ifdef CONFIG_MMU - .long ___copy_user1, ___copy_user_exit - .long ___copy_user2, ___copy_user_exit - .long ___clear_user1, ___clear_user_exit -#endif - .long ___get_user_asm_b1, ___get_user_asm_b_exit - .long ___get_user_asm_w1, ___get_user_asm_w_exit - .long ___get_user_asm_l1, ___get_user_asm_l_exit - .long ___get_user_asm_q1, ___get_user_asm_q_exit - .long ___put_user_asm_b1, ___put_user_asm_b_exit - .long ___put_user_asm_w1, ___put_user_asm_w_exit - .long ___put_user_asm_l1, ___put_user_asm_l_exit - .long ___put_user_asm_q1, ___put_user_asm_q_exit - - .global asm_uaccess_end /* Just a marker */ -asm_uaccess_end: - - - - -/* - * --- .init.text Section - */ - - __INIT - -/* - * void trap_init (void) - * - */ - .global trap_init -trap_init: - addi SP, -24, SP /* Room to save r28/r29/r30 */ - st.q SP, 0, r28 - st.q SP, 8, r29 - st.q SP, 16, r30 - - /* Set VBR and RESVEC */ - movi LVBR_block, r19 - andi r19, -4, r19 /* reset MMUOFF + reserved */ - /* For RESVEC exceptions we force the MMU off, which means we need the - physical address. */ - movi LRESVEC_block-CONFIG_PAGE_OFFSET, r20 - andi r20, -4, r20 /* reset reserved */ - ori r20, 1, r20 /* set MMUOFF */ - putcon r19, VBR - putcon r20, RESVEC - - /* Sanity check */ - movi LVBR_block_end, r21 - andi r21, -4, r21 - movi BLOCK_SIZE, r29 /* r29 = expected size */ - or r19, ZERO, r30 - add r19, r29, r19 - - /* - * Ugly, but better loop forever now than crash afterwards. - * We should print a message, but if we touch LVBR or - * LRESVEC blocks we should not be surprised if we get stuck - * in trap_init(). - */ - pta trap_init_loop, tr1 - gettr tr1, r28 /* r28 = trap_init_loop */ - sub r21, r30, r30 /* r30 = actual size */ - - /* - * VBR/RESVEC handlers overlap by being bigger than - * allowed. Very bad. Just loop forever. 
- * (r28) panic/loop address - * (r29) expected size - * (r30) actual size - */ -trap_init_loop: - bne r19, r21, tr1 - - /* Now that exception vectors are set up reset SR.BL */ - getcon SR, r22 - movi SR_UNBLOCK_EXC, r23 - and r22, r23, r22 - putcon r22, SR - - addi SP, 24, SP - ptabs LINK, tr0 - blink tr0, ZERO - diff --git a/arch/sh/kernel/cpu/sh5/fpu.c b/arch/sh/kernel/cpu/sh5/fpu.c deleted file mode 100644 index 3966b5ee8e93..000000000000 --- a/arch/sh/kernel/cpu/sh5/fpu.c +++ /dev/null @@ -1,106 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/sh/kernel/cpu/sh5/fpu.c - * - * Copyright (C) 2001 Manuela Cirronis, Paolo Alberelli - * Copyright (C) 2002 STMicroelectronics Limited - * Author : Stuart Menefy - * - * Started from SH4 version: - * Copyright (C) 1999, 2000 Kaz Kojima & Niibe Yutaka - */ -#include -#include -#include - -void save_fpu(struct task_struct *tsk) -{ - asm volatile("fst.p %0, (0*8), fp0\n\t" - "fst.p %0, (1*8), fp2\n\t" - "fst.p %0, (2*8), fp4\n\t" - "fst.p %0, (3*8), fp6\n\t" - "fst.p %0, (4*8), fp8\n\t" - "fst.p %0, (5*8), fp10\n\t" - "fst.p %0, (6*8), fp12\n\t" - "fst.p %0, (7*8), fp14\n\t" - "fst.p %0, (8*8), fp16\n\t" - "fst.p %0, (9*8), fp18\n\t" - "fst.p %0, (10*8), fp20\n\t" - "fst.p %0, (11*8), fp22\n\t" - "fst.p %0, (12*8), fp24\n\t" - "fst.p %0, (13*8), fp26\n\t" - "fst.p %0, (14*8), fp28\n\t" - "fst.p %0, (15*8), fp30\n\t" - "fst.p %0, (16*8), fp32\n\t" - "fst.p %0, (17*8), fp34\n\t" - "fst.p %0, (18*8), fp36\n\t" - "fst.p %0, (19*8), fp38\n\t" - "fst.p %0, (20*8), fp40\n\t" - "fst.p %0, (21*8), fp42\n\t" - "fst.p %0, (22*8), fp44\n\t" - "fst.p %0, (23*8), fp46\n\t" - "fst.p %0, (24*8), fp48\n\t" - "fst.p %0, (25*8), fp50\n\t" - "fst.p %0, (26*8), fp52\n\t" - "fst.p %0, (27*8), fp54\n\t" - "fst.p %0, (28*8), fp56\n\t" - "fst.p %0, (29*8), fp58\n\t" - "fst.p %0, (30*8), fp60\n\t" - "fst.p %0, (31*8), fp62\n\t" - - "fgetscr fr63\n\t" - "fst.s %0, (32*8), fr63\n\t" - : /* no output */ - : "r" (&tsk->thread.xstate->hardfpu) - 
: "memory"); -} - -void restore_fpu(struct task_struct *tsk) -{ - asm volatile("fld.p %0, (0*8), fp0\n\t" - "fld.p %0, (1*8), fp2\n\t" - "fld.p %0, (2*8), fp4\n\t" - "fld.p %0, (3*8), fp6\n\t" - "fld.p %0, (4*8), fp8\n\t" - "fld.p %0, (5*8), fp10\n\t" - "fld.p %0, (6*8), fp12\n\t" - "fld.p %0, (7*8), fp14\n\t" - "fld.p %0, (8*8), fp16\n\t" - "fld.p %0, (9*8), fp18\n\t" - "fld.p %0, (10*8), fp20\n\t" - "fld.p %0, (11*8), fp22\n\t" - "fld.p %0, (12*8), fp24\n\t" - "fld.p %0, (13*8), fp26\n\t" - "fld.p %0, (14*8), fp28\n\t" - "fld.p %0, (15*8), fp30\n\t" - "fld.p %0, (16*8), fp32\n\t" - "fld.p %0, (17*8), fp34\n\t" - "fld.p %0, (18*8), fp36\n\t" - "fld.p %0, (19*8), fp38\n\t" - "fld.p %0, (20*8), fp40\n\t" - "fld.p %0, (21*8), fp42\n\t" - "fld.p %0, (22*8), fp44\n\t" - "fld.p %0, (23*8), fp46\n\t" - "fld.p %0, (24*8), fp48\n\t" - "fld.p %0, (25*8), fp50\n\t" - "fld.p %0, (26*8), fp52\n\t" - "fld.p %0, (27*8), fp54\n\t" - "fld.p %0, (28*8), fp56\n\t" - "fld.p %0, (29*8), fp58\n\t" - "fld.p %0, (30*8), fp60\n\t" - - "fld.s %0, (32*8), fr63\n\t" - "fputscr fr63\n\t" - - "fld.p %0, (31*8), fp62\n\t" - : /* no output */ - : "r" (&tsk->thread.xstate->hardfpu) - : "memory"); -} - -asmlinkage void do_fpu_error(unsigned long ex, struct pt_regs *regs) -{ - regs->pc += 4; - - force_sig(SIGFPE); -} diff --git a/arch/sh/kernel/cpu/sh5/probe.c b/arch/sh/kernel/cpu/sh5/probe.c deleted file mode 100644 index 947250188065..000000000000 --- a/arch/sh/kernel/cpu/sh5/probe.c +++ /dev/null @@ -1,72 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/sh/kernel/cpu/sh5/probe.c - * - * CPU Subtype Probing for SH-5. - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 - 2007 Paul Mundt - */ -#include -#include -#include -#include -#include -#include - -void cpu_probe(void) -{ - unsigned long long cir; - - /* - * Do peeks in real mode to avoid having to set up a mapping for - * the WPC registers. 
On SH5-101 cut2, such a mapping would be - * exposed to an address translation erratum which would make it - * hard to set up correctly. - */ - cir = peek_real_address_q(0x0d000008); - if ((cir & 0xffff) == 0x5103) - boot_cpu_data.type = CPU_SH5_103; - else if (((cir >> 32) & 0xffff) == 0x51e2) - /* CPU.VCR aliased at CIR address on SH5-101 */ - boot_cpu_data.type = CPU_SH5_101; - - boot_cpu_data.family = CPU_FAMILY_SH5; - - /* - * First, setup some sane values for the I-cache. - */ - boot_cpu_data.icache.ways = 4; - boot_cpu_data.icache.sets = 256; - boot_cpu_data.icache.linesz = L1_CACHE_BYTES; - boot_cpu_data.icache.way_incr = (1 << 13); - boot_cpu_data.icache.entry_shift = 5; - boot_cpu_data.icache.way_size = boot_cpu_data.icache.sets * - boot_cpu_data.icache.linesz; - boot_cpu_data.icache.entry_mask = 0x1fe0; - boot_cpu_data.icache.flags = 0; - - /* - * Next, setup some sane values for the D-cache. - * - * On the SH5, these are pretty consistent with the I-cache settings, - * so we just copy over the existing definitions.. these can be fixed - * up later, especially if we add runtime CPU probing. - * - * Though in the meantime it saves us from having to duplicate all of - * the above definitions.. 
- */ - boot_cpu_data.dcache = boot_cpu_data.icache; - - /* - * Setup any cache-related flags here - */ -#if defined(CONFIG_CACHE_WRITETHROUGH) - set_bit(SH_CACHE_MODE_WT, &(boot_cpu_data.dcache.flags)); -#elif defined(CONFIG_CACHE_WRITEBACK) - set_bit(SH_CACHE_MODE_WB, &(boot_cpu_data.dcache.flags)); -#endif - - /* Setup some I/D TLB defaults */ - sh64_tlb_init(); -} diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c deleted file mode 100644 index dc8476d67244..000000000000 --- a/arch/sh/kernel/cpu/sh5/setup-sh5.c +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * SH5-101/SH5-103 CPU Setup - * - * Copyright (C) 2009 Paul Mundt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct plat_sci_port scif0_platform_data = { - .flags = UPF_IOREMAP, - .scscr = SCSCR_REIE, - .type = PORT_SCIF, -}; - -static struct resource scif0_resources[] = { - DEFINE_RES_MEM(PHYS_PERIPHERAL_BLOCK + 0x01030000, 0x100), - DEFINE_RES_IRQ(39), - DEFINE_RES_IRQ(40), - DEFINE_RES_IRQ(42), -}; - -static struct platform_device scif0_device = { - .name = "sh-sci", - .id = 0, - .resource = scif0_resources, - .num_resources = ARRAY_SIZE(scif0_resources), - .dev = { - .platform_data = &scif0_platform_data, - }, -}; - -static struct resource rtc_resources[] = { - [0] = { - .start = PHYS_PERIPHERAL_BLOCK + 0x01040000, - .end = PHYS_PERIPHERAL_BLOCK + 0x01040000 + 0x58 - 1, - .flags = IORESOURCE_IO, - }, - [1] = { - /* Period IRQ */ - .start = IRQ_PRI, - .flags = IORESOURCE_IRQ, - }, - [2] = { - /* Carry IRQ */ - .start = IRQ_CUI, - .flags = IORESOURCE_IRQ, - }, - [3] = { - /* Alarm IRQ */ - .start = IRQ_ATI, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device rtc_device = { - .name = "sh-rtc", - .id = -1, - .num_resources = ARRAY_SIZE(rtc_resources), - .resource = rtc_resources, -}; - -#define TMU_BLOCK_OFF 0x01020000 -#define TMU_BASE PHYS_PERIPHERAL_BLOCK + 
TMU_BLOCK_OFF - -static struct sh_timer_config tmu0_platform_data = { - .channels_mask = 7, -}; - -static struct resource tmu0_resources[] = { - DEFINE_RES_MEM(TMU_BASE, 0x30), - DEFINE_RES_IRQ(IRQ_TUNI0), - DEFINE_RES_IRQ(IRQ_TUNI1), - DEFINE_RES_IRQ(IRQ_TUNI2), -}; - -static struct platform_device tmu0_device = { - .name = "sh-tmu", - .id = 0, - .dev = { - .platform_data = &tmu0_platform_data, - }, - .resource = tmu0_resources, - .num_resources = ARRAY_SIZE(tmu0_resources), -}; - -static struct platform_device *sh5_early_devices[] __initdata = { - &scif0_device, - &tmu0_device, -}; - -static struct platform_device *sh5_devices[] __initdata = { - &rtc_device, -}; - -static int __init sh5_devices_setup(void) -{ - int ret; - - ret = platform_add_devices(sh5_early_devices, - ARRAY_SIZE(sh5_early_devices)); - if (unlikely(ret != 0)) - return ret; - - return platform_add_devices(sh5_devices, - ARRAY_SIZE(sh5_devices)); -} -arch_initcall(sh5_devices_setup); - -void __init plat_early_device_setup(void) -{ - sh_early_platform_add_devices(sh5_early_devices, - ARRAY_SIZE(sh5_early_devices)); -} diff --git a/arch/sh/kernel/cpu/sh5/switchto.S b/arch/sh/kernel/cpu/sh5/switchto.S deleted file mode 100644 index d1beff755632..000000000000 --- a/arch/sh/kernel/cpu/sh5/switchto.S +++ /dev/null @@ -1,195 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * arch/sh/kernel/cpu/sh5/switchto.S - * - * sh64 context switch - * - * Copyright (C) 2004 Richard Curnow -*/ - - .section .text..SHmedia32,"ax" - .little - - .balign 32 - - .type sh64_switch_to,@function - .global sh64_switch_to - .global __sh64_switch_to_end -sh64_switch_to: - -/* Incoming args - r2 - prev - r3 - &prev->thread - r4 - next - r5 - &next->thread - - Outgoing results - r2 - last (=prev) : this just stays in r2 throughout - - Want to create a full (struct pt_regs) on the stack to allow backtracing - functions to work. 
However, we only need to populate the callee-save - register slots in this structure; since we're a function our ancestors must - have themselves preserved all caller saved state in the stack. This saves - some wasted effort since we won't need to look at the values. - - In particular, all caller-save registers are immediately available for - scratch use. - -*/ - -#define FRAME_SIZE (76*8 + 8) - - movi FRAME_SIZE, r0 - sub.l r15, r0, r15 - ! Do normal-style register save to support backtrace - - st.l r15, 0, r18 ! save link reg - st.l r15, 4, r14 ! save fp - add.l r15, r63, r14 ! setup frame pointer - - ! hopefully this looks normal to the backtrace now. - - addi.l r15, 8, r1 ! base of pt_regs - addi.l r1, 24, r0 ! base of pt_regs.regs - addi.l r0, (63*8), r8 ! base of pt_regs.trregs - - /* Note : to be fixed? - struct pt_regs is really designed for holding the state on entry - to an exception, i.e. pc,sr,regs etc. However, for the context - switch state, some of this is not required. But the unwinder takes - struct pt_regs * as an arg so we have to build this structure - to allow unwinding switched tasks in show_state() */ - - st.q r0, ( 9*8), r9 - st.q r0, (10*8), r10 - st.q r0, (11*8), r11 - st.q r0, (12*8), r12 - st.q r0, (13*8), r13 - st.q r0, (14*8), r14 ! for unwind, want to look as though we took a trap at - ! the point where the process is left in suspended animation, i.e. current - ! fp here, not the saved one. 
- st.q r0, (16*8), r16 - - st.q r0, (24*8), r24 - st.q r0, (25*8), r25 - st.q r0, (26*8), r26 - st.q r0, (27*8), r27 - st.q r0, (28*8), r28 - st.q r0, (29*8), r29 - st.q r0, (30*8), r30 - st.q r0, (31*8), r31 - st.q r0, (32*8), r32 - st.q r0, (33*8), r33 - st.q r0, (34*8), r34 - st.q r0, (35*8), r35 - - st.q r0, (44*8), r44 - st.q r0, (45*8), r45 - st.q r0, (46*8), r46 - st.q r0, (47*8), r47 - st.q r0, (48*8), r48 - st.q r0, (49*8), r49 - st.q r0, (50*8), r50 - st.q r0, (51*8), r51 - st.q r0, (52*8), r52 - st.q r0, (53*8), r53 - st.q r0, (54*8), r54 - st.q r0, (55*8), r55 - st.q r0, (56*8), r56 - st.q r0, (57*8), r57 - st.q r0, (58*8), r58 - st.q r0, (59*8), r59 - - ! do this early as pta->gettr has no pipeline forwarding (=> 5 cycle latency) - ! Use a local label to avoid creating a symbol that will confuse the ! - ! backtrace - pta .Lsave_pc, tr0 - - gettr tr5, r45 - gettr tr6, r46 - gettr tr7, r47 - st.q r8, (5*8), r45 - st.q r8, (6*8), r46 - st.q r8, (7*8), r47 - - ! Now switch context - gettr tr0, r9 - st.l r3, 0, r15 ! prev->thread.sp - st.l r3, 8, r1 ! prev->thread.kregs - st.l r3, 4, r9 ! prev->thread.pc - st.q r1, 0, r9 ! save prev->thread.pc into pt_regs->pc - - ! Load PC for next task (init value or save_pc later) - ld.l r5, 4, r18 ! next->thread.pc - ! Switch stacks - ld.l r5, 0, r15 ! next->thread.sp - ptabs r18, tr0 - - ! Update current - ld.l r4, 4, r9 ! next->thread_info (2nd element of next task_struct) - putcon r9, kcr0 ! current = next->thread_info - - ! go to save_pc for a reschedule, or the initial thread.pc for a new process - blink tr0, r63 - - ! Restore (when we come back to a previously saved task) -.Lsave_pc: - addi.l r15, 32, r0 ! r0 = next's regs - addi.l r0, (63*8), r8 ! 
r8 = next's tr_regs - - ld.q r8, (5*8), r45 - ld.q r8, (6*8), r46 - ld.q r8, (7*8), r47 - ptabs r45, tr5 - ptabs r46, tr6 - ptabs r47, tr7 - - ld.q r0, ( 9*8), r9 - ld.q r0, (10*8), r10 - ld.q r0, (11*8), r11 - ld.q r0, (12*8), r12 - ld.q r0, (13*8), r13 - ld.q r0, (14*8), r14 - ld.q r0, (16*8), r16 - - ld.q r0, (24*8), r24 - ld.q r0, (25*8), r25 - ld.q r0, (26*8), r26 - ld.q r0, (27*8), r27 - ld.q r0, (28*8), r28 - ld.q r0, (29*8), r29 - ld.q r0, (30*8), r30 - ld.q r0, (31*8), r31 - ld.q r0, (32*8), r32 - ld.q r0, (33*8), r33 - ld.q r0, (34*8), r34 - ld.q r0, (35*8), r35 - - ld.q r0, (44*8), r44 - ld.q r0, (45*8), r45 - ld.q r0, (46*8), r46 - ld.q r0, (47*8), r47 - ld.q r0, (48*8), r48 - ld.q r0, (49*8), r49 - ld.q r0, (50*8), r50 - ld.q r0, (51*8), r51 - ld.q r0, (52*8), r52 - ld.q r0, (53*8), r53 - ld.q r0, (54*8), r54 - ld.q r0, (55*8), r55 - ld.q r0, (56*8), r56 - ld.q r0, (57*8), r57 - ld.q r0, (58*8), r58 - ld.q r0, (59*8), r59 - - ! epilogue - ld.l r15, 0, r18 - ld.l r15, 4, r14 - ptabs r18, tr0 - movi FRAME_SIZE, r0 - add r15, r0, r15 - blink tr0, r63 -__sh64_switch_to_end: -.LFE1: - .size sh64_switch_to,.LFE1-sh64_switch_to - diff --git a/arch/sh/kernel/cpu/sh5/unwind.c b/arch/sh/kernel/cpu/sh5/unwind.c deleted file mode 100644 index 3cb0cd9cea29..000000000000 --- a/arch/sh/kernel/cpu/sh5/unwind.c +++ /dev/null @@ -1,342 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/sh/kernel/cpu/sh5/unwind.c - * - * Copyright (C) 2004 Paul Mundt - * Copyright (C) 2004 Richard Curnow - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static u8 regcache[63]; - -/* - * Finding the previous stack frame isn't horribly straightforward as it is - * on some other platforms. In the sh64 case, we don't have "linked" stack - * frames, so we need to do a bit of work to determine the previous frame, - * and in turn, the previous r14/r18 pair. 
- * - * There are generally a few cases which determine where we can find out - * the r14/r18 values. In the general case, this can be determined by poking - * around the prologue of the symbol PC is in (note that we absolutely must - * have frame pointer support as well as the kernel symbol table mapped, - * otherwise we can't even get this far). - * - * In other cases, such as the interrupt/exception path, we can poke around - * the sp/fp. - * - * Notably, this entire approach is somewhat error prone, and in the event - * that the previous frame cannot be determined, that's all we can do. - * Either way, this still leaves us with a more correct backtrace then what - * we would be able to come up with by walking the stack (which is garbage - * for anything beyond the first frame). - * -- PFM. - */ -static int lookup_prev_stack_frame(unsigned long fp, unsigned long pc, - unsigned long *pprev_fp, unsigned long *pprev_pc, - struct pt_regs *regs) -{ - const char *sym; - char namebuf[128]; - unsigned long offset; - unsigned long prologue = 0; - unsigned long fp_displacement = 0; - unsigned long fp_prev = 0; - unsigned long offset_r14 = 0, offset_r18 = 0; - int i, found_prologue_end = 0; - - sym = kallsyms_lookup(pc, NULL, &offset, NULL, namebuf); - if (!sym) - return -EINVAL; - - prologue = pc - offset; - if (!prologue) - return -EINVAL; - - /* Validate fp, to avoid risk of dereferencing a bad pointer later. - Assume 128Mb since that's the amount of RAM on a Cayman. Modify - when there is an SH-5 board with more. */ - if ((fp < (unsigned long) phys_to_virt(__MEMORY_START)) || - (fp >= (unsigned long)(phys_to_virt(__MEMORY_START)) + 128*1024*1024) || - ((fp & 7) != 0)) { - return -EINVAL; - } - - /* - * Depth to walk, depth is completely arbitrary. 
- */ - for (i = 0; i < 100; i++, prologue += sizeof(unsigned long)) { - unsigned long op; - u8 major, minor; - u8 src, dest, disp; - - op = *(unsigned long *)prologue; - - major = (op >> 26) & 0x3f; - src = (op >> 20) & 0x3f; - minor = (op >> 16) & 0xf; - disp = (op >> 10) & 0x3f; - dest = (op >> 4) & 0x3f; - - /* - * Stack frame creation happens in a number of ways.. in the - * general case when the stack frame is less than 511 bytes, - * it's generally created by an addi or addi.l: - * - * addi/addi.l r15, -FRAME_SIZE, r15 - * - * in the event that the frame size is bigger than this, it's - * typically created using a movi/sub pair as follows: - * - * movi FRAME_SIZE, rX - * sub r15, rX, r15 - */ - - switch (major) { - case (0x00 >> 2): - switch (minor) { - case 0x8: /* add.l */ - case 0x9: /* add */ - /* Look for r15, r63, r14 */ - if (src == 15 && disp == 63 && dest == 14) - found_prologue_end = 1; - - break; - case 0xa: /* sub.l */ - case 0xb: /* sub */ - if (src != 15 || dest != 15) - continue; - - fp_displacement -= regcache[disp]; - fp_prev = fp - fp_displacement; - break; - } - break; - case (0xa8 >> 2): /* st.l */ - if (src != 15) - continue; - - switch (dest) { - case 14: - if (offset_r14 || fp_displacement == 0) - continue; - - offset_r14 = (u64)(((((s64)op >> 10) & 0x3ff) << 54) >> 54); - offset_r14 *= sizeof(unsigned long); - offset_r14 += fp_displacement; - break; - case 18: - if (offset_r18 || fp_displacement == 0) - continue; - - offset_r18 = (u64)(((((s64)op >> 10) & 0x3ff) << 54) >> 54); - offset_r18 *= sizeof(unsigned long); - offset_r18 += fp_displacement; - break; - } - - break; - case (0xcc >> 2): /* movi */ - if (dest >= 63) { - printk(KERN_NOTICE "%s: Invalid dest reg %d " - "specified in movi handler. 
Failed " - "opcode was 0x%lx: ", __func__, - dest, op); - - continue; - } - - /* Sign extend */ - regcache[dest] = - sign_extend64((((u64)op >> 10) & 0xffff), 9); - break; - case (0xd0 >> 2): /* addi */ - case (0xd4 >> 2): /* addi.l */ - /* Look for r15, -FRAME_SIZE, r15 */ - if (src != 15 || dest != 15) - continue; - - /* Sign extended frame size.. */ - fp_displacement += - (u64)(((((s64)op >> 10) & 0x3ff) << 54) >> 54); - fp_prev = fp - fp_displacement; - break; - } - - if (found_prologue_end && offset_r14 && (offset_r18 || *pprev_pc) && fp_prev) - break; - } - - if (offset_r14 == 0 || fp_prev == 0) { - if (!offset_r14) - pr_debug("Unable to find r14 offset\n"); - if (!fp_prev) - pr_debug("Unable to find previous fp\n"); - - return -EINVAL; - } - - /* For innermost leaf function, there might not be a offset_r18 */ - if (!*pprev_pc && (offset_r18 == 0)) - return -EINVAL; - - *pprev_fp = *(unsigned long *)(fp_prev + offset_r14); - - if (offset_r18) - *pprev_pc = *(unsigned long *)(fp_prev + offset_r18); - - *pprev_pc &= ~1; - - return 0; -} - -/* - * Don't put this on the stack since we'll want to call in to - * sh64_unwinder_dump() when we're close to underflowing the stack - * anyway. 
- */ -static struct pt_regs here_regs; - -extern const char syscall_ret; -extern const char ret_from_syscall; -extern const char ret_from_exception; -extern const char ret_from_irq; - -static void sh64_unwind_inner(const struct stacktrace_ops *ops, - void *data, struct pt_regs *regs); - -static inline void unwind_nested(const struct stacktrace_ops *ops, void *data, - unsigned long pc, unsigned long fp) -{ - if ((fp >= __MEMORY_START) && - ((fp & 7) == 0)) - sh64_unwind_inner(ops, data, (struct pt_regs *)fp); -} - -static void sh64_unwind_inner(const struct stacktrace_ops *ops, - void *data, struct pt_regs *regs) -{ - unsigned long pc, fp; - int ofs = 0; - int first_pass; - - pc = regs->pc & ~1; - fp = regs->regs[14]; - - first_pass = 1; - for (;;) { - int cond; - unsigned long next_fp, next_pc; - - if (pc == ((unsigned long)&syscall_ret & ~1)) { - printk("SYSCALL\n"); - unwind_nested(ops, data, pc, fp); - return; - } - - if (pc == ((unsigned long)&ret_from_syscall & ~1)) { - printk("SYSCALL (PREEMPTED)\n"); - unwind_nested(ops, data, pc, fp); - return; - } - - /* In this case, the PC is discovered by lookup_prev_stack_frame but - it has 4 taken off it to look like the 'caller' */ - if (pc == ((unsigned long)&ret_from_exception & ~1)) { - printk("EXCEPTION\n"); - unwind_nested(ops, data, pc, fp); - return; - } - - if (pc == ((unsigned long)&ret_from_irq & ~1)) { - printk("IRQ\n"); - unwind_nested(ops, data, pc, fp); - return; - } - - cond = ((pc >= __MEMORY_START) && (fp >= __MEMORY_START) && - ((pc & 3) == 0) && ((fp & 7) == 0)); - - pc -= ofs; - - ops->address(data, pc, 1); - - if (first_pass) { - /* If the innermost frame is a leaf function, it's - * possible that r18 is never saved out to the stack. 
- */ - next_pc = regs->regs[18]; - } else { - next_pc = 0; - } - - if (lookup_prev_stack_frame(fp, pc, &next_fp, &next_pc, regs) == 0) { - ofs = sizeof(unsigned long); - pc = next_pc & ~1; - fp = next_fp; - } else { - printk("Unable to lookup previous stack frame\n"); - break; - } - first_pass = 0; - } - - printk("\n"); -} - -static void sh64_unwinder_dump(struct task_struct *task, - struct pt_regs *regs, - unsigned long *sp, - const struct stacktrace_ops *ops, - void *data) -{ - if (!regs) { - /* - * Fetch current regs if we have no other saved state to back - * trace from. - */ - regs = &here_regs; - - __asm__ __volatile__ ("ori r14, 0, %0" : "=r" (regs->regs[14])); - __asm__ __volatile__ ("ori r15, 0, %0" : "=r" (regs->regs[15])); - __asm__ __volatile__ ("ori r18, 0, %0" : "=r" (regs->regs[18])); - - __asm__ __volatile__ ("gettr tr0, %0" : "=r" (regs->tregs[0])); - __asm__ __volatile__ ("gettr tr1, %0" : "=r" (regs->tregs[1])); - __asm__ __volatile__ ("gettr tr2, %0" : "=r" (regs->tregs[2])); - __asm__ __volatile__ ("gettr tr3, %0" : "=r" (regs->tregs[3])); - __asm__ __volatile__ ("gettr tr4, %0" : "=r" (regs->tregs[4])); - __asm__ __volatile__ ("gettr tr5, %0" : "=r" (regs->tregs[5])); - __asm__ __volatile__ ("gettr tr6, %0" : "=r" (regs->tregs[6])); - __asm__ __volatile__ ("gettr tr7, %0" : "=r" (regs->tregs[7])); - - __asm__ __volatile__ ( - "pta 0f, tr0\n\t" - "blink tr0, %0\n\t" - "0: nop" - : "=r" (regs->pc) - ); - } - - sh64_unwind_inner(ops, data, regs); -} - -static struct unwinder sh64_unwinder = { - .name = "sh64-unwinder", - .dump = sh64_unwinder_dump, - .rating = 150, -}; - -static int __init sh64_unwinder_init(void) -{ - return unwinder_register(&sh64_unwinder); -} -early_initcall(sh64_unwinder_init); diff --git a/arch/sh/kernel/head_64.S b/arch/sh/kernel/head_64.S deleted file mode 100644 index 67685e1f00e1..000000000000 --- a/arch/sh/kernel/head_64.S +++ /dev/null @@ -1,346 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * 
arch/sh/kernel/head_64.S - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003, 2004 Paul Mundt - */ - -#include - -#include -#include -#include -#include -#include -#include - -/* - * MMU defines: TLB boundaries. - */ - -#define MMUIR_FIRST ITLB_FIXED -#define MMUIR_END ITLB_LAST_VAR_UNRESTRICTED+TLB_STEP -#define MMUIR_STEP TLB_STEP - -#define MMUDR_FIRST DTLB_FIXED -#define MMUDR_END DTLB_LAST_VAR_UNRESTRICTED+TLB_STEP -#define MMUDR_STEP TLB_STEP - -/* Safety check : CONFIG_PAGE_OFFSET has to be a multiple of 512Mb */ -#if (CONFIG_PAGE_OFFSET & ((1UL<<29)-1)) -#error "CONFIG_PAGE_OFFSET must be a multiple of 512Mb" -#endif - -/* - * MMU defines: Fixed TLBs. - */ -/* Deal safely with the case where the base of RAM is not 512Mb aligned */ - -#define ALIGN_512M_MASK (0xffffffffe0000000) -#define ALIGNED_EFFECTIVE ((CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START) & ALIGN_512M_MASK) -#define ALIGNED_PHYSICAL (CONFIG_MEMORY_START & ALIGN_512M_MASK) - -#define MMUIR_TEXT_H (0x0000000000000003 | ALIGNED_EFFECTIVE) - /* Enabled, Shared, ASID 0, Eff. Add. 0xA0000000 */ - -#define MMUIR_TEXT_L (0x000000000000009a | ALIGNED_PHYSICAL) - /* 512 Mb, Cacheable, Write-back, execute, Not User, Ph. Add. */ - -#define MMUDR_CACHED_H 0x0000000000000003 | ALIGNED_EFFECTIVE - /* Enabled, Shared, ASID 0, Eff. Add. 0xA0000000 */ -#define MMUDR_CACHED_L 0x000000000000015a | ALIGNED_PHYSICAL - /* 512 Mb, Cacheable, Write-back, read/write, Not User, Ph. Add. 
*/ - -#ifdef CONFIG_CACHE_OFF -#define ICCR0_INIT_VAL ICCR0_OFF /* ICACHE off */ -#else -#define ICCR0_INIT_VAL ICCR0_ON | ICCR0_ICI /* ICE + ICI */ -#endif -#define ICCR1_INIT_VAL ICCR1_NOLOCK /* No locking */ - -#if defined (CONFIG_CACHE_OFF) -#define OCCR0_INIT_VAL OCCR0_OFF /* D-cache: off */ -#elif defined (CONFIG_CACHE_WRITETHROUGH) -#define OCCR0_INIT_VAL OCCR0_ON | OCCR0_OCI | OCCR0_WT /* D-cache: on, */ - /* WT, invalidate */ -#elif defined (CONFIG_CACHE_WRITEBACK) -#define OCCR0_INIT_VAL OCCR0_ON | OCCR0_OCI | OCCR0_WB /* D-cache: on, */ - /* WB, invalidate */ -#else -#error preprocessor flag CONFIG_CACHE_... not recognized! -#endif - -#define OCCR1_INIT_VAL OCCR1_NOLOCK /* No locking */ - - .section .empty_zero_page, "aw" - .global empty_zero_page - -empty_zero_page: - .long 1 /* MOUNT_ROOT_RDONLY */ - .long 0 /* RAMDISK_FLAGS */ - .long 0x0200 /* ORIG_ROOT_DEV */ - .long 1 /* LOADER_TYPE */ - .long 0x00800000 /* INITRD_START */ - .long 0x00800000 /* INITRD_SIZE */ - .long 0 - - .text - .balign 4096,0,4096 - - .section .data, "aw" - .balign PAGE_SIZE - - .section .data, "aw" - .balign PAGE_SIZE - - .global mmu_pdtp_cache -mmu_pdtp_cache: - .space PAGE_SIZE, 0 - - .global fpu_in_use -fpu_in_use: .quad 0 - - - __HEAD - .balign L1_CACHE_BYTES -/* - * Condition at the entry of __stext: - * . Reset state: - * . SR.FD = 1 (FPU disabled) - * . SR.BL = 1 (Exceptions disabled) - * . SR.MD = 1 (Privileged Mode) - * . SR.MMU = 0 (MMU Disabled) - * . SR.CD = 0 (CTC User Visible) - * . SR.IMASK = Undefined (Interrupt Mask) - * - * Operations supposed to be performed by __stext: - * . prevent speculative fetch onto device memory while MMU is off - * . reflect as much as possible SH5 ABI (r15, r26, r27, r18) - * . first, save CPU state and set it to something harmless - * . any CPU detection and/or endianness settings (?) - * . initialize EMI/LMI (but not TMU/RTC/INTC/SCIF): TBD - * . 
set initial TLB entries for cached and uncached regions - * (no fine granularity paging) - * . set initial cache state - * . enable MMU and caches - * . set CPU to a consistent state - * . registers (including stack pointer and current/KCR0) - * . NOT expecting to set Exception handling nor VBR/RESVEC/DCR - * at this stage. This is all to later Linux initialization steps. - * . initialize FPU - * . clear BSS - * . jump into start_kernel() - * . be prepared to hopeless start_kernel() returns. - * - */ - .global _stext -_stext: - /* - * Prevent speculative fetch on device memory due to - * uninitialized target registers. - */ - ptabs/u ZERO, tr0 - ptabs/u ZERO, tr1 - ptabs/u ZERO, tr2 - ptabs/u ZERO, tr3 - ptabs/u ZERO, tr4 - ptabs/u ZERO, tr5 - ptabs/u ZERO, tr6 - ptabs/u ZERO, tr7 - synci - - /* - * Read/Set CPU state. After this block: - * r29 = Initial SR - */ - getcon SR, r29 - movi SR_HARMLESS, r20 - putcon r20, SR - - /* - * Initialize EMI/LMI. To Be Done. - */ - - /* - * CPU detection and/or endianness settings (?). To Be Done. - * Pure PIC code here, please ! Just save state into r30. - * After this block: - * r30 = CPU type/Platform Endianness - */ - - /* - * Set initial TLB entries for cached and uncached regions. - * Note: PTA/BLINK is PIC code, PTABS/BLINK isn't ! 
- */ - /* Clear ITLBs */ - pta clear_ITLB, tr1 - movi MMUIR_FIRST, r21 - movi MMUIR_END, r22 -clear_ITLB: - putcfg r21, 0, ZERO /* Clear MMUIR[n].PTEH.V */ - addi r21, MMUIR_STEP, r21 - bne r21, r22, tr1 - - /* Clear DTLBs */ - pta clear_DTLB, tr1 - movi MMUDR_FIRST, r21 - movi MMUDR_END, r22 -clear_DTLB: - putcfg r21, 0, ZERO /* Clear MMUDR[n].PTEH.V */ - addi r21, MMUDR_STEP, r21 - bne r21, r22, tr1 - - /* Map one big (512Mb) page for ITLB */ - movi MMUIR_FIRST, r21 - movi MMUIR_TEXT_L, r22 /* PTEL first */ - add.l r22, r63, r22 /* Sign extend */ - putcfg r21, 1, r22 /* Set MMUIR[0].PTEL */ - movi MMUIR_TEXT_H, r22 /* PTEH last */ - add.l r22, r63, r22 /* Sign extend */ - putcfg r21, 0, r22 /* Set MMUIR[0].PTEH */ - - /* Map one big CACHED (512Mb) page for DTLB */ - movi MMUDR_FIRST, r21 - movi MMUDR_CACHED_L, r22 /* PTEL first */ - add.l r22, r63, r22 /* Sign extend */ - putcfg r21, 1, r22 /* Set MMUDR[0].PTEL */ - movi MMUDR_CACHED_H, r22 /* PTEH last */ - add.l r22, r63, r22 /* Sign extend */ - putcfg r21, 0, r22 /* Set MMUDR[0].PTEH */ - - /* - * Setup a DTLB translation for SCIF phys. - */ - addi r21, MMUDR_STEP, r21 - movi 0x0a03, r22 /* SCIF phys */ - shori 0x0148, r22 - putcfg r21, 1, r22 /* PTEL first */ - movi 0xfa03, r22 /* 0xfa030000, fixed SCIF virt */ - shori 0x0003, r22 - putcfg r21, 0, r22 /* PTEH last */ - - /* - * Set cache behaviours. - */ - /* ICache */ - movi ICCR_BASE, r21 - movi ICCR0_INIT_VAL, r22 - movi ICCR1_INIT_VAL, r23 - putcfg r21, ICCR_REG0, r22 - putcfg r21, ICCR_REG1, r23 - - /* OCache */ - movi OCCR_BASE, r21 - movi OCCR0_INIT_VAL, r22 - movi OCCR1_INIT_VAL, r23 - putcfg r21, OCCR_REG0, r22 - putcfg r21, OCCR_REG1, r23 - - - /* - * Enable Caches and MMU. Do the first non-PIC jump. - * Now head.S global variables, constants and externs - * can be used. 
- */ - getcon SR, r21 - movi SR_ENABLE_MMU, r22 - or r21, r22, r21 - putcon r21, SSR - movi hyperspace, r22 - ori r22, 1, r22 /* Make it SHmedia, not required but..*/ - putcon r22, SPC - synco - rte /* And now go into the hyperspace ... */ -hyperspace: /* ... that's the next instruction ! */ - - /* - * Set CPU to a consistent state. - * r31 = FPU support flag - * tr0/tr7 in use. Others give a chance to loop somewhere safe - */ - movi start_kernel, r32 - ori r32, 1, r32 - - ptabs r32, tr0 /* r32 = _start_kernel address */ - pta/u hopeless, tr1 - pta/u hopeless, tr2 - pta/u hopeless, tr3 - pta/u hopeless, tr4 - pta/u hopeless, tr5 - pta/u hopeless, tr6 - pta/u hopeless, tr7 - gettr tr1, r28 /* r28 = hopeless address */ - - /* Set initial stack pointer */ - movi init_thread_union, SP - putcon SP, KCR0 /* Set current to init_task */ - movi THREAD_SIZE, r22 /* Point to the end */ - add SP, r22, SP - - /* - * Initialize FPU. - * Keep FPU flag in r31. After this block: - * r31 = FPU flag - */ - movi fpu_in_use, r31 /* Temporary */ - -#ifdef CONFIG_SH_FPU - getcon SR, r21 - movi SR_ENABLE_FPU, r22 - and r21, r22, r22 - putcon r22, SR /* Try to enable */ - getcon SR, r22 - xor r21, r22, r21 - shlri r21, 15, r21 /* Supposedly 0/1 */ - st.q r31, 0 , r21 /* Set fpu_in_use */ -#else - movi 0, r21 - st.q r31, 0 , r21 /* Set fpu_in_use */ -#endif - or r21, ZERO, r31 /* Set FPU flag at last */ - -#ifndef CONFIG_SH_NO_BSS_INIT -/* Don't clear BSS if running on slow platforms such as an RTL simulation, - remote memory via SHdebug link, etc. For these the memory can be guaranteed - to be all zero on boot anyway. */ - /* - * Clear bss - */ - pta clear_quad, tr1 - movi __bss_start, r22 - movi _end, r23 -clear_quad: - st.q r22, 0, ZERO - addi r22, 8, r22 - bne r22, r23, tr1 /* Both quad aligned, see vmlinux.lds.S */ -#endif - pta/u hopeless, tr1 - - /* Say bye to head.S but be prepared to wrongly get back ... 
*/ - blink tr0, LINK - - /* If we ever get back here through LINK/tr1-tr7 */ - pta/u hopeless, tr7 - -hopeless: - /* - * Something's badly wrong here. Loop endlessly, - * there's nothing more we can do about it. - * - * Note on hopeless: it can be jumped into invariably - * before or after jumping into hyperspace. The only - * requirement is to be PIC called (PTA) before and - * any way (PTA/PTABS) after. According to Virtual - * to Physical mapping a simulator/emulator can easily - * tell where we came here from just looking at hopeless - * (PC) address. - * - * For debugging purposes: - * (r28) hopeless/loop address - * (r29) Original SR - * (r30) CPU type/Platform endianness - * (r31) FPU Support - * (r32) _start_kernel address - */ - blink tr7, ZERO diff --git a/arch/sh/kernel/irq_64.c b/arch/sh/kernel/irq_64.c deleted file mode 100644 index 7a1f50435e33..000000000000 --- a/arch/sh/kernel/irq_64.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * SHmedia irqflags support - * - * Copyright (C) 2006 - 2009 Paul Mundt - */ -#include -#include -#include - -void notrace arch_local_irq_restore(unsigned long flags) -{ - unsigned long long __dummy; - - if (flags == ARCH_IRQ_DISABLED) { - __asm__ __volatile__ ( - "getcon " __SR ", %0\n\t" - "or %0, %1, %0\n\t" - "putcon %0, " __SR "\n\t" - : "=&r" (__dummy) - : "r" (ARCH_IRQ_DISABLED) - ); - } else { - __asm__ __volatile__ ( - "getcon " __SR ", %0\n\t" - "and %0, %1, %0\n\t" - "putcon %0, " __SR "\n\t" - : "=&r" (__dummy) - : "r" (~ARCH_IRQ_DISABLED) - ); - } -} -EXPORT_SYMBOL(arch_local_irq_restore); - -unsigned long notrace arch_local_save_flags(void) -{ - unsigned long flags; - - __asm__ __volatile__ ( - "getcon " __SR ", %0\n\t" - "and %0, %1, %0" - : "=&r" (flags) - : "r" (ARCH_IRQ_DISABLED) - ); - - return flags; -} -EXPORT_SYMBOL(arch_local_save_flags); diff --git a/arch/sh/kernel/module.c b/arch/sh/kernel/module.c index bbc78d1d618e..b9cee98a754e 100644 --- a/arch/sh/kernel/module.c +++ 
b/arch/sh/kernel/module.c @@ -46,15 +46,6 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, + ELF32_R_SYM(rel[i].r_info); relocation = sym->st_value + rel[i].r_addend; -#ifdef CONFIG_SUPERH64 - /* For text addresses, bit2 of the st_other field indicates - * whether the symbol is SHmedia (1) or SHcompact (0). If - * SHmedia, the LSB of the symbol needs to be asserted - * for the CPU to be in SHmedia mode when it starts executing - * the branch target. */ - relocation |= !!(sym->st_other & 4); -#endif - switch (ELF32_R_TYPE(rel[i].r_info)) { case R_SH_NONE: break; diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index 4d1bfc848dd3..169832fcf21b 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -23,9 +23,7 @@ EXPORT_SYMBOL(__stack_chk_guard); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { -#ifdef CONFIG_SUPERH32 unlazy_fpu(src, task_pt_regs(src)); -#endif *dst = *src; if (src->thread.xstate) { diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c deleted file mode 100644 index c2844a2e18cd..000000000000 --- a/arch/sh/kernel/process_64.c +++ /dev/null @@ -1,461 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/sh/kernel/process_64.c - * - * This file handles the architecture-dependent parts of process handling.. 
- * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 - 2007 Paul Mundt - * Copyright (C) 2003, 2004 Richard Curnow - * - * Started from SH3/4 version: - * Copyright (C) 1999, 2000 Niibe Yutaka & Kaz Kojima - * - * In turn started from i386 version: - * Copyright (C) 1995 Linus Torvalds - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct task_struct *last_task_used_math = NULL; -struct pt_regs fake_swapper_regs = { 0, }; - -void show_regs(struct pt_regs *regs) -{ - unsigned long long ah, al, bh, bl, ch, cl; - - printk("\n"); - show_regs_print_info(KERN_DEFAULT); - - ah = (regs->pc) >> 32; - al = (regs->pc) & 0xffffffff; - bh = (regs->regs[18]) >> 32; - bl = (regs->regs[18]) & 0xffffffff; - ch = (regs->regs[15]) >> 32; - cl = (regs->regs[15]) & 0xffffffff; - printk("PC : %08Lx%08Lx LINK: %08Lx%08Lx SP : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->sr) >> 32; - al = (regs->sr) & 0xffffffff; - asm volatile ("getcon " __TEA ", %0" : "=r" (bh)); - asm volatile ("getcon " __TEA ", %0" : "=r" (bl)); - bh = (bh) >> 32; - bl = (bl) & 0xffffffff; - asm volatile ("getcon " __KCR0 ", %0" : "=r" (ch)); - asm volatile ("getcon " __KCR0 ", %0" : "=r" (cl)); - ch = (ch) >> 32; - cl = (cl) & 0xffffffff; - printk("SR : %08Lx%08Lx TEA : %08Lx%08Lx KCR0: %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[0]) >> 32; - al = (regs->regs[0]) & 0xffffffff; - bh = (regs->regs[1]) >> 32; - bl = (regs->regs[1]) & 0xffffffff; - ch = (regs->regs[2]) >> 32; - cl = (regs->regs[2]) & 0xffffffff; - printk("R0 : %08Lx%08Lx R1 : %08Lx%08Lx R2 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[3]) >> 32; - al = (regs->regs[3]) & 0xffffffff; - bh = (regs->regs[4]) >> 32; - bl = (regs->regs[4]) & 0xffffffff; - ch = (regs->regs[5]) >> 32; - cl = (regs->regs[5]) & 0xffffffff; - printk("R3 : %08Lx%08Lx R4 : %08Lx%08Lx 
R5 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[6]) >> 32; - al = (regs->regs[6]) & 0xffffffff; - bh = (regs->regs[7]) >> 32; - bl = (regs->regs[7]) & 0xffffffff; - ch = (regs->regs[8]) >> 32; - cl = (regs->regs[8]) & 0xffffffff; - printk("R6 : %08Lx%08Lx R7 : %08Lx%08Lx R8 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[9]) >> 32; - al = (regs->regs[9]) & 0xffffffff; - bh = (regs->regs[10]) >> 32; - bl = (regs->regs[10]) & 0xffffffff; - ch = (regs->regs[11]) >> 32; - cl = (regs->regs[11]) & 0xffffffff; - printk("R9 : %08Lx%08Lx R10 : %08Lx%08Lx R11 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[12]) >> 32; - al = (regs->regs[12]) & 0xffffffff; - bh = (regs->regs[13]) >> 32; - bl = (regs->regs[13]) & 0xffffffff; - ch = (regs->regs[14]) >> 32; - cl = (regs->regs[14]) & 0xffffffff; - printk("R12 : %08Lx%08Lx R13 : %08Lx%08Lx R14 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[16]) >> 32; - al = (regs->regs[16]) & 0xffffffff; - bh = (regs->regs[17]) >> 32; - bl = (regs->regs[17]) & 0xffffffff; - ch = (regs->regs[19]) >> 32; - cl = (regs->regs[19]) & 0xffffffff; - printk("R16 : %08Lx%08Lx R17 : %08Lx%08Lx R19 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[20]) >> 32; - al = (regs->regs[20]) & 0xffffffff; - bh = (regs->regs[21]) >> 32; - bl = (regs->regs[21]) & 0xffffffff; - ch = (regs->regs[22]) >> 32; - cl = (regs->regs[22]) & 0xffffffff; - printk("R20 : %08Lx%08Lx R21 : %08Lx%08Lx R22 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[23]) >> 32; - al = (regs->regs[23]) & 0xffffffff; - bh = (regs->regs[24]) >> 32; - bl = (regs->regs[24]) & 0xffffffff; - ch = (regs->regs[25]) >> 32; - cl = (regs->regs[25]) & 0xffffffff; - printk("R23 : %08Lx%08Lx R24 : %08Lx%08Lx R25 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[26]) >> 32; - al = (regs->regs[26]) & 0xffffffff; - bh = (regs->regs[27]) >> 32; - bl = (regs->regs[27]) & 0xffffffff; - ch = 
(regs->regs[28]) >> 32; - cl = (regs->regs[28]) & 0xffffffff; - printk("R26 : %08Lx%08Lx R27 : %08Lx%08Lx R28 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[29]) >> 32; - al = (regs->regs[29]) & 0xffffffff; - bh = (regs->regs[30]) >> 32; - bl = (regs->regs[30]) & 0xffffffff; - ch = (regs->regs[31]) >> 32; - cl = (regs->regs[31]) & 0xffffffff; - printk("R29 : %08Lx%08Lx R30 : %08Lx%08Lx R31 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[32]) >> 32; - al = (regs->regs[32]) & 0xffffffff; - bh = (regs->regs[33]) >> 32; - bl = (regs->regs[33]) & 0xffffffff; - ch = (regs->regs[34]) >> 32; - cl = (regs->regs[34]) & 0xffffffff; - printk("R32 : %08Lx%08Lx R33 : %08Lx%08Lx R34 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[35]) >> 32; - al = (regs->regs[35]) & 0xffffffff; - bh = (regs->regs[36]) >> 32; - bl = (regs->regs[36]) & 0xffffffff; - ch = (regs->regs[37]) >> 32; - cl = (regs->regs[37]) & 0xffffffff; - printk("R35 : %08Lx%08Lx R36 : %08Lx%08Lx R37 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[38]) >> 32; - al = (regs->regs[38]) & 0xffffffff; - bh = (regs->regs[39]) >> 32; - bl = (regs->regs[39]) & 0xffffffff; - ch = (regs->regs[40]) >> 32; - cl = (regs->regs[40]) & 0xffffffff; - printk("R38 : %08Lx%08Lx R39 : %08Lx%08Lx R40 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[41]) >> 32; - al = (regs->regs[41]) & 0xffffffff; - bh = (regs->regs[42]) >> 32; - bl = (regs->regs[42]) & 0xffffffff; - ch = (regs->regs[43]) >> 32; - cl = (regs->regs[43]) & 0xffffffff; - printk("R41 : %08Lx%08Lx R42 : %08Lx%08Lx R43 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[44]) >> 32; - al = (regs->regs[44]) & 0xffffffff; - bh = (regs->regs[45]) >> 32; - bl = (regs->regs[45]) & 0xffffffff; - ch = (regs->regs[46]) >> 32; - cl = (regs->regs[46]) & 0xffffffff; - printk("R44 : %08Lx%08Lx R45 : %08Lx%08Lx R46 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[47]) >> 32; - al = 
(regs->regs[47]) & 0xffffffff; - bh = (regs->regs[48]) >> 32; - bl = (regs->regs[48]) & 0xffffffff; - ch = (regs->regs[49]) >> 32; - cl = (regs->regs[49]) & 0xffffffff; - printk("R47 : %08Lx%08Lx R48 : %08Lx%08Lx R49 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[50]) >> 32; - al = (regs->regs[50]) & 0xffffffff; - bh = (regs->regs[51]) >> 32; - bl = (regs->regs[51]) & 0xffffffff; - ch = (regs->regs[52]) >> 32; - cl = (regs->regs[52]) & 0xffffffff; - printk("R50 : %08Lx%08Lx R51 : %08Lx%08Lx R52 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[53]) >> 32; - al = (regs->regs[53]) & 0xffffffff; - bh = (regs->regs[54]) >> 32; - bl = (regs->regs[54]) & 0xffffffff; - ch = (regs->regs[55]) >> 32; - cl = (regs->regs[55]) & 0xffffffff; - printk("R53 : %08Lx%08Lx R54 : %08Lx%08Lx R55 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[56]) >> 32; - al = (regs->regs[56]) & 0xffffffff; - bh = (regs->regs[57]) >> 32; - bl = (regs->regs[57]) & 0xffffffff; - ch = (regs->regs[58]) >> 32; - cl = (regs->regs[58]) & 0xffffffff; - printk("R56 : %08Lx%08Lx R57 : %08Lx%08Lx R58 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[59]) >> 32; - al = (regs->regs[59]) & 0xffffffff; - bh = (regs->regs[60]) >> 32; - bl = (regs->regs[60]) & 0xffffffff; - ch = (regs->regs[61]) >> 32; - cl = (regs->regs[61]) & 0xffffffff; - printk("R59 : %08Lx%08Lx R60 : %08Lx%08Lx R61 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[62]) >> 32; - al = (regs->regs[62]) & 0xffffffff; - bh = (regs->tregs[0]) >> 32; - bl = (regs->tregs[0]) & 0xffffffff; - ch = (regs->tregs[1]) >> 32; - cl = (regs->tregs[1]) & 0xffffffff; - printk("R62 : %08Lx%08Lx T0 : %08Lx%08Lx T1 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->tregs[2]) >> 32; - al = (regs->tregs[2]) & 0xffffffff; - bh = (regs->tregs[3]) >> 32; - bl = (regs->tregs[3]) & 0xffffffff; - ch = (regs->tregs[4]) >> 32; - cl = (regs->tregs[4]) & 0xffffffff; - printk("T2 : %08Lx%08Lx 
T3 : %08Lx%08Lx T4 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->tregs[5]) >> 32; - al = (regs->tregs[5]) & 0xffffffff; - bh = (regs->tregs[6]) >> 32; - bl = (regs->tregs[6]) & 0xffffffff; - ch = (regs->tregs[7]) >> 32; - cl = (regs->tregs[7]) & 0xffffffff; - printk("T5 : %08Lx%08Lx T6 : %08Lx%08Lx T7 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - /* - * If we're in kernel mode, dump the stack too.. - */ - if (!user_mode(regs)) { - void show_stack(struct task_struct *tsk, unsigned long *sp); - unsigned long sp = regs->regs[15] & 0xffffffff; - struct task_struct *tsk = get_current(); - - tsk->thread.kregs = regs; - - show_stack(tsk, (unsigned long *)sp); - } -} - -/* - * Free current thread data structures etc.. - */ -void exit_thread(struct task_struct *tsk) -{ - /* - * See arch/sparc/kernel/process.c for the precedent for doing - * this -- RPC. - * - * The SH-5 FPU save/restore approach relies on - * last_task_used_math pointing to a live task_struct. When - * another task tries to use the FPU for the 1st time, the FPUDIS - * trap handling (see arch/sh/kernel/cpu/sh5/fpu.c) will save the - * existing FPU state to the FP regs field within - * last_task_used_math before re-loading the new task's FPU state - * (or initialising it if the FPU has been used before). So if - * last_task_used_math is stale, and its page has already been - * re-allocated for another use, the consequences are rather - * grim. Unless we null it here, there is no other path through - * which it would get safely nulled. - */ -#ifdef CONFIG_SH_FPU - if (last_task_used_math == tsk) - last_task_used_math = NULL; -#endif -} - -void flush_thread(void) -{ - - /* Called by fs/exec.c (setup_new_exec) to remove traces of a - * previously running executable. 
*/ -#ifdef CONFIG_SH_FPU - if (last_task_used_math == current) { - last_task_used_math = NULL; - } - /* Force FPU state to be reinitialised after exec */ - clear_used_math(); -#endif - - /* if we are a kernel thread, about to change to user thread, - * update kreg - */ - if(current->thread.kregs==&fake_swapper_regs) { - current->thread.kregs = - ((struct pt_regs *)(THREAD_SIZE + (unsigned long) current) - 1); - current->thread.uregs = current->thread.kregs; - } -} - -void release_thread(struct task_struct *dead_task) -{ - /* do nothing */ -} - -/* Fill in the fpu structure for a core dump.. */ -int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) -{ -#ifdef CONFIG_SH_FPU - int fpvalid; - struct task_struct *tsk = current; - - fpvalid = !!tsk_used_math(tsk); - if (fpvalid) { - if (current == last_task_used_math) { - enable_fpu(); - save_fpu(tsk); - disable_fpu(); - last_task_used_math = 0; - regs->sr |= SR_FD; - } - - memcpy(fpu, &tsk->thread.xstate->hardfpu, sizeof(*fpu)); - } - - return fpvalid; -#else - return 0; /* Task didn't use the fpu at all. 
*/ -#endif -} -EXPORT_SYMBOL(dump_fpu); - -asmlinkage void ret_from_fork(void); -asmlinkage void ret_from_kernel_thread(void); - -int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p) -{ - struct pt_regs *childregs; - -#ifdef CONFIG_SH_FPU - /* can't happen for a kernel thread */ - if (last_task_used_math == current) { - enable_fpu(); - save_fpu(current); - disable_fpu(); - last_task_used_math = NULL; - current_pt_regs()->sr |= SR_FD; - } -#endif - /* Copy from sh version */ - childregs = (struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1; - p->thread.sp = (unsigned long) childregs; - - if (unlikely(p->flags & PF_KTHREAD)) { - memset(childregs, 0, sizeof(struct pt_regs)); - childregs->regs[2] = (unsigned long)arg; - childregs->regs[3] = (unsigned long)usp; - childregs->sr = (1 << 30); /* not user_mode */ - childregs->sr |= SR_FD; /* Invalidate FPU flag */ - p->thread.pc = (unsigned long) ret_from_kernel_thread; - return 0; - } - *childregs = *current_pt_regs(); - - /* - * Sign extend the edited stack. - * Note that thread.pc and thread.pc will stay - * 32-bit wide and context switch must take care - * of NEFF sign extension. 
- */ - if (usp) - childregs->regs[15] = neff_sign_extend(usp); - p->thread.uregs = childregs; - - childregs->regs[9] = 0; /* Set return value for child */ - childregs->sr |= SR_FD; /* Invalidate FPU flag */ - - p->thread.pc = (unsigned long) ret_from_fork; - - return 0; -} - -#ifdef CONFIG_FRAME_POINTER -static int in_sh64_switch_to(unsigned long pc) -{ - extern char __sh64_switch_to_end; - /* For a sleeping task, the PC is somewhere in the middle of the function, - so we don't have to worry about masking the LSB off */ - return (pc >= (unsigned long) sh64_switch_to) && - (pc < (unsigned long) &__sh64_switch_to_end); -} -#endif - -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long pc; - - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - - /* - * The same comment as on the Alpha applies here, too ... - */ - pc = thread_saved_pc(p); - -#ifdef CONFIG_FRAME_POINTER - if (in_sh64_switch_to(pc)) { - unsigned long schedule_fp; - unsigned long sh64_switch_to_fp; - unsigned long schedule_caller_pc; - - sh64_switch_to_fp = (long) p->thread.sp; - /* r14 is saved at offset 4 in the sh64_switch_to frame */ - schedule_fp = *(unsigned long *) (long)(sh64_switch_to_fp + 4); - - /* and the caller of 'schedule' is (currently!) 
saved at offset 24 - in the frame of schedule (from disasm) */ - schedule_caller_pc = *(unsigned long *) (long)(schedule_fp + 24); - return schedule_caller_pc; - } -#endif - return pc; -} diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c deleted file mode 100644 index 11085e48eaa6..000000000000 --- a/arch/sh/kernel/ptrace_64.c +++ /dev/null @@ -1,576 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/sh/kernel/ptrace_64.c - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 - 2008 Paul Mundt - * - * Started from SH3/4 version: - * SuperH version: Copyright (C) 1999, 2000 Kaz Kojima & Niibe Yutaka - * - * Original x86 implementation: - * By Ross Biro 1/23/92 - * edited by Linus Torvalds - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -/* This mask defines the bits of the SR which the user is not allowed to - change, which are everything except S, Q, M, PR, SZ, FR. */ -#define SR_MASK (0xffff8cfd) - -/* - * does not yet catch signals sent when the child dies. - * in exit.c or in signal.c. - */ - -/* - * This routine will get a word from the user area in the process kernel stack. 
- */ -static inline int get_stack_long(struct task_struct *task, int offset) -{ - unsigned char *stack; - - stack = (unsigned char *)(task->thread.uregs); - stack += offset; - return (*((int *)stack)); -} - -static inline unsigned long -get_fpu_long(struct task_struct *task, unsigned long addr) -{ - unsigned long tmp; - struct pt_regs *regs; - regs = (struct pt_regs*)((unsigned char *)task + THREAD_SIZE) - 1; - - if (!tsk_used_math(task)) { - if (addr == offsetof(struct user_fpu_struct, fpscr)) { - tmp = FPSCR_INIT; - } else { - tmp = 0xffffffffUL; /* matches initial value in fpu.c */ - } - return tmp; - } - - if (last_task_used_math == task) { - enable_fpu(); - save_fpu(task); - disable_fpu(); - last_task_used_math = 0; - regs->sr |= SR_FD; - } - - tmp = ((long *)task->thread.xstate)[addr / sizeof(unsigned long)]; - return tmp; -} - -/* - * This routine will put a word into the user area in the process kernel stack. - */ -static inline int put_stack_long(struct task_struct *task, int offset, - unsigned long data) -{ - unsigned char *stack; - - stack = (unsigned char *)(task->thread.uregs); - stack += offset; - *(unsigned long *) stack = data; - return 0; -} - -static inline int -put_fpu_long(struct task_struct *task, unsigned long addr, unsigned long data) -{ - struct pt_regs *regs; - - regs = (struct pt_regs*)((unsigned char *)task + THREAD_SIZE) - 1; - - if (!tsk_used_math(task)) { - init_fpu(task); - } else if (last_task_used_math == task) { - enable_fpu(); - save_fpu(task); - disable_fpu(); - last_task_used_math = 0; - regs->sr |= SR_FD; - } - - ((long *)task->thread.xstate)[addr / sizeof(unsigned long)] = data; - return 0; -} - -void user_enable_single_step(struct task_struct *child) -{ - struct pt_regs *regs = child->thread.uregs; - - regs->sr |= SR_SSTEP; /* auto-resetting upon exception */ - - set_tsk_thread_flag(child, TIF_SINGLESTEP); -} - -void user_disable_single_step(struct task_struct *child) -{ - struct pt_regs *regs = child->thread.uregs; - - 
regs->sr &= ~SR_SSTEP; - - clear_tsk_thread_flag(child, TIF_SINGLESTEP); -} - -static int genregs_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - const struct pt_regs *regs = task_pt_regs(target); - int ret; - - /* PC, SR, SYSCALL */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - ®s->pc, - 0, 3 * sizeof(unsigned long long)); - - /* R1 -> R63 */ - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - regs->regs, - offsetof(struct pt_regs, regs[0]), - 63 * sizeof(unsigned long long)); - /* TR0 -> TR7 */ - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - regs->tregs, - offsetof(struct pt_regs, tregs[0]), - 8 * sizeof(unsigned long long)); - - if (!ret) - ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - sizeof(struct pt_regs), -1); - - return ret; -} - -static int genregs_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - struct pt_regs *regs = task_pt_regs(target); - int ret; - - /* PC, SR, SYSCALL */ - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - ®s->pc, - 0, 3 * sizeof(unsigned long long)); - - /* R1 -> R63 */ - if (!ret && count > 0) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - regs->regs, - offsetof(struct pt_regs, regs[0]), - 63 * sizeof(unsigned long long)); - - /* TR0 -> TR7 */ - if (!ret && count > 0) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - regs->tregs, - offsetof(struct pt_regs, tregs[0]), - 8 * sizeof(unsigned long long)); - - if (!ret) - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - sizeof(struct pt_regs), -1); - - return ret; -} - -#ifdef CONFIG_SH_FPU -int fpregs_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - ret = init_fpu(target); - 
if (ret) - return ret; - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->hardfpu, 0, -1); -} - -static int fpregs_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - - ret = init_fpu(target); - if (ret) - return ret; - - set_stopped_child_used_math(target); - - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->hardfpu, 0, -1); -} - -static int fpregs_active(struct task_struct *target, - const struct user_regset *regset) -{ - return tsk_used_math(target) ? regset->n : 0; -} -#endif - -const struct pt_regs_offset regoffset_table[] = { - REG_OFFSET_NAME(pc), - REG_OFFSET_NAME(sr), - REG_OFFSET_NAME(syscall_nr), - REGS_OFFSET_NAME(0), - REGS_OFFSET_NAME(1), - REGS_OFFSET_NAME(2), - REGS_OFFSET_NAME(3), - REGS_OFFSET_NAME(4), - REGS_OFFSET_NAME(5), - REGS_OFFSET_NAME(6), - REGS_OFFSET_NAME(7), - REGS_OFFSET_NAME(8), - REGS_OFFSET_NAME(9), - REGS_OFFSET_NAME(10), - REGS_OFFSET_NAME(11), - REGS_OFFSET_NAME(12), - REGS_OFFSET_NAME(13), - REGS_OFFSET_NAME(14), - REGS_OFFSET_NAME(15), - REGS_OFFSET_NAME(16), - REGS_OFFSET_NAME(17), - REGS_OFFSET_NAME(18), - REGS_OFFSET_NAME(19), - REGS_OFFSET_NAME(20), - REGS_OFFSET_NAME(21), - REGS_OFFSET_NAME(22), - REGS_OFFSET_NAME(23), - REGS_OFFSET_NAME(24), - REGS_OFFSET_NAME(25), - REGS_OFFSET_NAME(26), - REGS_OFFSET_NAME(27), - REGS_OFFSET_NAME(28), - REGS_OFFSET_NAME(29), - REGS_OFFSET_NAME(30), - REGS_OFFSET_NAME(31), - REGS_OFFSET_NAME(32), - REGS_OFFSET_NAME(33), - REGS_OFFSET_NAME(34), - REGS_OFFSET_NAME(35), - REGS_OFFSET_NAME(36), - REGS_OFFSET_NAME(37), - REGS_OFFSET_NAME(38), - REGS_OFFSET_NAME(39), - REGS_OFFSET_NAME(40), - REGS_OFFSET_NAME(41), - REGS_OFFSET_NAME(42), - REGS_OFFSET_NAME(43), - REGS_OFFSET_NAME(44), - REGS_OFFSET_NAME(45), - REGS_OFFSET_NAME(46), - REGS_OFFSET_NAME(47), - REGS_OFFSET_NAME(48), - REGS_OFFSET_NAME(49), - 
REGS_OFFSET_NAME(50), - REGS_OFFSET_NAME(51), - REGS_OFFSET_NAME(52), - REGS_OFFSET_NAME(53), - REGS_OFFSET_NAME(54), - REGS_OFFSET_NAME(55), - REGS_OFFSET_NAME(56), - REGS_OFFSET_NAME(57), - REGS_OFFSET_NAME(58), - REGS_OFFSET_NAME(59), - REGS_OFFSET_NAME(60), - REGS_OFFSET_NAME(61), - REGS_OFFSET_NAME(62), - REGS_OFFSET_NAME(63), - TREGS_OFFSET_NAME(0), - TREGS_OFFSET_NAME(1), - TREGS_OFFSET_NAME(2), - TREGS_OFFSET_NAME(3), - TREGS_OFFSET_NAME(4), - TREGS_OFFSET_NAME(5), - TREGS_OFFSET_NAME(6), - TREGS_OFFSET_NAME(7), - REG_OFFSET_END, -}; - -/* - * These are our native regset flavours. - */ -enum sh_regset { - REGSET_GENERAL, -#ifdef CONFIG_SH_FPU - REGSET_FPU, -#endif -}; - -static const struct user_regset sh_regsets[] = { - /* - * Format is: - * PC, SR, SYSCALL, - * R1 --> R63, - * TR0 --> TR7, - */ - [REGSET_GENERAL] = { - .core_note_type = NT_PRSTATUS, - .n = ELF_NGREG, - .size = sizeof(long long), - .align = sizeof(long long), - .get = genregs_get, - .set = genregs_set, - }, - -#ifdef CONFIG_SH_FPU - [REGSET_FPU] = { - .core_note_type = NT_PRFPREG, - .n = sizeof(struct user_fpu_struct) / - sizeof(long long), - .size = sizeof(long long), - .align = sizeof(long long), - .get = fpregs_get, - .set = fpregs_set, - .active = fpregs_active, - }, -#endif -}; - -static const struct user_regset_view user_sh64_native_view = { - .name = "sh64", - .e_machine = EM_SH, - .regsets = sh_regsets, - .n = ARRAY_SIZE(sh_regsets), -}; - -const struct user_regset_view *task_user_regset_view(struct task_struct *task) -{ - return &user_sh64_native_view; -} - -long arch_ptrace(struct task_struct *child, long request, - unsigned long addr, unsigned long data) -{ - int ret; - unsigned long __user *datap = (unsigned long __user *) data; - - switch (request) { - /* read the word at location addr in the USER area. 
*/ - case PTRACE_PEEKUSR: { - unsigned long tmp; - - ret = -EIO; - if ((addr & 3) || addr < 0) - break; - - if (addr < sizeof(struct pt_regs)) - tmp = get_stack_long(child, addr); - else if ((addr >= offsetof(struct user, fpu)) && - (addr < offsetof(struct user, u_fpvalid))) { - unsigned long index; - ret = init_fpu(child); - if (ret) - break; - index = addr - offsetof(struct user, fpu); - tmp = get_fpu_long(child, index); - } else if (addr == offsetof(struct user, u_fpvalid)) { - tmp = !!tsk_used_math(child); - } else { - break; - } - ret = put_user(tmp, datap); - break; - } - - case PTRACE_POKEUSR: - /* write the word at location addr in the USER area. We must - disallow any changes to certain SR bits or u_fpvalid, since - this could crash the kernel or result in a security - loophole. */ - ret = -EIO; - if ((addr & 3) || addr < 0) - break; - - if (addr < sizeof(struct pt_regs)) { - /* Ignore change of top 32 bits of SR */ - if (addr == offsetof (struct pt_regs, sr)+4) - { - ret = 0; - break; - } - /* If lower 32 bits of SR, ignore non-user bits */ - if (addr == offsetof (struct pt_regs, sr)) - { - long cursr = get_stack_long(child, addr); - data &= ~(SR_MASK); - data |= (cursr & SR_MASK); - } - ret = put_stack_long(child, addr, data); - } - else if ((addr >= offsetof(struct user, fpu)) && - (addr < offsetof(struct user, u_fpvalid))) { - unsigned long index; - ret = init_fpu(child); - if (ret) - break; - index = addr - offsetof(struct user, fpu); - ret = put_fpu_long(child, index, data); - } - break; - - case PTRACE_GETREGS: - return copy_regset_to_user(child, &user_sh64_native_view, - REGSET_GENERAL, - 0, sizeof(struct pt_regs), - datap); - case PTRACE_SETREGS: - return copy_regset_from_user(child, &user_sh64_native_view, - REGSET_GENERAL, - 0, sizeof(struct pt_regs), - datap); -#ifdef CONFIG_SH_FPU - case PTRACE_GETFPREGS: - return copy_regset_to_user(child, &user_sh64_native_view, - REGSET_FPU, - 0, sizeof(struct user_fpu_struct), - datap); - case 
PTRACE_SETFPREGS: - return copy_regset_from_user(child, &user_sh64_native_view, - REGSET_FPU, - 0, sizeof(struct user_fpu_struct), - datap); -#endif - default: - ret = ptrace_request(child, request, addr, data); - break; - } - - return ret; -} - -asmlinkage int sh64_ptrace(long request, long pid, - unsigned long addr, unsigned long data) -{ -#define WPC_DBRMODE 0x0d104008 - static unsigned long first_call; - - if (!test_and_set_bit(0, &first_call)) { - /* Set WPC.DBRMODE to 0. This makes all debug events get - * delivered through RESVEC, i.e. into the handlers in entry.S. - * (If the kernel was downloaded using a remote gdb, WPC.DBRMODE - * would normally be left set to 1, which makes debug events get - * delivered through DBRVEC, i.e. into the remote gdb's - * handlers. This prevents ptrace getting them, and confuses - * the remote gdb.) */ - printk("DBRMODE set to 0 to permit native debugging\n"); - poke_real_address_q(WPC_DBRMODE, 0); - } - - return sys_ptrace(request, pid, addr, data); -} - -asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs) -{ - long long ret = 0; - - secure_computing_strict(regs->regs[9]); - - if (test_thread_flag(TIF_SYSCALL_TRACE) && - tracehook_report_syscall_entry(regs)) - /* - * Tracing decided this syscall should not happen. - * We'll return a bogus call number to get an ENOSYS - * error, but leave the original number in regs->regs[0]. 
- */ - ret = -1LL; - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_enter(regs, regs->regs[9]); - - audit_syscall_entry(regs->regs[1], regs->regs[2], regs->regs[3], - regs->regs[4], regs->regs[5]); - - return ret ?: regs->regs[9]; -} - -asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) -{ - int step; - - audit_syscall_exit(regs); - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_exit(regs, regs->regs[9]); - - step = test_thread_flag(TIF_SINGLESTEP); - if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); -} - -/* Called with interrupts disabled */ -asmlinkage void do_single_step(unsigned long long vec, struct pt_regs *regs) -{ - /* This is called after a single step exception (DEBUGSS). - There is no need to change the PC, as it is a post-execution - exception, as entry.S does not do anything to the PC for DEBUGSS. - We need to clear the Single Step setting in SR to avoid - continually stepping. */ - local_irq_enable(); - regs->sr &= ~SR_SSTEP; - force_sig(SIGTRAP); -} - -/* Called with interrupts disabled */ -BUILD_TRAP_HANDLER(breakpoint) -{ - TRAP_HANDLER_DECL; - - /* We need to forward step the PC, to counteract the backstep done - in signal.c. */ - local_irq_enable(); - force_sig(SIGTRAP); - regs->pc += 4; -} - -/* - * Called by kernel/ptrace.c when detaching.. - * - * Make sure single step bits etc are not set. 
- */ -void ptrace_disable(struct task_struct *child) -{ - user_disable_single_step(child); -} diff --git a/arch/sh/kernel/reboot.c b/arch/sh/kernel/reboot.c index 11001a8a5fe0..5c33f036418b 100644 --- a/arch/sh/kernel/reboot.c +++ b/arch/sh/kernel/reboot.c @@ -4,9 +4,7 @@ #include #include #include -#ifdef CONFIG_SUPERH32 #include -#endif #include #include #include @@ -15,13 +13,11 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -#ifdef CONFIG_SUPERH32 static void watchdog_trigger_immediate(void) { sh_wdt_write_cnt(0xFF); sh_wdt_write_csr(0xC2); } -#endif static void native_machine_restart(char * __unused) { @@ -33,10 +29,8 @@ static void native_machine_restart(char * __unused) /* Address error with SR.BL=1 first. */ trigger_address_error(); -#ifdef CONFIG_SUPERH32 /* If that fails or is unsupported, go for the watchdog next. */ watchdog_trigger_immediate(); -#endif /* * Give up and sleep. diff --git a/arch/sh/kernel/sh_ksyms_64.c b/arch/sh/kernel/sh_ksyms_64.c deleted file mode 100644 index 9de17065afb4..000000000000 --- a/arch/sh/kernel/sh_ksyms_64.c +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/sh/kernel/sh_ksyms_64.c - * - * Copyright (C) 2000, 2001 Paolo Alberelli - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -EXPORT_SYMBOL(__put_user_asm_b); -EXPORT_SYMBOL(__put_user_asm_w); -EXPORT_SYMBOL(__put_user_asm_l); -EXPORT_SYMBOL(__put_user_asm_q); -EXPORT_SYMBOL(__get_user_asm_b); -EXPORT_SYMBOL(__get_user_asm_w); -EXPORT_SYMBOL(__get_user_asm_l); -EXPORT_SYMBOL(__get_user_asm_q); -EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(copy_page); -EXPORT_SYMBOL(__copy_user); -EXPORT_SYMBOL(empty_zero_page); -EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__ndelay); -EXPORT_SYMBOL(__const_udelay); -EXPORT_SYMBOL(strlen); -EXPORT_SYMBOL(strcpy); - -/* Ugh. 
These come in from libgcc.a at link time. */ -#define DECLARE_EXPORT(name) extern void name(void);EXPORT_SYMBOL(name) - -DECLARE_EXPORT(__sdivsi3); -DECLARE_EXPORT(__sdivsi3_1); -DECLARE_EXPORT(__sdivsi3_2); -DECLARE_EXPORT(__udivsi3); -DECLARE_EXPORT(__div_table); diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c deleted file mode 100644 index b9aaa9266b34..000000000000 --- a/arch/sh/kernel/signal_64.c +++ /dev/null @@ -1,567 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/sh/kernel/signal_64.c - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 - 2008 Paul Mundt - * Copyright (C) 2004 Richard Curnow - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define REG_RET 9 -#define REG_ARG1 2 -#define REG_ARG2 3 -#define REG_ARG3 4 -#define REG_SP 15 -#define REG_PR 18 -#define REF_REG_RET regs->regs[REG_RET] -#define REF_REG_SP regs->regs[REG_SP] -#define DEREF_REG_PR regs->regs[REG_PR] - -#define DEBUG_SIG 0 - -static void -handle_signal(struct ksignal *ksig, struct pt_regs *regs); - -static inline void -handle_syscall_restart(struct pt_regs *regs, struct sigaction *sa) -{ - /* If we're not from a syscall, bail out */ - if (regs->syscall_nr < 0) - return; - - /* check for system call restart.. */ - switch (regs->regs[REG_RET]) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - no_system_call_restart: - regs->regs[REG_RET] = -EINTR; - break; - - case -ERESTARTSYS: - if (!(sa->sa_flags & SA_RESTART)) - goto no_system_call_restart; - /* fallthrough */ - case -ERESTARTNOINTR: - /* Decode syscall # */ - regs->regs[REG_RET] = regs->syscall_nr; - regs->pc -= 4; - break; - } -} - -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. 
- * - * Note that we go through the signals twice: once to check the signals that - * the kernel can handle, and then we build all the user-level signal handling - * stack-frames in one go after that. - */ -static void do_signal(struct pt_regs *regs) -{ - struct ksignal ksig; - - /* - * We want the common case to go fast, which - * is why we may in certain cases get here from - * kernel mode. Just return without doing anything - * if so. - */ - if (!user_mode(regs)) - return; - - if (get_signal(&ksig)) { - handle_syscall_restart(regs, &ksig.ka.sa); - - /* Whee! Actually deliver the signal. */ - handle_signal(&ksig, regs); - return; - } - - /* Did we come from a system call? */ - if (regs->syscall_nr >= 0) { - /* Restart the system call - no handlers present */ - switch (regs->regs[REG_RET]) { - case -ERESTARTNOHAND: - case -ERESTARTSYS: - case -ERESTARTNOINTR: - /* Decode Syscall # */ - regs->regs[REG_RET] = regs->syscall_nr; - regs->pc -= 4; - break; - - case -ERESTART_RESTARTBLOCK: - regs->regs[REG_RET] = __NR_restart_syscall; - regs->pc -= 4; - break; - } - } - - /* No signal to deliver -- put the saved sigmask back */ - restore_saved_sigmask(); -} - -/* - * Do a signal return; undo the signal stack. - */ -struct sigframe { - struct sigcontext sc; - unsigned long extramask[_NSIG_WORDS-1]; - long long retcode[2]; -}; - -struct rt_sigframe { - struct siginfo __user *pinfo; - void *puc; - struct siginfo info; - struct ucontext uc; - long long retcode[2]; -}; - -#ifdef CONFIG_SH_FPU -static inline int -restore_sigcontext_fpu(struct pt_regs *regs, struct sigcontext __user *sc) -{ - int err = 0; - int fpvalid; - - err |= __get_user (fpvalid, &sc->sc_fpvalid); - conditional_used_math(fpvalid); - if (! 
fpvalid) - return err; - - if (current == last_task_used_math) { - last_task_used_math = NULL; - regs->sr |= SR_FD; - } - - err |= __copy_from_user(¤t->thread.xstate->hardfpu, &sc->sc_fpregs[0], - (sizeof(long long) * 32) + (sizeof(int) * 1)); - - return err; -} - -static inline int -setup_sigcontext_fpu(struct pt_regs *regs, struct sigcontext __user *sc) -{ - int err = 0; - int fpvalid; - - fpvalid = !!used_math(); - err |= __put_user(fpvalid, &sc->sc_fpvalid); - if (! fpvalid) - return err; - - if (current == last_task_used_math) { - enable_fpu(); - save_fpu(current); - disable_fpu(); - last_task_used_math = NULL; - regs->sr |= SR_FD; - } - - err |= __copy_to_user(&sc->sc_fpregs[0], ¤t->thread.xstate->hardfpu, - (sizeof(long long) * 32) + (sizeof(int) * 1)); - clear_used_math(); - - return err; -} -#else -static inline int -restore_sigcontext_fpu(struct pt_regs *regs, struct sigcontext __user *sc) -{ - return 0; -} -static inline int -setup_sigcontext_fpu(struct pt_regs *regs, struct sigcontext __user *sc) -{ - return 0; -} -#endif - -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, long long *r2_p) -{ - unsigned int err = 0; - unsigned long long current_sr, new_sr; -#define SR_MASK 0xffff8cfd - -#define COPY(x) err |= __get_user(regs->x, &sc->sc_##x) - - COPY(regs[0]); COPY(regs[1]); COPY(regs[2]); COPY(regs[3]); - COPY(regs[4]); COPY(regs[5]); COPY(regs[6]); COPY(regs[7]); - COPY(regs[8]); COPY(regs[9]); COPY(regs[10]); COPY(regs[11]); - COPY(regs[12]); COPY(regs[13]); COPY(regs[14]); COPY(regs[15]); - COPY(regs[16]); COPY(regs[17]); COPY(regs[18]); COPY(regs[19]); - COPY(regs[20]); COPY(regs[21]); COPY(regs[22]); COPY(regs[23]); - COPY(regs[24]); COPY(regs[25]); COPY(regs[26]); COPY(regs[27]); - COPY(regs[28]); COPY(regs[29]); COPY(regs[30]); COPY(regs[31]); - COPY(regs[32]); COPY(regs[33]); COPY(regs[34]); COPY(regs[35]); - COPY(regs[36]); COPY(regs[37]); COPY(regs[38]); COPY(regs[39]); - COPY(regs[40]); COPY(regs[41]); 
COPY(regs[42]); COPY(regs[43]); - COPY(regs[44]); COPY(regs[45]); COPY(regs[46]); COPY(regs[47]); - COPY(regs[48]); COPY(regs[49]); COPY(regs[50]); COPY(regs[51]); - COPY(regs[52]); COPY(regs[53]); COPY(regs[54]); COPY(regs[55]); - COPY(regs[56]); COPY(regs[57]); COPY(regs[58]); COPY(regs[59]); - COPY(regs[60]); COPY(regs[61]); COPY(regs[62]); - COPY(tregs[0]); COPY(tregs[1]); COPY(tregs[2]); COPY(tregs[3]); - COPY(tregs[4]); COPY(tregs[5]); COPY(tregs[6]); COPY(tregs[7]); - - /* Prevent the signal handler manipulating SR in a way that can - crash the kernel. i.e. only allow S, Q, M, PR, SZ, FR to be - modified */ - current_sr = regs->sr; - err |= __get_user(new_sr, &sc->sc_sr); - regs->sr &= SR_MASK; - regs->sr |= (new_sr & ~SR_MASK); - - COPY(pc); - -#undef COPY - - /* Must do this last in case it sets regs->sr.fd (i.e. after rest of sr - * has been restored above.) */ - err |= restore_sigcontext_fpu(regs, sc); - - regs->syscall_nr = -1; /* disable syscall checks */ - err |= __get_user(*r2_p, &sc->sc_regs[REG_RET]); - return err; -} - -asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3, - unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs * regs) -{ - struct sigframe __user *frame = (struct sigframe __user *) (long) REF_REG_SP; - sigset_t set; - long long ret; - - /* Always make any pending restarted system calls return -EINTR */ - current->restart_block.fn = do_no_restart_syscall; - - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - - if (__get_user(set.sig[0], &frame->sc.oldmask) - || (_NSIG_WORDS > 1 - && __copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) - goto badframe; - - set_current_blocked(&set); - - if (restore_sigcontext(regs, &frame->sc, &ret)) - goto badframe; - regs->pc -= 4; - - return (int) ret; - -badframe: - force_sig(SIGSEGV); - return 0; -} - -asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3, - unsigned long r4, unsigned long r5, - 
unsigned long r6, unsigned long r7, - struct pt_regs * regs) -{ - struct rt_sigframe __user *frame = (struct rt_sigframe __user *) (long) REF_REG_SP; - sigset_t set; - long long ret; - - /* Always make any pending restarted system calls return -EINTR */ - current->restart_block.fn = do_no_restart_syscall; - - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) - goto badframe; - - set_current_blocked(&set); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ret)) - goto badframe; - regs->pc -= 4; - - if (restore_altstack(&frame->uc.uc_stack)) - goto badframe; - - return (int) ret; - -badframe: - force_sig(SIGSEGV); - return 0; -} - -/* - * Set up a signal frame. - */ -static int -setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, - unsigned long mask) -{ - int err = 0; - - /* Do this first, otherwise is this sets sr->fd, that value isn't preserved. */ - err |= setup_sigcontext_fpu(regs, sc); - -#define COPY(x) err |= __put_user(regs->x, &sc->sc_##x) - - COPY(regs[0]); COPY(regs[1]); COPY(regs[2]); COPY(regs[3]); - COPY(regs[4]); COPY(regs[5]); COPY(regs[6]); COPY(regs[7]); - COPY(regs[8]); COPY(regs[9]); COPY(regs[10]); COPY(regs[11]); - COPY(regs[12]); COPY(regs[13]); COPY(regs[14]); COPY(regs[15]); - COPY(regs[16]); COPY(regs[17]); COPY(regs[18]); COPY(regs[19]); - COPY(regs[20]); COPY(regs[21]); COPY(regs[22]); COPY(regs[23]); - COPY(regs[24]); COPY(regs[25]); COPY(regs[26]); COPY(regs[27]); - COPY(regs[28]); COPY(regs[29]); COPY(regs[30]); COPY(regs[31]); - COPY(regs[32]); COPY(regs[33]); COPY(regs[34]); COPY(regs[35]); - COPY(regs[36]); COPY(regs[37]); COPY(regs[38]); COPY(regs[39]); - COPY(regs[40]); COPY(regs[41]); COPY(regs[42]); COPY(regs[43]); - COPY(regs[44]); COPY(regs[45]); COPY(regs[46]); COPY(regs[47]); - COPY(regs[48]); COPY(regs[49]); COPY(regs[50]); COPY(regs[51]); - COPY(regs[52]); COPY(regs[53]); COPY(regs[54]); COPY(regs[55]); - COPY(regs[56]); 
COPY(regs[57]); COPY(regs[58]); COPY(regs[59]); - COPY(regs[60]); COPY(regs[61]); COPY(regs[62]); - COPY(tregs[0]); COPY(tregs[1]); COPY(tregs[2]); COPY(tregs[3]); - COPY(tregs[4]); COPY(tregs[5]); COPY(tregs[6]); COPY(tregs[7]); - COPY(sr); COPY(pc); - -#undef COPY - - err |= __put_user(mask, &sc->oldmask); - - return err; -} - -/* - * Determine which stack to use.. - */ -static inline void __user * -get_sigframe(struct k_sigaction *ka, unsigned long sp, size_t frame_size) -{ - if ((ka->sa.sa_flags & SA_ONSTACK) != 0 && ! sas_ss_flags(sp)) - sp = current->sas_ss_sp + current->sas_ss_size; - - return (void __user *)((sp - frame_size) & -8ul); -} - -void sa_default_restorer(void); /* See comments below */ -void sa_default_rt_restorer(void); /* See comments below */ - -static int setup_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) -{ - struct sigframe __user *frame; - int err = 0, sig = ksig->sig; - int signal; - - frame = get_sigframe(&ksig->ka, regs->regs[REG_SP], sizeof(*frame)); - - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; - - err |= setup_sigcontext(&frame->sc, regs, set->sig[0]); - - /* Give up earlier as i386, in case */ - if (err) - return -EFAULT; - - if (_NSIG_WORDS > 1) { - err |= __copy_to_user(frame->extramask, &set->sig[1], - sizeof(frame->extramask)); } - - /* Give up earlier as i386, in case */ - if (err) - return -EFAULT; - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - /* - * On SH5 all edited pointers are subject to NEFF - */ - DEREF_REG_PR = neff_sign_extend((unsigned long) - ksig->ka->sa.sa_restorer | 0x1); - } else { - /* - * Different approach on SH5. - * . Endianness independent asm code gets placed in entry.S . - * This is limited to four ASM instructions corresponding - * to two long longs in size. - * . err checking is done on the else branch only - * . flush_icache_range() is called upon __put_user() only - * . 
all edited pointers are subject to NEFF - * . being code, linker turns ShMedia bit on, always - * dereference index -1. - */ - DEREF_REG_PR = neff_sign_extend((unsigned long) - frame->retcode | 0x01); - - if (__copy_to_user(frame->retcode, - (void *)((unsigned long)sa_default_restorer & (~1)), 16) != 0) - return -EFAULT; - - /* Cohere the trampoline with the I-cache. */ - flush_cache_sigtramp(DEREF_REG_PR-1); - } - - /* - * Set up registers for signal handler. - * All edited pointers are subject to NEFF. - */ - regs->regs[REG_SP] = neff_sign_extend((unsigned long)frame); - regs->regs[REG_ARG1] = sig; /* Arg for signal handler */ - - /* FIXME: - The glibc profiling support for SH-5 needs to be passed a sigcontext - so it can retrieve the PC. At some point during 2003 the glibc - support was changed to receive the sigcontext through the 2nd - argument, but there are still versions of libc.so in use that use - the 3rd argument. Until libc.so is stabilised, pass the sigcontext - through both 2nd and 3rd arguments. 
- */ - - regs->regs[REG_ARG2] = (unsigned long long)(unsigned long)(signed long)&frame->sc; - regs->regs[REG_ARG3] = (unsigned long long)(unsigned long)(signed long)&frame->sc; - - regs->pc = neff_sign_extend((unsigned long)ksig->ka.sa.sa_handler); - - /* Broken %016Lx */ - pr_debug("SIG deliver (#%d,%s:%d): sp=%p pc=%08Lx%08Lx link=%08Lx%08Lx\n", - sig, current->comm, current->pid, frame, - regs->pc >> 32, regs->pc & 0xffffffff, - DEREF_REG_PR >> 32, DEREF_REG_PR & 0xffffffff); - - return 0; -} - -static int setup_rt_frame(struct ksignal *kig, sigset_t *set, - struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - int err = 0, sig = ksig->sig; - - frame = get_sigframe(&ksig->ka, regs->regs[REG_SP], sizeof(*frame)); - - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; - - err |= __put_user(&frame->info, &frame->pinfo); - err |= __put_user(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - - /* Give up earlier as i386, in case */ - if (err) - return -EFAULT; - - /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __save_altstack(&frame->uc.uc_stack, regs->regs[REG_SP]); - err |= setup_sigcontext(&frame->uc.uc_mcontext, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - /* Give up earlier as i386, in case */ - if (err) - return -EFAULT; - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - /* - * On SH5 all edited pointers are subject to NEFF - */ - DEREF_REG_PR = neff_sign_extend((unsigned long) - ksig->ka.sa.sa_restorer | 0x1); - } else { - /* - * Different approach on SH5. - * . Endianness independent asm code gets placed in entry.S . - * This is limited to four ASM instructions corresponding - * to two long longs in size. - * . err checking is done on the else branch only - * . 
flush_icache_range() is called upon __put_user() only - * . all edited pointers are subject to NEFF - * . being code, linker turns ShMedia bit on, always - * dereference index -1. - */ - DEREF_REG_PR = neff_sign_extend((unsigned long) - frame->retcode | 0x01); - - if (__copy_to_user(frame->retcode, - (void *)((unsigned long)sa_default_rt_restorer & (~1)), 16) != 0) - return -EFAULT; - - /* Cohere the trampoline with the I-cache. */ - flush_icache_range(DEREF_REG_PR-1, DEREF_REG_PR-1+15); - } - - /* - * Set up registers for signal handler. - * All edited pointers are subject to NEFF. - */ - regs->regs[REG_SP] = neff_sign_extend((unsigned long)frame); - regs->regs[REG_ARG1] = sig; /* Arg for signal handler */ - regs->regs[REG_ARG2] = (unsigned long long)(unsigned long)(signed long)&frame->info; - regs->regs[REG_ARG3] = (unsigned long long)(unsigned long)(signed long)&frame->uc.uc_mcontext; - regs->pc = neff_sign_extend((unsigned long)ksig->ka.sa.sa_handler); - - pr_debug("SIG deliver (#%d,%s:%d): sp=%p pc=%08Lx%08Lx link=%08Lx%08Lx\n", - sig, current->comm, current->pid, frame, - regs->pc >> 32, regs->pc & 0xffffffff, - DEREF_REG_PR >> 32, DEREF_REG_PR & 0xffffffff); - - return 0; -} - -/* - * OK, we're invoking a handler - */ -static void -handle_signal(struct ksignal *ksig, struct pt_regs *regs) -{ - sigset_t *oldset = sigmask_to_save(); - int ret; - - /* Set up the stack frame */ - if (ksig->ka.sa.sa_flags & SA_SIGINFO) - ret = setup_rt_frame(ksig, oldset, regs); - else - ret = setup_frame(ksig, oldset, regs); - - signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP)); -} - -asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) -{ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - } -} diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S deleted file mode 100644 index 
1bcb86f0b728..000000000000 --- a/arch/sh/kernel/syscalls_64.S +++ /dev/null @@ -1,419 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * arch/sh/kernel/syscalls_64.S - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2004 - 2007 Paul Mundt - * Copyright (C) 2003, 2004 Richard Curnow - */ - -#include - - .section .data, "aw" - .balign 32 - -/* - * System calls jump table - */ - .globl sys_call_table -sys_call_table: - .long sys_restart_syscall /* 0 - old "setup()" system call */ - .long sys_exit - .long sys_fork - .long sys_read - .long sys_write - .long sys_open /* 5 */ - .long sys_close - .long sys_waitpid - .long sys_creat - .long sys_link - .long sys_unlink /* 10 */ - .long sys_execve - .long sys_chdir - .long sys_time - .long sys_mknod - .long sys_chmod /* 15 */ - .long sys_lchown16 - .long sys_ni_syscall /* old break syscall holder */ - .long sys_stat - .long sys_lseek - .long sys_getpid /* 20 */ - .long sys_mount - .long sys_oldumount - .long sys_setuid16 - .long sys_getuid16 - .long sys_stime /* 25 */ - .long sh64_ptrace - .long sys_alarm - .long sys_fstat - .long sys_pause - .long sys_utime /* 30 */ - .long sys_ni_syscall /* old stty syscall holder */ - .long sys_ni_syscall /* old gtty syscall holder */ - .long sys_access - .long sys_nice - .long sys_ni_syscall /* 35 */ /* old ftime syscall holder */ - .long sys_sync - .long sys_kill - .long sys_rename - .long sys_mkdir - .long sys_rmdir /* 40 */ - .long sys_dup - .long sys_pipe - .long sys_times - .long sys_ni_syscall /* old prof syscall holder */ - .long sys_brk /* 45 */ - .long sys_setgid16 - .long sys_getgid16 - .long sys_signal - .long sys_geteuid16 - .long sys_getegid16 /* 50 */ - .long sys_acct - .long sys_umount /* recycled never used phys( */ - .long sys_ni_syscall /* old lock syscall holder */ - .long sys_ioctl - .long sys_fcntl /* 55 */ - .long sys_ni_syscall /* old mpx syscall holder */ - .long sys_setpgid - .long sys_ni_syscall /* old ulimit syscall holder */ - .long 
sys_ni_syscall /* sys_olduname */ - .long sys_umask /* 60 */ - .long sys_chroot - .long sys_ustat - .long sys_dup2 - .long sys_getppid - .long sys_getpgrp /* 65 */ - .long sys_setsid - .long sys_sigaction - .long sys_sgetmask - .long sys_ssetmask - .long sys_setreuid16 /* 70 */ - .long sys_setregid16 - .long sys_sigsuspend - .long sys_sigpending - .long sys_sethostname - .long sys_setrlimit /* 75 */ - .long sys_old_getrlimit - .long sys_getrusage - .long sys_gettimeofday - .long sys_settimeofday - .long sys_getgroups16 /* 80 */ - .long sys_setgroups16 - .long sys_ni_syscall /* sys_oldselect */ - .long sys_symlink - .long sys_lstat - .long sys_readlink /* 85 */ - .long sys_uselib - .long sys_swapon - .long sys_reboot - .long sys_old_readdir - .long old_mmap /* 90 */ - .long sys_munmap - .long sys_truncate - .long sys_ftruncate - .long sys_fchmod - .long sys_fchown16 /* 95 */ - .long sys_getpriority - .long sys_setpriority - .long sys_ni_syscall /* old profil syscall holder */ - .long sys_statfs - .long sys_fstatfs /* 100 */ - .long sys_ni_syscall /* ioperm */ - .long sys_socketcall /* Obsolete implementation of socket syscall */ - .long sys_syslog - .long sys_setitimer - .long sys_getitimer /* 105 */ - .long sys_newstat - .long sys_newlstat - .long sys_newfstat - .long sys_uname - .long sys_ni_syscall /* 110 */ /* iopl */ - .long sys_vhangup - .long sys_ni_syscall /* idle */ - .long sys_ni_syscall /* vm86old */ - .long sys_wait4 - .long sys_swapoff /* 115 */ - .long sys_sysinfo - .long sys_ipc /* Obsolete ipc syscall implementation */ - .long sys_fsync - .long sys_sigreturn - .long sys_clone /* 120 */ - .long sys_setdomainname - .long sys_newuname - .long sys_cacheflush /* x86: sys_modify_ldt */ - .long sys_adjtimex - .long sys_mprotect /* 125 */ - .long sys_sigprocmask - .long sys_ni_syscall /* old "create_module" */ - .long sys_init_module - .long sys_delete_module - .long sys_ni_syscall /* 130: old "get_kernel_syms" */ - .long sys_quotactl - .long sys_getpgid - 
.long sys_fchdir - .long sys_bdflush - .long sys_sysfs /* 135 */ - .long sys_personality - .long sys_ni_syscall /* for afs_syscall */ - .long sys_setfsuid16 - .long sys_setfsgid16 - .long sys_llseek /* 140 */ - .long sys_getdents - .long sys_select - .long sys_flock - .long sys_msync - .long sys_readv /* 145 */ - .long sys_writev - .long sys_getsid - .long sys_fdatasync - .long sys_sysctl - .long sys_mlock /* 150 */ - .long sys_munlock - .long sys_mlockall - .long sys_munlockall - .long sys_sched_setparam - .long sys_sched_getparam /* 155 */ - .long sys_sched_setscheduler - .long sys_sched_getscheduler - .long sys_sched_yield - .long sys_sched_get_priority_max - .long sys_sched_get_priority_min /* 160 */ - .long sys_sched_rr_get_interval - .long sys_nanosleep - .long sys_mremap - .long sys_setresuid16 - .long sys_getresuid16 /* 165 */ - .long sys_ni_syscall /* vm86 */ - .long sys_ni_syscall /* old "query_module" */ - .long sys_poll - .long sys_ni_syscall /* was nfsservctl */ - .long sys_setresgid16 /* 170 */ - .long sys_getresgid16 - .long sys_prctl - .long sys_rt_sigreturn - .long sys_rt_sigaction - .long sys_rt_sigprocmask /* 175 */ - .long sys_rt_sigpending - .long sys_rt_sigtimedwait - .long sys_rt_sigqueueinfo - .long sys_rt_sigsuspend - .long sys_pread64 /* 180 */ - .long sys_pwrite64 - .long sys_chown16 - .long sys_getcwd - .long sys_capget - .long sys_capset /* 185 */ - .long sys_sigaltstack - .long sys_sendfile - .long sys_ni_syscall /* getpmsg */ - .long sys_ni_syscall /* putpmsg */ - .long sys_vfork /* 190 */ - .long sys_getrlimit - .long sys_mmap2 - .long sys_truncate64 - .long sys_ftruncate64 - .long sys_stat64 /* 195 */ - .long sys_lstat64 - .long sys_fstat64 - .long sys_lchown - .long sys_getuid - .long sys_getgid /* 200 */ - .long sys_geteuid - .long sys_getegid - .long sys_setreuid - .long sys_setregid - .long sys_getgroups /* 205 */ - .long sys_setgroups - .long sys_fchown - .long sys_setresuid - .long sys_getresuid - .long sys_setresgid /* 210 */ 
- .long sys_getresgid - .long sys_chown - .long sys_setuid - .long sys_setgid - .long sys_setfsuid /* 215 */ - .long sys_setfsgid - .long sys_pivot_root - .long sys_mincore - .long sys_madvise - /* Broken-out socket family (maintain backwards compatibility in syscall - numbering with 2.4) */ - .long sys_socket /* 220 */ - .long sys_bind - .long sys_connect - .long sys_listen - .long sys_accept - .long sys_getsockname /* 225 */ - .long sys_getpeername - .long sys_socketpair - .long sys_send - .long sys_sendto - .long sys_recv /* 230*/ - .long sys_recvfrom - .long sys_shutdown - .long sys_setsockopt - .long sys_getsockopt - .long sys_sendmsg /* 235 */ - .long sys_recvmsg - /* Broken-out IPC family (maintain backwards compatibility in syscall - numbering with 2.4) */ - .long sys_semop - .long sys_semget - .long sys_semctl - .long sys_msgsnd /* 240 */ - .long sys_msgrcv - .long sys_msgget - .long sys_msgctl - .long sys_shmat - .long sys_shmdt /* 245 */ - .long sys_shmget - .long sys_shmctl - /* Rest of syscalls listed in 2.4 i386 unistd.h */ - .long sys_getdents64 - .long sys_fcntl64 - .long sys_ni_syscall /* 250 reserved for TUX */ - .long sys_ni_syscall /* Reserved for Security */ - .long sys_gettid - .long sys_readahead - .long sys_setxattr - .long sys_lsetxattr /* 255 */ - .long sys_fsetxattr - .long sys_getxattr - .long sys_lgetxattr - .long sys_fgetxattr - .long sys_listxattr /* 260 */ - .long sys_llistxattr - .long sys_flistxattr - .long sys_removexattr - .long sys_lremovexattr - .long sys_fremovexattr /* 265 */ - .long sys_tkill - .long sys_sendfile64 - .long sys_futex - .long sys_sched_setaffinity - .long sys_sched_getaffinity /* 270 */ - .long sys_ni_syscall /* reserved for set_thread_area */ - .long sys_ni_syscall /* reserved for get_thread_area */ - .long sys_io_setup - .long sys_io_destroy - .long sys_io_getevents /* 275 */ - .long sys_io_submit - .long sys_io_cancel - .long sys_fadvise64 - .long sys_ni_syscall - .long sys_exit_group /* 280 */ - /* Rest of 
new 2.6 syscalls */ - .long sys_lookup_dcookie - .long sys_epoll_create - .long sys_epoll_ctl - .long sys_epoll_wait - .long sys_remap_file_pages /* 285 */ - .long sys_set_tid_address - .long sys_timer_create - .long sys_timer_settime - .long sys_timer_gettime - .long sys_timer_getoverrun /* 290 */ - .long sys_timer_delete - .long sys_clock_settime - .long sys_clock_gettime - .long sys_clock_getres - .long sys_clock_nanosleep /* 295 */ - .long sys_statfs64 - .long sys_fstatfs64 - .long sys_tgkill - .long sys_utimes - .long sys_fadvise64_64 /* 300 */ - .long sys_ni_syscall /* Reserved for vserver */ - .long sys_ni_syscall /* Reserved for mbind */ - .long sys_ni_syscall /* get_mempolicy */ - .long sys_ni_syscall /* set_mempolicy */ - .long sys_mq_open /* 305 */ - .long sys_mq_unlink - .long sys_mq_timedsend - .long sys_mq_timedreceive - .long sys_mq_notify - .long sys_mq_getsetattr /* 310 */ - .long sys_ni_syscall /* Reserved for kexec */ - .long sys_waitid - .long sys_add_key - .long sys_request_key - .long sys_keyctl /* 315 */ - .long sys_ioprio_set - .long sys_ioprio_get - .long sys_inotify_init - .long sys_inotify_add_watch - .long sys_inotify_rm_watch /* 320 */ - .long sys_ni_syscall - .long sys_migrate_pages - .long sys_openat - .long sys_mkdirat - .long sys_mknodat /* 325 */ - .long sys_fchownat - .long sys_futimesat - .long sys_fstatat64 - .long sys_unlinkat - .long sys_renameat /* 330 */ - .long sys_linkat - .long sys_symlinkat - .long sys_readlinkat - .long sys_fchmodat - .long sys_faccessat /* 335 */ - .long sys_pselect6 - .long sys_ppoll - .long sys_unshare - .long sys_set_robust_list - .long sys_get_robust_list /* 340 */ - .long sys_splice - .long sys_sync_file_range - .long sys_tee - .long sys_vmsplice - .long sys_move_pages /* 345 */ - .long sys_getcpu - .long sys_epoll_pwait - .long sys_utimensat - .long sys_signalfd - .long sys_timerfd_create /* 350 */ - .long sys_eventfd - .long sys_fallocate - .long sys_timerfd_settime - .long sys_timerfd_gettime - 
.long sys_signalfd4 /* 355 */ - .long sys_eventfd2 - .long sys_epoll_create1 - .long sys_dup3 - .long sys_pipe2 - .long sys_inotify_init1 /* 360 */ - .long sys_preadv - .long sys_pwritev - .long sys_rt_tgsigqueueinfo - .long sys_perf_event_open - .long sys_recvmmsg /* 365 */ - .long sys_accept4 - .long sys_fanotify_init - .long sys_fanotify_mark - .long sys_prlimit64 - .long sys_name_to_handle_at /* 370 */ - .long sys_open_by_handle_at - .long sys_clock_adjtime - .long sys_syncfs - .long sys_sendmmsg - .long sys_setns /* 375 */ - .long sys_process_vm_readv - .long sys_process_vm_writev - .long sys_kcmp - .long sys_finit_module - .long sys_sched_getattr /* 380 */ - .long sys_sched_setattr - .long sys_renameat2 - .long sys_seccomp - .long sys_getrandom - .long sys_memfd_create /* 385 */ - .long sys_bpf - .long sys_execveat - .long sys_userfaultfd - .long sys_membarrier - .long sys_mlock2 /* 390 */ - .long sys_copy_file_range - .long sys_preadv2 - .long sys_pwritev2 diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c deleted file mode 100644 index 37046f3a26d3..000000000000 --- a/arch/sh/kernel/traps_64.c +++ /dev/null @@ -1,814 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/sh/kernel/traps_64.c - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003, 2004 Paul Mundt - * Copyright (C) 2003, 2004 Richard Curnow - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int read_opcode(reg_size_t pc, insn_size_t *result_opcode, int from_user_mode) -{ - int get_user_error; - unsigned long aligned_pc; - insn_size_t opcode; - - if ((pc & 3) == 1) { - /* SHmedia */ - aligned_pc = pc & ~3; - if (from_user_mode) { - if (!access_ok(aligned_pc, sizeof(insn_size_t))) { - get_user_error = -EFAULT; - } else { - get_user_error = __get_user(opcode, 
(insn_size_t *)aligned_pc); - *result_opcode = opcode; - } - return get_user_error; - } else { - /* If the fault was in the kernel, we can either read - * this directly, or if not, we fault. - */ - *result_opcode = *(insn_size_t *)aligned_pc; - return 0; - } - } else if ((pc & 1) == 0) { - /* SHcompact */ - /* TODO : provide handling for this. We don't really support - user-mode SHcompact yet, and for a kernel fault, this would - have to come from a module built for SHcompact. */ - return -EFAULT; - } else { - /* misaligned */ - return -EFAULT; - } -} - -static int address_is_sign_extended(__u64 a) -{ - __u64 b; -#if (NEFF == 32) - b = (__u64)(__s64)(__s32)(a & 0xffffffffUL); - return (b == a) ? 1 : 0; -#else -#error "Sign extend check only works for NEFF==32" -#endif -} - -/* return -1 for fault, 0 for OK */ -static int generate_and_check_address(struct pt_regs *regs, - insn_size_t opcode, - int displacement_not_indexed, - int width_shift, - __u64 *address) -{ - __u64 base_address, addr; - int basereg; - - switch (1 << width_shift) { - case 1: inc_unaligned_byte_access(); break; - case 2: inc_unaligned_word_access(); break; - case 4: inc_unaligned_dword_access(); break; - case 8: inc_unaligned_multi_access(); break; - } - - basereg = (opcode >> 20) & 0x3f; - base_address = regs->regs[basereg]; - if (displacement_not_indexed) { - __s64 displacement; - displacement = (opcode >> 10) & 0x3ff; - displacement = sign_extend64(displacement, 9); - addr = (__u64)((__s64)base_address + (displacement << width_shift)); - } else { - __u64 offset; - int offsetreg; - offsetreg = (opcode >> 10) & 0x3f; - offset = regs->regs[offsetreg]; - addr = base_address + offset; - } - - /* Check sign extended */ - if (!address_is_sign_extended(addr)) - return -1; - - /* Check accessible. For misaligned access in the kernel, assume the - address is always accessible (and if not, just fault when the - load/store gets done.) 
*/ - if (user_mode(regs)) { - inc_unaligned_user_access(); - - if (addr >= TASK_SIZE) - return -1; - } else - inc_unaligned_kernel_access(); - - *address = addr; - - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, addr); - unaligned_fixups_notify(current, opcode, regs); - - return 0; -} - -static void misaligned_kernel_word_load(__u64 address, int do_sign_extend, __u64 *result) -{ - unsigned short x; - unsigned char *p, *q; - p = (unsigned char *) (int) address; - q = (unsigned char *) &x; - q[0] = p[0]; - q[1] = p[1]; - - if (do_sign_extend) { - *result = (__u64)(__s64) *(short *) &x; - } else { - *result = (__u64) x; - } -} - -static void misaligned_kernel_word_store(__u64 address, __u64 value) -{ - unsigned short x; - unsigned char *p, *q; - p = (unsigned char *) (int) address; - q = (unsigned char *) &x; - - x = (__u16) value; - p[0] = q[0]; - p[1] = q[1]; -} - -static int misaligned_load(struct pt_regs *regs, - insn_size_t opcode, - int displacement_not_indexed, - int width_shift, - int do_sign_extend) -{ - /* Return -1 for a fault, 0 for OK */ - int error; - int destreg; - __u64 address; - - error = generate_and_check_address(regs, opcode, - displacement_not_indexed, width_shift, &address); - if (error < 0) - return error; - - destreg = (opcode >> 4) & 0x3f; - if (user_mode(regs)) { - __u64 buffer; - - if (!access_ok((unsigned long) address, 1UL< 0) { - return -1; /* fault */ - } - switch (width_shift) { - case 1: - if (do_sign_extend) { - regs->regs[destreg] = (__u64)(__s64) *(__s16 *) &buffer; - } else { - regs->regs[destreg] = (__u64) *(__u16 *) &buffer; - } - break; - case 2: - regs->regs[destreg] = (__u64)(__s64) *(__s32 *) &buffer; - break; - case 3: - regs->regs[destreg] = buffer; - break; - default: - printk("Unexpected width_shift %d in misaligned_load, PC=%08lx\n", - width_shift, (unsigned long) regs->pc); - break; - } - } else { - /* kernel mode - we can take short cuts since if we fault, it's a genuine bug */ - __u64 lo, hi; - - switch 
(width_shift) { - case 1: - misaligned_kernel_word_load(address, do_sign_extend, ®s->regs[destreg]); - break; - case 2: - asm ("ldlo.l %1, 0, %0" : "=r" (lo) : "r" (address)); - asm ("ldhi.l %1, 3, %0" : "=r" (hi) : "r" (address)); - regs->regs[destreg] = lo | hi; - break; - case 3: - asm ("ldlo.q %1, 0, %0" : "=r" (lo) : "r" (address)); - asm ("ldhi.q %1, 7, %0" : "=r" (hi) : "r" (address)); - regs->regs[destreg] = lo | hi; - break; - - default: - printk("Unexpected width_shift %d in misaligned_load, PC=%08lx\n", - width_shift, (unsigned long) regs->pc); - break; - } - } - - return 0; -} - -static int misaligned_store(struct pt_regs *regs, - insn_size_t opcode, - int displacement_not_indexed, - int width_shift) -{ - /* Return -1 for a fault, 0 for OK */ - int error; - int srcreg; - __u64 address; - - error = generate_and_check_address(regs, opcode, - displacement_not_indexed, width_shift, &address); - if (error < 0) - return error; - - srcreg = (opcode >> 4) & 0x3f; - if (user_mode(regs)) { - __u64 buffer; - - if (!access_ok((unsigned long) address, 1UL<regs[srcreg]; - break; - case 2: - *(__u32 *) &buffer = (__u32) regs->regs[srcreg]; - break; - case 3: - buffer = regs->regs[srcreg]; - break; - default: - printk("Unexpected width_shift %d in misaligned_store, PC=%08lx\n", - width_shift, (unsigned long) regs->pc); - break; - } - - if (__copy_user((void *)(int)address, &buffer, (1 << width_shift)) > 0) { - return -1; /* fault */ - } - } else { - /* kernel mode - we can take short cuts since if we fault, it's a genuine bug */ - __u64 val = regs->regs[srcreg]; - - switch (width_shift) { - case 1: - misaligned_kernel_word_store(address, val); - break; - case 2: - asm ("stlo.l %1, 0, %0" : : "r" (val), "r" (address)); - asm ("sthi.l %1, 3, %0" : : "r" (val), "r" (address)); - break; - case 3: - asm ("stlo.q %1, 0, %0" : : "r" (val), "r" (address)); - asm ("sthi.q %1, 7, %0" : : "r" (val), "r" (address)); - break; - - default: - printk("Unexpected width_shift %d in 
misaligned_store, PC=%08lx\n", - width_shift, (unsigned long) regs->pc); - break; - } - } - - return 0; -} - -/* Never need to fix up misaligned FPU accesses within the kernel since that's a real - error. */ -static int misaligned_fpu_load(struct pt_regs *regs, - insn_size_t opcode, - int displacement_not_indexed, - int width_shift, - int do_paired_load) -{ - /* Return -1 for a fault, 0 for OK */ - int error; - int destreg; - __u64 address; - - error = generate_and_check_address(regs, opcode, - displacement_not_indexed, width_shift, &address); - if (error < 0) - return error; - - destreg = (opcode >> 4) & 0x3f; - if (user_mode(regs)) { - __u64 buffer; - __u32 buflo, bufhi; - - if (!access_ok((unsigned long) address, 1UL< 0) { - return -1; /* fault */ - } - /* 'current' may be the current owner of the FPU state, so - context switch the registers into memory so they can be - indexed by register number. */ - if (last_task_used_math == current) { - enable_fpu(); - save_fpu(current); - disable_fpu(); - last_task_used_math = NULL; - regs->sr |= SR_FD; - } - - buflo = *(__u32*) &buffer; - bufhi = *(1 + (__u32*) &buffer); - - switch (width_shift) { - case 2: - current->thread.xstate->hardfpu.fp_regs[destreg] = buflo; - break; - case 3: - if (do_paired_load) { - current->thread.xstate->hardfpu.fp_regs[destreg] = buflo; - current->thread.xstate->hardfpu.fp_regs[destreg+1] = bufhi; - } else { -#if defined(CONFIG_CPU_LITTLE_ENDIAN) - current->thread.xstate->hardfpu.fp_regs[destreg] = bufhi; - current->thread.xstate->hardfpu.fp_regs[destreg+1] = buflo; -#else - current->thread.xstate->hardfpu.fp_regs[destreg] = buflo; - current->thread.xstate->hardfpu.fp_regs[destreg+1] = bufhi; -#endif - } - break; - default: - printk("Unexpected width_shift %d in misaligned_fpu_load, PC=%08lx\n", - width_shift, (unsigned long) regs->pc); - break; - } - return 0; - } else { - die ("Misaligned FPU load inside kernel", regs, 0); - return -1; - } -} - -static int misaligned_fpu_store(struct 
pt_regs *regs, - insn_size_t opcode, - int displacement_not_indexed, - int width_shift, - int do_paired_load) -{ - /* Return -1 for a fault, 0 for OK */ - int error; - int srcreg; - __u64 address; - - error = generate_and_check_address(regs, opcode, - displacement_not_indexed, width_shift, &address); - if (error < 0) - return error; - - srcreg = (opcode >> 4) & 0x3f; - if (user_mode(regs)) { - __u64 buffer; - /* Initialise these to NaNs. */ - __u32 buflo=0xffffffffUL, bufhi=0xffffffffUL; - - if (!access_ok((unsigned long) address, 1UL<sr |= SR_FD; - } - - switch (width_shift) { - case 2: - buflo = current->thread.xstate->hardfpu.fp_regs[srcreg]; - break; - case 3: - if (do_paired_load) { - buflo = current->thread.xstate->hardfpu.fp_regs[srcreg]; - bufhi = current->thread.xstate->hardfpu.fp_regs[srcreg+1]; - } else { -#if defined(CONFIG_CPU_LITTLE_ENDIAN) - bufhi = current->thread.xstate->hardfpu.fp_regs[srcreg]; - buflo = current->thread.xstate->hardfpu.fp_regs[srcreg+1]; -#else - buflo = current->thread.xstate->hardfpu.fp_regs[srcreg]; - bufhi = current->thread.xstate->hardfpu.fp_regs[srcreg+1]; -#endif - } - break; - default: - printk("Unexpected width_shift %d in misaligned_fpu_store, PC=%08lx\n", - width_shift, (unsigned long) regs->pc); - break; - } - - *(__u32*) &buffer = buflo; - *(1 + (__u32*) &buffer) = bufhi; - if (__copy_user((void *)(int)address, &buffer, (1 << width_shift)) > 0) { - return -1; /* fault */ - } - return 0; - } else { - die ("Misaligned FPU load inside kernel", regs, 0); - return -1; - } -} - -static int misaligned_fixup(struct pt_regs *regs) -{ - insn_size_t opcode; - int error; - int major, minor; - unsigned int user_action; - - user_action = unaligned_user_action(); - if (!(user_action & UM_FIXUP)) - return -1; - - error = read_opcode(regs->pc, &opcode, user_mode(regs)); - if (error < 0) { - return error; - } - major = (opcode >> 26) & 0x3f; - minor = (opcode >> 16) & 0xf; - - switch (major) { - case (0x84>>2): /* LD.W */ - error = 
misaligned_load(regs, opcode, 1, 1, 1); - break; - case (0xb0>>2): /* LD.UW */ - error = misaligned_load(regs, opcode, 1, 1, 0); - break; - case (0x88>>2): /* LD.L */ - error = misaligned_load(regs, opcode, 1, 2, 1); - break; - case (0x8c>>2): /* LD.Q */ - error = misaligned_load(regs, opcode, 1, 3, 0); - break; - - case (0xa4>>2): /* ST.W */ - error = misaligned_store(regs, opcode, 1, 1); - break; - case (0xa8>>2): /* ST.L */ - error = misaligned_store(regs, opcode, 1, 2); - break; - case (0xac>>2): /* ST.Q */ - error = misaligned_store(regs, opcode, 1, 3); - break; - - case (0x40>>2): /* indexed loads */ - switch (minor) { - case 0x1: /* LDX.W */ - error = misaligned_load(regs, opcode, 0, 1, 1); - break; - case 0x5: /* LDX.UW */ - error = misaligned_load(regs, opcode, 0, 1, 0); - break; - case 0x2: /* LDX.L */ - error = misaligned_load(regs, opcode, 0, 2, 1); - break; - case 0x3: /* LDX.Q */ - error = misaligned_load(regs, opcode, 0, 3, 0); - break; - default: - error = -1; - break; - } - break; - - case (0x60>>2): /* indexed stores */ - switch (minor) { - case 0x1: /* STX.W */ - error = misaligned_store(regs, opcode, 0, 1); - break; - case 0x2: /* STX.L */ - error = misaligned_store(regs, opcode, 0, 2); - break; - case 0x3: /* STX.Q */ - error = misaligned_store(regs, opcode, 0, 3); - break; - default: - error = -1; - break; - } - break; - - case (0x94>>2): /* FLD.S */ - error = misaligned_fpu_load(regs, opcode, 1, 2, 0); - break; - case (0x98>>2): /* FLD.P */ - error = misaligned_fpu_load(regs, opcode, 1, 3, 1); - break; - case (0x9c>>2): /* FLD.D */ - error = misaligned_fpu_load(regs, opcode, 1, 3, 0); - break; - case (0x1c>>2): /* floating indexed loads */ - switch (minor) { - case 0x8: /* FLDX.S */ - error = misaligned_fpu_load(regs, opcode, 0, 2, 0); - break; - case 0xd: /* FLDX.P */ - error = misaligned_fpu_load(regs, opcode, 0, 3, 1); - break; - case 0x9: /* FLDX.D */ - error = misaligned_fpu_load(regs, opcode, 0, 3, 0); - break; - default: - error = -1; 
- break; - } - break; - case (0xb4>>2): /* FLD.S */ - error = misaligned_fpu_store(regs, opcode, 1, 2, 0); - break; - case (0xb8>>2): /* FLD.P */ - error = misaligned_fpu_store(regs, opcode, 1, 3, 1); - break; - case (0xbc>>2): /* FLD.D */ - error = misaligned_fpu_store(regs, opcode, 1, 3, 0); - break; - case (0x3c>>2): /* floating indexed stores */ - switch (minor) { - case 0x8: /* FSTX.S */ - error = misaligned_fpu_store(regs, opcode, 0, 2, 0); - break; - case 0xd: /* FSTX.P */ - error = misaligned_fpu_store(regs, opcode, 0, 3, 1); - break; - case 0x9: /* FSTX.D */ - error = misaligned_fpu_store(regs, opcode, 0, 3, 0); - break; - default: - error = -1; - break; - } - break; - - default: - /* Fault */ - error = -1; - break; - } - - if (error < 0) { - return error; - } else { - regs->pc += 4; /* Skip the instruction that's just been emulated */ - return 0; - } -} - -static void do_unhandled_exception(int signr, char *str, unsigned long error, - struct pt_regs *regs) -{ - if (user_mode(regs)) - force_sig(signr); - - die_if_no_fixup(str, regs, error); -} - -#define DO_ERROR(signr, str, name) \ -asmlinkage void do_##name(unsigned long error_code, struct pt_regs *regs) \ -{ \ - do_unhandled_exception(signr, str, error_code, regs); \ -} - -DO_ERROR(SIGILL, "illegal slot instruction", illegal_slot_inst) -DO_ERROR(SIGSEGV, "address error (exec)", address_error_exec) - -#if defined(CONFIG_SH64_ID2815_WORKAROUND) - -#define OPCODE_INVALID 0 -#define OPCODE_USER_VALID 1 -#define OPCODE_PRIV_VALID 2 - -/* getcon/putcon - requires checking which control register is referenced. */ -#define OPCODE_CTRL_REG 3 - -/* Table of valid opcodes for SHmedia mode. - Form a 10-bit value by concatenating the major/minor opcodes i.e. - opcode[31:26,20:16]. The 6 MSBs of this value index into the following - array. The 4 LSBs select the bit-pair in the entry (bits 1:0 correspond to - LSBs==4'b0000 etc). 
*/ -static unsigned long shmedia_opcode_table[64] = { - 0x55554044,0x54445055,0x15141514,0x14541414,0x00000000,0x10001000,0x01110055,0x04050015, - 0x00000444,0xc0000000,0x44545515,0x40405555,0x55550015,0x10005555,0x55555505,0x04050000, - 0x00000555,0x00000404,0x00040445,0x15151414,0x00000000,0x00000000,0x00000000,0x00000000, - 0x00000055,0x40404444,0x00000404,0xc0009495,0x00000000,0x00000000,0x00000000,0x00000000, - 0x55555555,0x55555555,0x55555555,0x55555555,0x55555555,0x55555555,0x55555555,0x55555555, - 0x55555555,0x55555555,0x55555555,0x55555555,0x55555555,0x55555555,0x55555555,0x55555555, - 0x80005050,0x04005055,0x55555555,0x55555555,0x55555555,0x55555555,0x55555555,0x55555555, - 0x81055554,0x00000404,0x55555555,0x55555555,0x00000000,0x00000000,0x00000000,0x00000000 -}; - -/* Workaround SH5-101 cut2 silicon defect #2815 : - in some situations, inter-mode branches from SHcompact -> SHmedia - which should take ITLBMISS or EXECPROT exceptions at the target - falsely take RESINST at the target instead. */ -void do_reserved_inst(unsigned long error_code, struct pt_regs *regs) -{ - insn_size_t opcode = 0x6ff4fff0; /* guaranteed reserved opcode */ - unsigned long pc, aligned_pc; - unsigned long index, shift; - unsigned long major, minor, combined; - unsigned long reserved_field; - int opcode_state; - int get_user_error; - int signr = SIGILL; - char *exception_name = "reserved_instruction"; - - pc = regs->pc; - - /* SHcompact is not handled */ - if (unlikely((pc & 3) == 0)) - goto out; - - /* SHmedia : check for defect. This requires executable vmas - to be readable too. */ - aligned_pc = pc & ~3; - if (!access_ok(aligned_pc, sizeof(insn_size_t))) - get_user_error = -EFAULT; - else - get_user_error = __get_user(opcode, (insn_size_t *)aligned_pc); - - if (get_user_error < 0) { - /* - * Error trying to read opcode. This typically means a - * real fault, not a RESINST any more. So change the - * codes. 
- */ - exception_name = "address error (exec)"; - signr = SIGSEGV; - goto out; - } - - /* These bits are currently reserved as zero in all valid opcodes */ - reserved_field = opcode & 0xf; - if (unlikely(reserved_field)) - goto out; /* invalid opcode */ - - major = (opcode >> 26) & 0x3f; - minor = (opcode >> 16) & 0xf; - combined = (major << 4) | minor; - index = major; - shift = minor << 1; - opcode_state = (shmedia_opcode_table[index] >> shift) & 0x3; - switch (opcode_state) { - case OPCODE_INVALID: - /* Trap. */ - break; - case OPCODE_USER_VALID: - /* - * Restart the instruction: the branch to the instruction - * will now be from an RTE not from SHcompact so the - * silicon defect won't be triggered. - */ - return; - case OPCODE_PRIV_VALID: - if (!user_mode(regs)) { - /* - * Should only ever get here if a module has - * SHcompact code inside it. If so, the same fix - * up is needed. - */ - return; /* same reason */ - } - - /* - * Otherwise, user mode trying to execute a privileged - * instruction - fall through to trap. - */ - break; - case OPCODE_CTRL_REG: - /* If in privileged mode, return as above. */ - if (!user_mode(regs)) - return; - - /* In user mode ... */ - if (combined == 0x9f) { /* GETCON */ - unsigned long regno = (opcode >> 20) & 0x3f; - - if (regno >= 62) - return; - - /* reserved/privileged control register => trap */ - } else if (combined == 0x1bf) { /* PUTCON */ - unsigned long regno = (opcode >> 4) & 0x3f; - - if (regno >= 62) - return; - - /* reserved/privileged control register => trap */ - } - - break; - default: - /* Fall through to trap. 
*/ - break; - } - -out: - do_unhandled_exception(signr, exception_name, error_code, regs); -} - -#else /* CONFIG_SH64_ID2815_WORKAROUND */ - -/* If the workaround isn't needed, this is just a straightforward reserved - instruction */ -DO_ERROR(SIGILL, "reserved instruction", reserved_inst) - -#endif /* CONFIG_SH64_ID2815_WORKAROUND */ - -/* Called with interrupts disabled */ -asmlinkage void do_exception_error(unsigned long ex, struct pt_regs *regs) -{ - die_if_kernel("exception", regs, ex); -} - -asmlinkage int do_unknown_trapa(unsigned long scId, struct pt_regs *regs) -{ - /* Syscall debug */ - printk("System call ID error: [0x1#args:8 #syscall:16 0x%lx]\n", scId); - - die_if_kernel("unknown trapa", regs, scId); - - return -ENOSYS; -} - -/* Implement misaligned load/store handling for kernel (and optionally for user - mode too). Limitation : only SHmedia mode code is handled - there is no - handling at all for misaligned accesses occurring in SHcompact code yet. */ - -asmlinkage void do_address_error_load(unsigned long error_code, struct pt_regs *regs) -{ - if (misaligned_fixup(regs) < 0) - do_unhandled_exception(SIGSEGV, "address error(load)", - error_code, regs); -} - -asmlinkage void do_address_error_store(unsigned long error_code, struct pt_regs *regs) -{ - if (misaligned_fixup(regs) < 0) - do_unhandled_exception(SIGSEGV, "address error(store)", - error_code, regs); -} - -asmlinkage void do_debug_interrupt(unsigned long code, struct pt_regs *regs) -{ - u64 peek_real_address_q(u64 addr); - u64 poke_real_address_q(u64 addr, u64 val); - unsigned long long DM_EXP_CAUSE_PHY = 0x0c100010; - unsigned long long exp_cause; - /* It's not worth ioremapping the debug module registers for the amount - of access we make to them - just go direct to their physical - addresses. 
*/ - exp_cause = peek_real_address_q(DM_EXP_CAUSE_PHY); - if (exp_cause & ~4) - printk("DM.EXP_CAUSE had unexpected bits set (=%08lx)\n", - (unsigned long)(exp_cause & 0xffffffff)); - show_state(); - /* Clear all DEBUGINT causes */ - poke_real_address_q(DM_EXP_CAUSE_PHY, 0x0); -} - -void per_cpu_trap_init(void) -{ - /* Nothing to do for now, VBR initialization later. */ -} diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index c60b19958c35..bde7a6c01aaf 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -3,14 +3,7 @@ * ld script to make SuperH Linux kernel * Written by Niibe Yutaka and Paul Mundt */ -#ifdef CONFIG_SUPERH64 -#define LOAD_OFFSET PAGE_OFFSET -OUTPUT_ARCH(sh:sh5) -#else -#define LOAD_OFFSET 0 OUTPUT_ARCH(sh) -#endif - #include #include #include @@ -28,14 +21,13 @@ SECTIONS _text = .; /* Text and read-only data */ - .empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) { + .empty_zero_page : AT(ADDR(.empty_zero_page)) { *(.empty_zero_page) } = 0 - .text : AT(ADDR(.text) - LOAD_OFFSET) { + .text : AT(ADDR(.text)) { HEAD_TEXT TEXT_TEXT - EXTRA_TEXT SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT @@ -62,7 +54,7 @@ SECTIONS INIT_DATA_SECTION(16) . = ALIGN(4); - .machvec.init : AT(ADDR(.machvec.init) - LOAD_OFFSET) { + .machvec.init : AT(ADDR(.machvec.init)) { __machvec_start = .; *(.machvec.init) __machvec_end = .; @@ -74,8 +66,8 @@ SECTIONS * .exit.text is discarded at runtime, not link time, to deal with * references from __bug_table */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { EXIT_DATA } + .exit.text : AT(ADDR(.exit.text)) { EXIT_TEXT } + .exit.data : AT(ADDR(.exit.data)) { EXIT_DATA } . 
= ALIGN(PAGE_SIZE); __init_end = .; diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile deleted file mode 100644 index 69779ff741df..000000000000 --- a/arch/sh/lib64/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -# -# Makefile for the SH-5 specific library files.. -# -# Copyright (C) 2000, 2001 Paolo Alberelli -# Copyright (C) 2003 - 2008 Paul Mundt -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# - -# Panic should really be compiled as PIC -lib-y := udelay.o panic.o memcpy.o memset.o \ - copy_user_memcpy.o copy_page.o strcpy.o strlen.o - -# Extracted from libgcc -lib-y += udivsi3.o udivdi3.o sdivsi3.o diff --git a/arch/sh/lib64/copy_page.S b/arch/sh/lib64/copy_page.S deleted file mode 100644 index 0ec6fca63b56..000000000000 --- a/arch/sh/lib64/copy_page.S +++ /dev/null @@ -1,89 +0,0 @@ -/* - Copyright 2003 Richard Curnow, SuperH (UK) Ltd. - - This file is subject to the terms and conditions of the GNU General Public - License. See the file "COPYING" in the main directory of this archive - for more details. - - Tight version of mempy for the case of just copying a page. - Prefetch strategy empirically optimised against RTL simulations - of SH5-101 cut2 eval chip with Cayman board DDR memory. - - Parameters: - r2 : destination effective address (start of page) - r3 : source effective address (start of page) - - Always copies 4096 bytes. - - Points to review. - * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead. - It seems like the prefetch needs to be at at least 4 lines ahead to get - the data into the cache in time, and the allocos contend with outstanding - prefetches for the same cache set, so it's better to have the numbers - different. - */ - - .section .text..SHmedia32,"ax" - .little - - .balign 8 - .global copy_page -copy_page: - - /* Copy 4096 bytes worth of data from r3 to r2. 
- Do prefetches 4 lines ahead. - Do alloco 2 lines ahead */ - - pta 1f, tr1 - pta 2f, tr2 - pta 3f, tr3 - ptabs r18, tr0 - -#if 0 - /* TAKum03020 */ - ld.q r3, 0x00, r63 - ld.q r3, 0x20, r63 - ld.q r3, 0x40, r63 - ld.q r3, 0x60, r63 -#endif - alloco r2, 0x00 - synco ! TAKum03020 - alloco r2, 0x20 - synco ! TAKum03020 - - movi 3968, r6 - add r2, r6, r6 - addi r6, 64, r7 - addi r7, 64, r8 - sub r3, r2, r60 - addi r60, 8, r61 - addi r61, 8, r62 - addi r62, 8, r23 - addi r60, 0x80, r22 - -/* Minimal code size. The extra branches inside the loop don't cost much - because they overlap with the time spent waiting for prefetches to - complete. */ -1: -#if 0 - /* TAKum03020 */ - bge/u r2, r6, tr2 ! skip prefetch for last 4 lines - ldx.q r2, r22, r63 ! prefetch 4 lines hence -#endif -2: - bge/u r2, r7, tr3 ! skip alloco for last 2 lines - alloco r2, 0x40 ! alloc destination line 2 lines ahead - synco ! TAKum03020 -3: - ldx.q r2, r60, r36 - ldx.q r2, r61, r37 - ldx.q r2, r62, r38 - ldx.q r2, r23, r39 - st.q r2, 0, r36 - st.q r2, 8, r37 - st.q r2, 16, r38 - st.q r2, 24, r39 - addi r2, 32, r2 - bgt/l r8, r2, tr1 - - blink tr0, r63 ! return diff --git a/arch/sh/lib64/copy_user_memcpy.S b/arch/sh/lib64/copy_user_memcpy.S deleted file mode 100644 index 515f81b00202..000000000000 --- a/arch/sh/lib64/copy_user_memcpy.S +++ /dev/null @@ -1,218 +0,0 @@ -! SPDX-License-Identifier: GPL-2.0 -! -! Fast SH memcpy -! -! by Toshiyasu Morita (tm@netcom.com) -! hacked by J"orn Rernnecke (joern.rennecke@superh.com) ("o for o-umlaut) -! SH5 code Copyright 2002 SuperH Ltd. -! -! Entry: ARG0: destination pointer -! ARG1: source pointer -! ARG2: byte count -! -! Exit: RESULT: destination pointer -! any other registers in the range r0-r7: trashed -! -! Notes: Usually one wants to do small reads and write a longword, but -! unfortunately it is difficult in some cases to concatanate bytes -! into a longword on the SH, so this does a longword read and small -! writes. -! -! 
This implementation makes two assumptions about how it is called: -! -! 1.: If the byte count is nonzero, the address of the last byte to be -! copied is unsigned greater than the address of the first byte to -! be copied. This could be easily swapped for a signed comparison, -! but the algorithm used needs some comparison. -! -! 2.: When there are two or three bytes in the last word of an 11-or-more -! bytes memory chunk to b copied, the rest of the word can be read -! without side effects. -! This could be easily changed by increasing the minimum size of -! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2, -! however, this would cost a few extra cyles on average. -! For SHmedia, the assumption is that any quadword can be read in its -! enirety if at least one byte is included in the copy. - -/* Imported into Linux kernel by Richard Curnow. This is used to implement the - __copy_user function in the general case, so it has to be a distinct - function from intra-kernel memcpy to allow for exception fix-ups in the - event that the user pointer is bad somewhere in the copy (e.g. due to - running off the end of the vma). - - Note, this algorithm will be slightly wasteful in the case where the source - and destination pointers are equally aligned, because the stlo/sthi pairs - could then be merged back into single stores. If there are a lot of cache - misses, this is probably offset by the stall lengths on the preloads. - -*/ - -/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020 - * erratum. The first two prefetches are nop-ed out to avoid upsetting the - * instruction counts used in the jump address calculation. 
- * */ - - .section .text..SHmedia32,"ax" - .little - .balign 32 - .global copy_user_memcpy - .global copy_user_memcpy_end -copy_user_memcpy: - -#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1 -#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1 -#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1 -#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1 - - nop ! ld.b r3,0,r63 ! TAKum03020 - pta/l Large,tr0 - movi 25,r0 - bgeu/u r4,r0,tr0 - nsb r4,r0 - shlli r0,5,r0 - movi (L1-L0+63*32 + 1) & 0xffff,r1 - sub r1, r0, r0 -L0: ptrel r0,tr0 - add r2,r4,r5 - ptabs r18,tr1 - add r3,r4,r6 - blink tr0,r63 - -/* Rearranged to make cut2 safe */ - .balign 8 -L4_7: /* 4..7 byte memcpy cntd. */ - stlo.l r2, 0, r0 - or r6, r7, r6 - sthi.l r5, -1, r6 - stlo.l r5, -4, r6 - blink tr1,r63 - - .balign 8 -L1: /* 0 byte memcpy */ - nop - blink tr1,r63 - nop - nop - nop - nop - -L2_3: /* 2 or 3 byte memcpy cntd. */ - st.b r5,-1,r6 - blink tr1,r63 - - /* 1 byte memcpy */ - ld.b r3,0,r0 - st.b r2,0,r0 - blink tr1,r63 - -L8_15: /* 8..15 byte memcpy cntd. */ - stlo.q r2, 0, r0 - or r6, r7, r6 - sthi.q r5, -1, r6 - stlo.q r5, -8, r6 - blink tr1,r63 - - /* 2 or 3 byte memcpy */ - ld.b r3,0,r0 - nop ! ld.b r2,0,r63 ! TAKum03020 - ld.b r3,1,r1 - st.b r2,0,r0 - pta/l L2_3,tr0 - ld.b r6,-1,r6 - st.b r2,1,r1 - blink tr0, r63 - - /* 4 .. 7 byte memcpy */ - LDUAL (r3, 0, r0, r1) - pta L4_7, tr0 - ldlo.l r6, -4, r7 - or r0, r1, r0 - sthi.l r2, 3, r0 - ldhi.l r6, -1, r6 - blink tr0, r63 - - /* 8 .. 15 byte memcpy */ - LDUAQ (r3, 0, r0, r1) - pta L8_15, tr0 - ldlo.q r6, -8, r7 - or r0, r1, r0 - sthi.q r2, 7, r0 - ldhi.q r6, -1, r6 - blink tr0, r63 - - /* 16 .. 24 byte memcpy */ - LDUAQ (r3, 0, r0, r1) - LDUAQ (r3, 8, r8, r9) - or r0, r1, r0 - sthi.q r2, 7, r0 - or r8, r9, r8 - sthi.q r2, 15, r8 - ldlo.q r6, -8, r7 - ldhi.q r6, -1, r6 - stlo.q r2, 8, r8 - stlo.q r2, 0, r0 - or r6, r7, r6 - sthi.q r5, -1, r6 - stlo.q r5, -8, r6 - blink tr1,r63 - -Large: - ! ld.b r2, 0, r63 ! 
TAKum03020 - pta/l Loop_ua, tr1 - ori r3, -8, r7 - sub r2, r7, r22 - sub r3, r2, r6 - add r2, r4, r5 - ldlo.q r3, 0, r0 - addi r5, -16, r5 - movi 64+8, r27 ! could subtract r7 from that. - stlo.q r2, 0, r0 - sthi.q r2, 7, r0 - ldx.q r22, r6, r0 - bgtu/l r27, r4, tr1 - - addi r5, -48, r27 - pta/l Loop_line, tr0 - addi r6, 64, r36 - addi r6, -24, r19 - addi r6, -16, r20 - addi r6, -8, r21 - -Loop_line: - ! ldx.q r22, r36, r63 ! TAKum03020 - alloco r22, 32 - synco - addi r22, 32, r22 - ldx.q r22, r19, r23 - sthi.q r22, -25, r0 - ldx.q r22, r20, r24 - ldx.q r22, r21, r25 - stlo.q r22, -32, r0 - ldx.q r22, r6, r0 - sthi.q r22, -17, r23 - sthi.q r22, -9, r24 - sthi.q r22, -1, r25 - stlo.q r22, -24, r23 - stlo.q r22, -16, r24 - stlo.q r22, -8, r25 - bgeu r27, r22, tr0 - -Loop_ua: - addi r22, 8, r22 - sthi.q r22, -1, r0 - stlo.q r22, -8, r0 - ldx.q r22, r6, r0 - bgtu/l r5, r22, tr1 - - add r3, r4, r7 - ldlo.q r7, -8, r1 - sthi.q r22, 7, r0 - ldhi.q r7, -1, r7 - ptabs r18,tr1 - stlo.q r22, 0, r0 - or r1, r7, r1 - sthi.q r5, 15, r1 - stlo.q r5, 8, r1 - blink tr1, r63 -copy_user_memcpy_end: - nop diff --git a/arch/sh/lib64/memcpy.S b/arch/sh/lib64/memcpy.S deleted file mode 100644 index 231ea595b39a..000000000000 --- a/arch/sh/lib64/memcpy.S +++ /dev/null @@ -1,202 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ -/* Modified by SuperH, Inc. September 2003 */ -! -! Fast SH memcpy -! -! by Toshiyasu Morita (tm@netcom.com) -! hacked by J"orn Rernnecke (joern.rennecke@superh.com) ("o for o-umlaut) -! SH5 code Copyright 2002 SuperH Ltd. -! -! Entry: ARG0: destination pointer -! ARG1: source pointer -! ARG2: byte count -! -! Exit: RESULT: destination pointer -! any other registers in the range r0-r7: trashed -! -! Notes: Usually one wants to do small reads and write a longword, but -! unfortunately it is difficult in some cases to concatanate bytes -! 
into a longword on the SH, so this does a longword read and small -! writes. -! -! This implementation makes two assumptions about how it is called: -! -! 1.: If the byte count is nonzero, the address of the last byte to be -! copied is unsigned greater than the address of the first byte to -! be copied. This could be easily swapped for a signed comparison, -! but the algorithm used needs some comparison. -! -! 2.: When there are two or three bytes in the last word of an 11-or-more -! bytes memory chunk to b copied, the rest of the word can be read -! without side effects. -! This could be easily changed by increasing the minimum size of -! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2, -! however, this would cost a few extra cyles on average. -! For SHmedia, the assumption is that any quadword can be read in its -! enirety if at least one byte is included in the copy. -! - - .section .text..SHmedia32,"ax" - .globl memcpy - .type memcpy, @function - .align 5 - -memcpy: - -#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1 -#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1 -#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1 -#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1 - - ld.b r3,0,r63 - pta/l Large,tr0 - movi 25,r0 - bgeu/u r4,r0,tr0 - nsb r4,r0 - shlli r0,5,r0 - movi (L1-L0+63*32 + 1) & 0xffff,r1 - sub r1, r0, r0 -L0: ptrel r0,tr0 - add r2,r4,r5 - ptabs r18,tr1 - add r3,r4,r6 - blink tr0,r63 - -/* Rearranged to make cut2 safe */ - .balign 8 -L4_7: /* 4..7 byte memcpy cntd. */ - stlo.l r2, 0, r0 - or r6, r7, r6 - sthi.l r5, -1, r6 - stlo.l r5, -4, r6 - blink tr1,r63 - - .balign 8 -L1: /* 0 byte memcpy */ - nop - blink tr1,r63 - nop - nop - nop - nop - -L2_3: /* 2 or 3 byte memcpy cntd. */ - st.b r5,-1,r6 - blink tr1,r63 - - /* 1 byte memcpy */ - ld.b r3,0,r0 - st.b r2,0,r0 - blink tr1,r63 - -L8_15: /* 8..15 byte memcpy cntd. 
*/ - stlo.q r2, 0, r0 - or r6, r7, r6 - sthi.q r5, -1, r6 - stlo.q r5, -8, r6 - blink tr1,r63 - - /* 2 or 3 byte memcpy */ - ld.b r3,0,r0 - ld.b r2,0,r63 - ld.b r3,1,r1 - st.b r2,0,r0 - pta/l L2_3,tr0 - ld.b r6,-1,r6 - st.b r2,1,r1 - blink tr0, r63 - - /* 4 .. 7 byte memcpy */ - LDUAL (r3, 0, r0, r1) - pta L4_7, tr0 - ldlo.l r6, -4, r7 - or r0, r1, r0 - sthi.l r2, 3, r0 - ldhi.l r6, -1, r6 - blink tr0, r63 - - /* 8 .. 15 byte memcpy */ - LDUAQ (r3, 0, r0, r1) - pta L8_15, tr0 - ldlo.q r6, -8, r7 - or r0, r1, r0 - sthi.q r2, 7, r0 - ldhi.q r6, -1, r6 - blink tr0, r63 - - /* 16 .. 24 byte memcpy */ - LDUAQ (r3, 0, r0, r1) - LDUAQ (r3, 8, r8, r9) - or r0, r1, r0 - sthi.q r2, 7, r0 - or r8, r9, r8 - sthi.q r2, 15, r8 - ldlo.q r6, -8, r7 - ldhi.q r6, -1, r6 - stlo.q r2, 8, r8 - stlo.q r2, 0, r0 - or r6, r7, r6 - sthi.q r5, -1, r6 - stlo.q r5, -8, r6 - blink tr1,r63 - -Large: - ld.b r2, 0, r63 - pta/l Loop_ua, tr1 - ori r3, -8, r7 - sub r2, r7, r22 - sub r3, r2, r6 - add r2, r4, r5 - ldlo.q r3, 0, r0 - addi r5, -16, r5 - movi 64+8, r27 // could subtract r7 from that. 
- stlo.q r2, 0, r0 - sthi.q r2, 7, r0 - ldx.q r22, r6, r0 - bgtu/l r27, r4, tr1 - - addi r5, -48, r27 - pta/l Loop_line, tr0 - addi r6, 64, r36 - addi r6, -24, r19 - addi r6, -16, r20 - addi r6, -8, r21 - -Loop_line: - ldx.q r22, r36, r63 - alloco r22, 32 - addi r22, 32, r22 - ldx.q r22, r19, r23 - sthi.q r22, -25, r0 - ldx.q r22, r20, r24 - ldx.q r22, r21, r25 - stlo.q r22, -32, r0 - ldx.q r22, r6, r0 - sthi.q r22, -17, r23 - sthi.q r22, -9, r24 - sthi.q r22, -1, r25 - stlo.q r22, -24, r23 - stlo.q r22, -16, r24 - stlo.q r22, -8, r25 - bgeu r27, r22, tr0 - -Loop_ua: - addi r22, 8, r22 - sthi.q r22, -1, r0 - stlo.q r22, -8, r0 - ldx.q r22, r6, r0 - bgtu/l r5, r22, tr1 - - add r3, r4, r7 - ldlo.q r7, -8, r1 - sthi.q r22, 7, r0 - ldhi.q r7, -1, r7 - ptabs r18,tr1 - stlo.q r22, 0, r0 - or r1, r7, r1 - sthi.q r5, 15, r1 - stlo.q r5, 8, r1 - blink tr1, r63 - - .size memcpy,.-memcpy diff --git a/arch/sh/lib64/memset.S b/arch/sh/lib64/memset.S deleted file mode 100644 index 453aa5f1d263..000000000000 --- a/arch/sh/lib64/memset.S +++ /dev/null @@ -1,92 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ -/* Modified by SuperH, Inc. September 2003 */ -! -! Fast SH memset -! -! by Toshiyasu Morita (tm@netcom.com) -! -! SH5 code by J"orn Rennecke (joern.rennecke@superh.com) -! Copyright 2002 SuperH Ltd. -! 
- -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define SHHI shlld -#define SHLO shlrd -#else -#define SHHI shlrd -#define SHLO shlld -#endif - - .section .text..SHmedia32,"ax" - .globl memset - .type memset, @function - - .align 5 - -memset: - pta/l multiquad, tr0 - andi r2, 7, r22 - ptabs r18, tr2 - mshflo.b r3,r3,r3 - add r4, r22, r23 - mperm.w r3, r63, r3 // Fill pattern now in every byte of r3 - - movi 8, r9 - bgtu/u r23, r9, tr0 // multiquad - - beqi/u r4, 0, tr2 // Return with size 0 - ensures no mem accesses - ldlo.q r2, 0, r7 - shlli r4, 2, r4 - movi -1, r8 - SHHI r8, r4, r8 - SHHI r8, r4, r8 - mcmv r7, r8, r3 - stlo.q r2, 0, r3 - blink tr2, r63 - -multiquad: - pta/l lastquad, tr0 - stlo.q r2, 0, r3 - shlri r23, 3, r24 - add r2, r4, r5 - beqi/u r24, 1, tr0 // lastquad - pta/l loop, tr1 - sub r2, r22, r25 - andi r5, -8, r20 // calculate end address and - addi r20, -7*8, r8 // loop end address; This might overflow, so we need - // to use a different test before we start the loop - bge/u r24, r9, tr1 // loop - st.q r25, 8, r3 - st.q r20, -8, r3 - shlri r24, 1, r24 - beqi/u r24, 1, tr0 // lastquad - st.q r25, 16, r3 - st.q r20, -16, r3 - beqi/u r24, 2, tr0 // lastquad - st.q r25, 24, r3 - st.q r20, -24, r3 -lastquad: - sthi.q r5, -1, r3 - blink tr2,r63 - -loop: -!!! alloco r25, 32 // QQQ comment out for short-term fix to SHUK #3895. - // QQQ commenting out is locically correct, but sub-optimal - // QQQ Sean McGoogan - 4th April 2003. 
- st.q r25, 8, r3 - st.q r25, 16, r3 - st.q r25, 24, r3 - st.q r25, 32, r3 - addi r25, 32, r25 - bgeu/l r8, r25, tr1 // loop - - st.q r20, -40, r3 - st.q r20, -32, r3 - st.q r20, -24, r3 - st.q r20, -16, r3 - st.q r20, -8, r3 - sthi.q r5, -1, r3 - blink tr2,r63 - - .size memset,.-memset diff --git a/arch/sh/lib64/panic.c b/arch/sh/lib64/panic.c deleted file mode 100644 index 38c954e04f6a..000000000000 --- a/arch/sh/lib64/panic.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Copyright (C) 2003 Richard Curnow, SuperH UK Limited - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ - -void -panic_handler(unsigned long panicPC, unsigned long panicSSR, - unsigned long panicEXPEVT) -{ - /* Never return from the panic handler */ - for (;;) ; -} diff --git a/arch/sh/lib64/sdivsi3.S b/arch/sh/lib64/sdivsi3.S deleted file mode 100644 index b422e2374430..000000000000 --- a/arch/sh/lib64/sdivsi3.S +++ /dev/null @@ -1,136 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - .global __sdivsi3 - .global __sdivsi3_1 - .global __sdivsi3_2 - .section .text..SHmedia32,"ax" - .align 2 - - /* inputs: r4,r5 */ - /* clobbered: r1,r18,r19,r20,r21,r25,tr0 */ - /* result in r0 */ -__sdivsi3: -__sdivsi3_1: - ptb __div_table,tr0 - gettr tr0,r20 - -__sdivsi3_2: - nsb r5, r1 - shlld r5, r1, r25 /* normalize; [-2 ..1, 1..2) in s2.62 */ - shari r25, 58, r21 /* extract 5(6) bit index (s2.4 with hole -1..1) */ - /* bubble */ - ldx.ub r20, r21, r19 /* u0.8 */ - shari r25, 32, r25 /* normalize to s2.30 */ - shlli r21, 1, r21 - muls.l r25, r19, r19 /* s2.38 */ - ldx.w r20, r21, r21 /* s2.14 */ - ptabs r18, tr0 - shari r19, 24, r19 /* truncate to s2.14 */ - sub r21, r19, r19 /* some 11 bit inverse in s1.14 */ - muls.l r19, r19, r21 /* u0.28 */ - sub r63, r1, r1 - addi r1, 92, r1 - muls.l r25, r21, r18 /* s2.58 */ - shlli r19, 45, r19 /* multiply by two and convert to s2.58 */ - /* 
bubble */ - sub r19, r18, r18 - shari r18, 28, r18 /* some 22 bit inverse in s1.30 */ - muls.l r18, r25, r0 /* s2.60 */ - muls.l r18, r4, r25 /* s32.30 */ - /* bubble */ - shari r0, 16, r19 /* s-16.44 */ - muls.l r19, r18, r19 /* s-16.74 */ - shari r25, 63, r0 - shari r4, 14, r18 /* s19.-14 */ - shari r19, 30, r19 /* s-16.44 */ - muls.l r19, r18, r19 /* s15.30 */ - xor r21, r0, r21 /* You could also use the constant 1 << 27. */ - add r21, r25, r21 - sub r21, r19, r21 - shard r21, r1, r21 - sub r21, r0, r0 - blink tr0, r63 - -/* This table has been generated by divtab.c . -Defects for bias -330: - Max defect: 6.081536e-07 at -1.000000e+00 - Min defect: 2.849516e-08 at 1.030651e+00 - Max 2nd step defect: 9.606539e-12 at -1.000000e+00 - Min 2nd step defect: 0.000000e+00 at 0.000000e+00 - Defect at 1: 1.238659e-07 - Defect at -2: 1.061708e-07 */ - - .balign 2 - .type __div_table,@object - .size __div_table,128 -/* negative division constants */ - .word -16638 - .word -17135 - .word -17737 - .word -18433 - .word -19103 - .word -19751 - .word -20583 - .word -21383 - .word -22343 - .word -23353 - .word -24407 - .word -25582 - .word -26863 - .word -28382 - .word -29965 - .word -31800 -/* negative division factors */ - .byte 66 - .byte 70 - .byte 75 - .byte 81 - .byte 87 - .byte 93 - .byte 101 - .byte 109 - .byte 119 - .byte 130 - .byte 142 - .byte 156 - .byte 172 - .byte 192 - .byte 214 - .byte 241 - .skip 16 - .global __div_table -__div_table: - .skip 16 -/* positive division factors */ - .byte 241 - .byte 214 - .byte 192 - .byte 172 - .byte 156 - .byte 142 - .byte 130 - .byte 119 - .byte 109 - .byte 101 - .byte 93 - .byte 87 - .byte 81 - .byte 75 - .byte 70 - .byte 66 -/* positive division constants */ - .word 31801 - .word 29966 - .word 28383 - .word 26864 - .word 25583 - .word 24408 - .word 23354 - .word 22344 - .word 21384 - .word 20584 - .word 19752 - .word 19104 - .word 18434 - .word 17738 - .word 17136 - .word 16639 diff --git a/arch/sh/lib64/strcpy.S 
b/arch/sh/lib64/strcpy.S deleted file mode 100644 index b61631e523d4..000000000000 --- a/arch/sh/lib64/strcpy.S +++ /dev/null @@ -1,98 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ -/* Modified by SuperH, Inc. September 2003 */ -! Entry: arg0: destination -! arg1: source -! Exit: result: destination -! -! SH5 code Copyright 2002 SuperH Ltd. - -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define SHHI shlld -#define SHLO shlrd -#else -#define SHHI shlrd -#define SHLO shlld -#endif - - .section .text..SHmedia32,"ax" - .globl strcpy - .type strcpy, @function - .align 5 - -strcpy: - - pta/l shortstring,tr1 - ldlo.q r3,0,r4 - ptabs r18,tr4 - shlli r3,3,r7 - addi r2, 8, r0 - mcmpeq.b r4,r63,r6 - SHHI r6,r7,r6 - bnei/u r6,0,tr1 // shortstring - pta/l no_lddst, tr2 - ori r3,-8,r23 - sub r2, r23, r0 - sub r3, r2, r21 - addi r21, 8, r20 - ldx.q r0, r21, r5 - pta/l loop, tr0 - ori r2,-8,r22 - mcmpeq.b r5, r63, r6 - bgt/u r22, r23, tr2 // no_lddst - - // r22 < r23 : Need to do a load from the destination. - // r22 == r23 : Doesn't actually need to load from destination, - // but still can be handled here. - ldlo.q r2, 0, r9 - movi -1, r8 - SHLO r8, r7, r8 - mcmv r4, r8, r9 - stlo.q r2, 0, r9 - beqi/l r6, 0, tr0 // loop - - add r5, r63, r4 - addi r0, 8, r0 - blink tr1, r63 // shortstring -no_lddst: - // r22 > r23: note that for r22 == r23 the sthi.q would clobber - // bytes before the destination region. 
- stlo.q r2, 0, r4 - SHHI r4, r7, r4 - sthi.q r0, -1, r4 - beqi/l r6, 0, tr0 // loop - - add r5, r63, r4 - addi r0, 8, r0 -shortstring: -#if __BYTE_ORDER != __LITTLE_ENDIAN - pta/l shortstring2,tr1 - byterev r4,r4 -#endif -shortstring2: - st.b r0,-8,r4 - andi r4,0xff,r5 - shlri r4,8,r4 - addi r0,1,r0 - bnei/l r5,0,tr1 - blink tr4,r63 // return - - .balign 8 -loop: - stlo.q r0, 0, r5 - ldx.q r0, r20, r4 - addi r0, 16, r0 - sthi.q r0, -9, r5 - mcmpeq.b r4, r63, r6 - bnei/u r6, 0, tr1 // shortstring - ldx.q r0, r21, r5 - stlo.q r0, -8, r4 - sthi.q r0, -1, r4 - mcmpeq.b r5, r63, r6 - beqi/l r6, 0, tr0 // loop - - add r5, r63, r4 - addi r0, 8, r0 - blink tr1, r63 // shortstring - - .size strcpy,.-strcpy diff --git a/arch/sh/lib64/strlen.S b/arch/sh/lib64/strlen.S deleted file mode 100644 index c00b972f9999..000000000000 --- a/arch/sh/lib64/strlen.S +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Simplistic strlen() implementation for SHmedia. - * - * Copyright (C) 2003 Paul Mundt - */ - - .section .text..SHmedia32,"ax" - .globl strlen - .type strlen,@function - - .balign 16 -strlen: - ptabs r18, tr4 - - /* - * Note: We could easily deal with the NULL case here with a simple - * sanity check, though it seems that the behavior we want is to fault - * in the event that r2 == NULL, so we don't bother. - */ -/* beqi r2, 0, tr4 */ ! Sanity check - - movi -1, r0 - pta/l loop, tr0 -loop: - ld.b r2, 0, r1 - addi r2, 1, r2 - addi r0, 1, r0 - bnei/l r1, 0, tr0 - - or r0, r63, r2 - blink tr4, r63 - - .size strlen,.-strlen diff --git a/arch/sh/lib64/udelay.c b/arch/sh/lib64/udelay.c deleted file mode 100644 index f215b063da70..000000000000 --- a/arch/sh/lib64/udelay.c +++ /dev/null @@ -1,49 +0,0 @@ -/* - * arch/sh/lib64/udelay.c - * - * Delay routines, using a pre-computed "loops_per_jiffy" value. 
- * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003, 2004 Paul Mundt - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include - -/* - * Use only for very small delays (< 1 msec). - * - * The active part of our cycle counter is only 32-bits wide, and - * we're treating the difference between two marks as signed. On - * a 1GHz box, that's about 2 seconds. - */ - -void __delay(unsigned long loops) -{ - long long dummy; - __asm__ __volatile__("gettr tr0, %1\n\t" - "pta $+4, tr0\n\t" - "addi %0, -1, %0\n\t" - "bne %0, r63, tr0\n\t" - "ptabs %1, tr0\n\t":"=r"(loops), - "=r"(dummy) - :"0"(loops)); -} - -void __const_udelay(unsigned long xloops) -{ - __delay(xloops * (HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy)); -} - -void __udelay(unsigned long usecs) -{ - __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ -} - -void __ndelay(unsigned long nsecs) -{ - __const_udelay(nsecs * 0x00000005); -} diff --git a/arch/sh/lib64/udivdi3.S b/arch/sh/lib64/udivdi3.S deleted file mode 100644 index c032cb157589..000000000000 --- a/arch/sh/lib64/udivdi3.S +++ /dev/null @@ -1,121 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - .section .text..SHmedia32,"ax" - .align 2 - .global __udivdi3 -__udivdi3: - shlri r3,1,r4 - nsb r4,r22 - shlld r3,r22,r6 - shlri r6,49,r5 - movi 0xffffffffffffbaf1,r21 /* .l shift count 17. 
*/ - sub r21,r5,r1 - mmulfx.w r1,r1,r4 - mshflo.w r1,r63,r1 - sub r63,r22,r20 // r63 == 64 % 64 - mmulfx.w r5,r4,r4 - pta large_divisor,tr0 - addi r20,32,r9 - msub.w r1,r4,r1 - madd.w r1,r1,r1 - mmulfx.w r1,r1,r4 - shlri r6,32,r7 - bgt/u r9,r63,tr0 // large_divisor - mmulfx.w r5,r4,r4 - shlri r2,32+14,r19 - addi r22,-31,r0 - msub.w r1,r4,r1 - - mulu.l r1,r7,r4 - addi r1,-3,r5 - mulu.l r5,r19,r5 - sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 - shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as - the case may be, %0000000000000000 000.11111111111, still */ - muls.l r1,r4,r4 /* leaving at least one sign bit. */ - mulu.l r5,r3,r8 - mshalds.l r1,r21,r1 - shari r4,26,r4 - shlld r8,r0,r8 - add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) - sub r2,r8,r2 - /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */ - - shlri r2,22,r21 - mulu.l r21,r1,r21 - shlld r5,r0,r8 - addi r20,30-22,r0 - shlrd r21,r0,r21 - mulu.l r21,r3,r5 - add r8,r21,r8 - mcmpgt.l r21,r63,r21 // See Note 1 - addi r20,30,r0 - mshfhi.l r63,r21,r21 - sub r2,r5,r2 - andc r2,r21,r2 - - /* small divisor: need a third divide step */ - mulu.l r2,r1,r7 - ptabs r18,tr0 - addi r2,1,r2 - shlrd r7,r0,r7 - mulu.l r7,r3,r5 - add r8,r7,r8 - sub r2,r3,r2 - cmpgt r2,r5,r5 - add r8,r5,r2 - /* could test r3 here to check for divide by zero. */ - blink tr0,r63 - -large_divisor: - mmulfx.w r5,r4,r4 - shlrd r2,r9,r25 - shlri r25,32,r8 - msub.w r1,r4,r1 - - mulu.l r1,r7,r4 - addi r1,-3,r5 - mulu.l r5,r8,r5 - sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 - shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as - the case may be, %0000000000000000 000.11111111111, still */ - muls.l r1,r4,r4 /* leaving at least one sign bit. */ - shlri r5,14-1,r8 - mulu.l r8,r7,r5 - mshalds.l r1,r21,r1 - shari r4,26,r4 - add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 
0.5) - sub r25,r5,r25 - /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ - - shlri r25,22,r21 - mulu.l r21,r1,r21 - pta no_lo_adj,tr0 - addi r22,32,r0 - shlri r21,40,r21 - mulu.l r21,r7,r5 - add r8,r21,r8 - shlld r2,r0,r2 - sub r25,r5,r25 - bgtu/u r7,r25,tr0 // no_lo_adj - addi r8,1,r8 - sub r25,r7,r25 -no_lo_adj: - mextr4 r2,r25,r2 - - /* large_divisor: only needs a few adjustments. */ - mulu.l r8,r6,r5 - ptabs r18,tr0 - /* bubble */ - cmpgtu r5,r2,r5 - sub r8,r5,r2 - blink tr0,r63 - -/* Note 1: To shift the result of the second divide stage so that the result - always fits into 32 bits, yet we still reduce the rest sufficiently - would require a lot of instructions to do the shifts just right. Using - the full 64 bit shift result to multiply with the divisor would require - four extra instructions for the upper 32 bits (shift / mulu / shift / sub). - Fortunately, if the upper 32 bits of the shift result are nonzero, we - know that the rest after taking this partial result into account will - fit into 32 bits. So we just clear the upper 32 bits of the rest if the - upper 32 bits of the partial result are nonzero. */ diff --git a/arch/sh/lib64/udivsi3.S b/arch/sh/lib64/udivsi3.S deleted file mode 100644 index e4788fb4fe82..000000000000 --- a/arch/sh/lib64/udivsi3.S +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - .global __udivsi3 - .section .text..SHmedia32,"ax" - .align 2 - -/* - inputs: r4,r5 - clobbered: r18,r19,r20,r21,r22,r25,tr0 - result in r0. - */ -__udivsi3: - addz.l r5,r63,r22 - nsb r22,r0 - shlld r22,r0,r25 - shlri r25,48,r25 - movi 0xffffffffffffbb0c,r20 /* shift count eqiv 76 */ - sub r20,r25,r21 - mmulfx.w r21,r21,r19 - mshflo.w r21,r63,r21 - ptabs r18,tr0 - mmulfx.w r25,r19,r19 - sub r20,r0,r0 - /* bubble */ - msub.w r21,r19,r19 - - /* - * It would be nice for scheduling to do this add to r21 before - * the msub.w, but we need a different value for r19 to keep - * errors under control. 
- */ - addi r19,-2,r21 - mulu.l r4,r21,r18 - mmulfx.w r19,r19,r19 - shlli r21,15,r21 - shlrd r18,r0,r18 - mulu.l r18,r22,r20 - mmacnfx.wl r25,r19,r21 - /* bubble */ - sub r4,r20,r25 - - mulu.l r25,r21,r19 - addi r0,14,r0 - /* bubble */ - shlrd r19,r0,r19 - mulu.l r19,r22,r20 - add r18,r19,r18 - /* bubble */ - sub.l r25,r20,r25 - - mulu.l r25,r21,r19 - addz.l r25,r63,r25 - sub r25,r22,r25 - shlrd r19,r0,r19 - mulu.l r19,r22,r20 - addi r25,1,r25 - add r18,r19,r18 - - cmpgt r25,r20,r25 - add.l r18,r25,r0 - blink tr0,r63 diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 5c8a2ebfc720..6c39d24ad919 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -15,8 +15,7 @@ config MMU config PAGE_OFFSET hex - default "0x80000000" if MMU && SUPERH32 - default "0x20000000" if MMU && SUPERH64 + default "0x80000000" if MMU default "0x00000000" config FORCE_MAX_ZONEORDER @@ -72,12 +71,11 @@ config MEMORY_SIZE config 29BIT def_bool !32BIT - depends on SUPERH32 select UNCACHED_MAPPING config 32BIT bool - default y if CPU_SH5 || !MMU + default !MMU config PMB bool "Support 32-bit physical addressing through PMB" @@ -152,7 +150,7 @@ config ARCH_MEMORY_PROBE config IOREMAP_FIXED def_bool y - depends on X2TLB || SUPERH64 + depends on X2TLB config UNCACHED_MAPPING bool @@ -184,7 +182,7 @@ config PAGE_SIZE_16KB config PAGE_SIZE_64KB bool "64kB" - depends on !MMU || CPU_SH4 || CPU_SH5 + depends on !MMU || CPU_SH4 help This enables support for 64kB pages, possible on all SH-4 CPUs and later. 
@@ -216,10 +214,6 @@ config HUGETLB_PAGE_SIZE_64MB bool "64MB" depends on X2TLB -config HUGETLB_PAGE_SIZE_512MB - bool "512MB" - depends on CPU_SH5 - endchoice config SCHED_MC @@ -242,7 +236,7 @@ config SH7705_CACHE_32KB choice prompt "Cache mode" - default CACHE_WRITEBACK if CPU_SH2A || CPU_SH3 || CPU_SH4 || CPU_SH5 + default CACHE_WRITEBACK if CPU_SH2A || CPU_SH3 || CPU_SH4 default CACHE_WRITETHROUGH if (CPU_SH2 && !CPU_SH2A) config CACHE_WRITEBACK diff --git a/arch/sh/mm/Makefile b/arch/sh/mm/Makefile index 5051b38fd5b6..487da0ff03b3 100644 --- a/arch/sh/mm/Makefile +++ b/arch/sh/mm/Makefile @@ -10,15 +10,14 @@ cacheops-$(CONFIG_CPU_SUBTYPE_SH7619) := cache-sh2.o cacheops-$(CONFIG_CPU_SH2A) := cache-sh2a.o cacheops-$(CONFIG_CPU_SH3) := cache-sh3.o cacheops-$(CONFIG_CPU_SH4) := cache-sh4.o flush-sh4.o -cacheops-$(CONFIG_CPU_SH5) := cache-sh5.o flush-sh4.o cacheops-$(CONFIG_SH7705_CACHE_32KB) += cache-sh7705.o cacheops-$(CONFIG_CPU_SHX3) += cache-shx3.o obj-y += $(cacheops-y) mmu-y := nommu.o extable_32.o -mmu-$(CONFIG_MMU) := extable_$(BITS).o fault.o ioremap.o kmap.o \ - pgtable.o tlbex_$(BITS).o tlbflush_$(BITS).o +mmu-$(CONFIG_MMU) := extable_32.o fault.o ioremap.o kmap.o \ + pgtable.o tlbex_32.o tlbflush_32.o obj-y += $(mmu-y) @@ -31,7 +30,6 @@ ifdef CONFIG_MMU debugfs-$(CONFIG_CPU_SH4) += tlb-debugfs.o tlb-$(CONFIG_CPU_SH3) := tlb-sh3.o tlb-$(CONFIG_CPU_SH4) := tlb-sh4.o tlb-urb.o -tlb-$(CONFIG_CPU_SH5) := tlb-sh5.o tlb-$(CONFIG_CPU_HAS_PTEAEX) := tlb-pteaex.o tlb-urb.o obj-y += $(tlb-y) endif @@ -46,29 +44,4 @@ obj-$(CONFIG_HAVE_SRAM_POOL) += sram.o GCOV_PROFILE_pmb.o := n -# Special flags for tlbex_64.o. This puts restrictions on the number of -# caller-save registers that the compiler can target when building this file. -# This is required because the code is called from a context in entry.S where -# very few registers have been saved in the exception handler (for speed -# reasons). 
-# The caller save registers that have been saved and which can be used are -# r2,r3,r4,r5 : argument passing -# r15, r18 : SP and LINK -# tr0-4 : allow all caller-save TR's. The compiler seems to be able to make -# use of them, so it's probably beneficial to performance to save them -# and have them available for it. -# -# The resources not listed below are callee save, i.e. the compiler is free to -# use any of them and will spill them to the stack itself. - -CFLAGS_tlbex_64.o += -ffixed-r7 \ - -ffixed-r8 -ffixed-r9 -ffixed-r10 -ffixed-r11 -ffixed-r12 \ - -ffixed-r13 -ffixed-r14 -ffixed-r16 -ffixed-r17 -ffixed-r19 \ - -ffixed-r20 -ffixed-r21 -ffixed-r22 -ffixed-r23 \ - -ffixed-r24 -ffixed-r25 -ffixed-r26 -ffixed-r27 \ - -ffixed-r36 -ffixed-r37 -ffixed-r38 -ffixed-r39 -ffixed-r40 \ - -ffixed-r41 -ffixed-r42 -ffixed-r43 \ - -ffixed-r60 -ffixed-r61 -ffixed-r62 \ - -fomit-frame-pointer - ccflags-y := -Werror diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c deleted file mode 100644 index 445b5e69b73c..000000000000 --- a/arch/sh/mm/cache-sh5.c +++ /dev/null @@ -1,621 +0,0 @@ -/* - * arch/sh/mm/cache-sh5.c - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2002 Benedict Gaster - * Copyright (C) 2003 Richard Curnow - * Copyright (C) 2003 - 2008 Paul Mundt - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern void __weak sh4__flush_region_init(void); - -/* Wired TLB entry for the D-cache */ -static unsigned long long dtlb_cache_slot; - -/* - * The following group of functions deal with mapping and unmapping a - * temporary page into a DTLB slot that has been set aside for exclusive - * use. 
- */ -static inline void -sh64_setup_dtlb_cache_slot(unsigned long eaddr, unsigned long asid, - unsigned long paddr) -{ - local_irq_disable(); - sh64_setup_tlb_slot(dtlb_cache_slot, eaddr, asid, paddr); -} - -static inline void sh64_teardown_dtlb_cache_slot(void) -{ - sh64_teardown_tlb_slot(dtlb_cache_slot); - local_irq_enable(); -} - -static inline void sh64_icache_inv_all(void) -{ - unsigned long long addr, flag, data; - unsigned long flags; - - addr = ICCR0; - flag = ICCR0_ICI; - data = 0; - - /* Make this a critical section for safety (probably not strictly necessary.) */ - local_irq_save(flags); - - /* Without %1 it gets unexplicably wrong */ - __asm__ __volatile__ ( - "getcfg %3, 0, %0\n\t" - "or %0, %2, %0\n\t" - "putcfg %3, 0, %0\n\t" - "synci" - : "=&r" (data) - : "0" (data), "r" (flag), "r" (addr)); - - local_irq_restore(flags); -} - -static void sh64_icache_inv_kernel_range(unsigned long start, unsigned long end) -{ - /* Invalidate range of addresses [start,end] from the I-cache, where - * the addresses lie in the kernel superpage. */ - - unsigned long long ullend, addr, aligned_start; - aligned_start = (unsigned long long)(signed long long)(signed long) start; - addr = L1_CACHE_ALIGN(aligned_start); - ullend = (unsigned long long) (signed long long) (signed long) end; - - while (addr <= ullend) { - __asm__ __volatile__ ("icbi %0, 0" : : "r" (addr)); - addr += L1_CACHE_BYTES; - } -} - -static void sh64_icache_inv_user_page(struct vm_area_struct *vma, unsigned long eaddr) -{ - /* If we get called, we know that vma->vm_flags contains VM_EXEC. - Also, eaddr is page-aligned. */ - unsigned int cpu = smp_processor_id(); - unsigned long long addr, end_addr; - unsigned long flags = 0; - unsigned long running_asid, vma_asid; - addr = eaddr; - end_addr = addr + PAGE_SIZE; - - /* Check whether we can use the current ASID for the I-cache - invalidation. For example, if we're called via - access_process_vm->flush_cache_page->here, (e.g. 
when reading from - /proc), 'running_asid' will be that of the reader, not of the - victim. - - Also, note the risk that we might get pre-empted between the ASID - compare and blocking IRQs, and before we regain control, the - pid->ASID mapping changes. However, the whole cache will get - invalidated when the mapping is renewed, so the worst that can - happen is that the loop below ends up invalidating somebody else's - cache entries. - */ - - running_asid = get_asid(); - vma_asid = cpu_asid(cpu, vma->vm_mm); - if (running_asid != vma_asid) { - local_irq_save(flags); - switch_and_save_asid(vma_asid); - } - while (addr < end_addr) { - /* Worth unrolling a little */ - __asm__ __volatile__("icbi %0, 0" : : "r" (addr)); - __asm__ __volatile__("icbi %0, 32" : : "r" (addr)); - __asm__ __volatile__("icbi %0, 64" : : "r" (addr)); - __asm__ __volatile__("icbi %0, 96" : : "r" (addr)); - addr += 128; - } - if (running_asid != vma_asid) { - switch_and_save_asid(running_asid); - local_irq_restore(flags); - } -} - -static void sh64_icache_inv_user_page_range(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - /* Used for invalidating big chunks of I-cache, i.e. assume the range - is whole pages. If 'start' or 'end' is not page aligned, the code - is conservative and invalidates to the ends of the enclosing pages. - This is functionally OK, just a performance loss. */ - - /* See the comments below in sh64_dcache_purge_user_range() regarding - the choice of algorithm. However, for the I-cache option (2) isn't - available because there are no physical tags so aliases can't be - resolved. The icbi instruction has to be used through the user - mapping. Because icbi is cheaper than ocbp on a cache hit, it - would be cheaper to use the selective code for a large range than is - possible with the D-cache. Just assume 64 for now as a working - figure. 
- */ - int n_pages; - - if (!mm) - return; - - n_pages = ((end - start) >> PAGE_SHIFT); - if (n_pages >= 64) { - sh64_icache_inv_all(); - } else { - unsigned long aligned_start; - unsigned long eaddr; - unsigned long after_last_page_start; - unsigned long mm_asid, current_asid; - unsigned long flags = 0; - - mm_asid = cpu_asid(smp_processor_id(), mm); - current_asid = get_asid(); - - if (mm_asid != current_asid) { - /* Switch ASID and run the invalidate loop under cli */ - local_irq_save(flags); - switch_and_save_asid(mm_asid); - } - - aligned_start = start & PAGE_MASK; - after_last_page_start = PAGE_SIZE + ((end - 1) & PAGE_MASK); - - while (aligned_start < after_last_page_start) { - struct vm_area_struct *vma; - unsigned long vma_end; - vma = find_vma(mm, aligned_start); - if (!vma || (aligned_start <= vma->vm_end)) { - /* Avoid getting stuck in an error condition */ - aligned_start += PAGE_SIZE; - continue; - } - vma_end = vma->vm_end; - if (vma->vm_flags & VM_EXEC) { - /* Executable */ - eaddr = aligned_start; - while (eaddr < vma_end) { - sh64_icache_inv_user_page(vma, eaddr); - eaddr += PAGE_SIZE; - } - } - aligned_start = vma->vm_end; /* Skip to start of next region */ - } - - if (mm_asid != current_asid) { - switch_and_save_asid(current_asid); - local_irq_restore(flags); - } - } -} - -static void sh64_icache_inv_current_user_range(unsigned long start, unsigned long end) -{ - /* The icbi instruction never raises ITLBMISS. i.e. if there's not a - cache hit on the virtual tag the instruction ends there, without a - TLB lookup. */ - - unsigned long long aligned_start; - unsigned long long ull_end; - unsigned long long addr; - - ull_end = end; - - /* Just invalidate over the range using the natural addresses. TLB - miss handling will be OK (TBC). 
Since it's for the current process, - either we're already in the right ASID context, or the ASIDs have - been recycled since we were last active in which case we might just - invalidate another processes I-cache entries : no worries, just a - performance drop for him. */ - aligned_start = L1_CACHE_ALIGN(start); - addr = aligned_start; - while (addr < ull_end) { - __asm__ __volatile__ ("icbi %0, 0" : : "r" (addr)); - __asm__ __volatile__ ("nop"); - __asm__ __volatile__ ("nop"); - addr += L1_CACHE_BYTES; - } -} - -/* Buffer used as the target of alloco instructions to purge data from cache - sets by natural eviction. -- RPC */ -#define DUMMY_ALLOCO_AREA_SIZE ((L1_CACHE_BYTES << 10) + (1024 * 4)) -static unsigned char dummy_alloco_area[DUMMY_ALLOCO_AREA_SIZE] __cacheline_aligned = { 0, }; - -static inline void sh64_dcache_purge_sets(int sets_to_purge_base, int n_sets) -{ - /* Purge all ways in a particular block of sets, specified by the base - set number and number of sets. Can handle wrap-around, if that's - needed. */ - - int dummy_buffer_base_set; - unsigned long long eaddr, eaddr0, eaddr1; - int j; - int set_offset; - - dummy_buffer_base_set = ((int)&dummy_alloco_area & - cpu_data->dcache.entry_mask) >> - cpu_data->dcache.entry_shift; - set_offset = sets_to_purge_base - dummy_buffer_base_set; - - for (j = 0; j < n_sets; j++, set_offset++) { - set_offset &= (cpu_data->dcache.sets - 1); - eaddr0 = (unsigned long long)dummy_alloco_area + - (set_offset << cpu_data->dcache.entry_shift); - - /* - * Do one alloco which hits the required set per cache - * way. For write-back mode, this will purge the #ways - * resident lines. There's little point unrolling this - * loop because the allocos stall more if they're too - * close together. 
- */ - eaddr1 = eaddr0 + cpu_data->dcache.way_size * - cpu_data->dcache.ways; - - for (eaddr = eaddr0; eaddr < eaddr1; - eaddr += cpu_data->dcache.way_size) { - __asm__ __volatile__ ("alloco %0, 0" : : "r" (eaddr)); - __asm__ __volatile__ ("synco"); /* TAKum03020 */ - } - - eaddr1 = eaddr0 + cpu_data->dcache.way_size * - cpu_data->dcache.ways; - - for (eaddr = eaddr0; eaddr < eaddr1; - eaddr += cpu_data->dcache.way_size) { - /* - * Load from each address. Required because - * alloco is a NOP if the cache is write-through. - */ - if (test_bit(SH_CACHE_MODE_WT, &(cpu_data->dcache.flags))) - __raw_readb((unsigned long)eaddr); - } - } - - /* - * Don't use OCBI to invalidate the lines. That costs cycles - * directly. If the dummy block is just left resident, it will - * naturally get evicted as required. - */ -} - -/* - * Purge the entire contents of the dcache. The most efficient way to - * achieve this is to use alloco instructions on a region of unused - * memory equal in size to the cache, thereby causing the current - * contents to be discarded by natural eviction. The alternative, namely - * reading every tag, setting up a mapping for the corresponding page and - * doing an OCBP for the line, would be much more expensive. - */ -static void sh64_dcache_purge_all(void) -{ - - sh64_dcache_purge_sets(0, cpu_data->dcache.sets); -} - - -/* Assumes this address (+ (2**n_synbits) pages up from it) aren't used for - anything else in the kernel */ -#define MAGIC_PAGE0_START 0xffffffffec000000ULL - -/* Purge the physical page 'paddr' from the cache. It's known that any - * cache lines requiring attention have the same page colour as the the - * address 'eaddr'. - * - * This relies on the fact that the D-cache matches on physical tags when - * no virtual tag matches. So we create an alias for the original page - * and purge through that. 
(Alternatively, we could have done this by - * switching ASID to match the original mapping and purged through that, - * but that involves ASID switching cost + probably a TLBMISS + refill - * anyway.) - */ -static void sh64_dcache_purge_coloured_phy_page(unsigned long paddr, - unsigned long eaddr) -{ - unsigned long long magic_page_start; - unsigned long long magic_eaddr, magic_eaddr_end; - - magic_page_start = MAGIC_PAGE0_START + (eaddr & CACHE_OC_SYN_MASK); - - /* As long as the kernel is not pre-emptible, this doesn't need to be - under cli/sti. */ - sh64_setup_dtlb_cache_slot(magic_page_start, get_asid(), paddr); - - magic_eaddr = magic_page_start; - magic_eaddr_end = magic_eaddr + PAGE_SIZE; - - while (magic_eaddr < magic_eaddr_end) { - /* Little point in unrolling this loop - the OCBPs are blocking - and won't go any quicker (i.e. the loop overhead is parallel - to part of the OCBP execution.) */ - __asm__ __volatile__ ("ocbp %0, 0" : : "r" (magic_eaddr)); - magic_eaddr += L1_CACHE_BYTES; - } - - sh64_teardown_dtlb_cache_slot(); -} - -/* - * Purge a page given its physical start address, by creating a temporary - * 1 page mapping and purging across that. Even if we know the virtual - * address (& vma or mm) of the page, the method here is more elegant - * because it avoids issues of coping with page faults on the purge - * instructions (i.e. no special-case code required in the critical path - * in the TLB miss handling). - */ -static void sh64_dcache_purge_phy_page(unsigned long paddr) -{ - unsigned long long eaddr_start, eaddr, eaddr_end; - int i; - - /* As long as the kernel is not pre-emptible, this doesn't need to be - under cli/sti. 
*/ - eaddr_start = MAGIC_PAGE0_START; - for (i = 0; i < (1 << CACHE_OC_N_SYNBITS); i++) { - sh64_setup_dtlb_cache_slot(eaddr_start, get_asid(), paddr); - - eaddr = eaddr_start; - eaddr_end = eaddr + PAGE_SIZE; - while (eaddr < eaddr_end) { - __asm__ __volatile__ ("ocbp %0, 0" : : "r" (eaddr)); - eaddr += L1_CACHE_BYTES; - } - - sh64_teardown_dtlb_cache_slot(); - eaddr_start += PAGE_SIZE; - } -} - -static void sh64_dcache_purge_user_pages(struct mm_struct *mm, - unsigned long addr, unsigned long end) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - pte_t entry; - spinlock_t *ptl; - unsigned long paddr; - - if (!mm) - return; /* No way to find physical address of page */ - - pgd = pgd_offset(mm, addr); - if (pgd_bad(*pgd)) - return; - - pud = pud_offset(pgd, addr); - if (pud_none(*pud) || pud_bad(*pud)) - return; - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd) || pmd_bad(*pmd)) - return; - - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - do { - entry = *pte; - if (pte_none(entry) || !pte_present(entry)) - continue; - paddr = pte_val(entry) & PAGE_MASK; - sh64_dcache_purge_coloured_phy_page(paddr, addr); - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); -} - -/* - * There are at least 5 choices for the implementation of this, with - * pros (+), cons(-), comments(*): - * - * 1. ocbp each line in the range through the original user's ASID - * + no lines spuriously evicted - * - tlbmiss handling (must either handle faults on demand => extra - * special-case code in tlbmiss critical path), or map the page in - * advance (=> flush_tlb_range in advance to avoid multiple hits) - * - ASID switching - * - expensive for large ranges - * - * 2. 
temporarily map each page in the range to a special effective - * address and ocbp through the temporary mapping; relies on the - * fact that SH-5 OCB* always do TLB lookup and match on ptags (they - * never look at the etags) - * + no spurious evictions - * - expensive for large ranges - * * surely cheaper than (1) - * - * 3. walk all the lines in the cache, check the tags, if a match - * occurs create a page mapping to ocbp the line through - * + no spurious evictions - * - tag inspection overhead - * - (especially for small ranges) - * - potential cost of setting up/tearing down page mapping for - * every line that matches the range - * * cost partly independent of range size - * - * 4. walk all the lines in the cache, check the tags, if a match - * occurs use 4 * alloco to purge the line (+3 other probably - * innocent victims) by natural eviction - * + no tlb mapping overheads - * - spurious evictions - * - tag inspection overhead - * - * 5. implement like flush_cache_all - * + no tag inspection overhead - * - spurious evictions - * - bad for small ranges - * - * (1) can be ruled out as more expensive than (2). (2) appears best - * for small ranges. The choice between (3), (4) and (5) for large - * ranges and the range size for the large/small boundary need - * benchmarking to determine. - * - * For now use approach (2) for small ranges and (5) for large ones. - */ -static void sh64_dcache_purge_user_range(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - int n_pages = ((end - start) >> PAGE_SHIFT); - - if (n_pages >= 64 || ((start ^ (end - 1)) & PMD_MASK)) { - sh64_dcache_purge_all(); - } else { - /* Small range, covered by a single page table page */ - start &= PAGE_MASK; /* should already be so */ - end = PAGE_ALIGN(end); /* should already be so */ - sh64_dcache_purge_user_pages(mm, start, end); - } -} - -/* - * Invalidate the entire contents of both caches, after writing back to - * memory any dirty data from the D-cache. 
- */ -static void sh5_flush_cache_all(void *unused) -{ - sh64_dcache_purge_all(); - sh64_icache_inv_all(); -} - -/* - * Invalidate an entire user-address space from both caches, after - * writing back dirty data (e.g. for shared mmap etc). - * - * This could be coded selectively by inspecting all the tags then - * doing 4*alloco on any set containing a match (as for - * flush_cache_range), but fork/exit/execve (where this is called from) - * are expensive anyway. - * - * Have to do a purge here, despite the comments re I-cache below. - * There could be odd-coloured dirty data associated with the mm still - * in the cache - if this gets written out through natural eviction - * after the kernel has reused the page there will be chaos. - * - * The mm being torn down won't ever be active again, so any Icache - * lines tagged with its ASID won't be visible for the rest of the - * lifetime of this ASID cycle. Before the ASID gets reused, there - * will be a flush_cache_all. Hence we don't need to touch the - * I-cache. This is similar to the lack of action needed in - * flush_tlb_mm - see fault.c. - */ -static void sh5_flush_cache_mm(void *unused) -{ - sh64_dcache_purge_all(); -} - -/* - * Invalidate (from both caches) the range [start,end) of virtual - * addresses from the user address space specified by mm, after writing - * back any dirty data. - * - * Note, 'end' is 1 byte beyond the end of the range to flush. - */ -static void sh5_flush_cache_range(void *args) -{ - struct flusher_data *data = args; - struct vm_area_struct *vma; - unsigned long start, end; - - vma = data->vma; - start = data->addr1; - end = data->addr2; - - sh64_dcache_purge_user_range(vma->vm_mm, start, end); - sh64_icache_inv_user_page_range(vma->vm_mm, start, end); -} - -/* - * Invalidate any entries in either cache for the vma within the user - * address space vma->vm_mm for the page starting at virtual address - * 'eaddr'. This seems to be used primarily in breaking COW. 
Note, - * the I-cache must be searched too in case the page in question is - * both writable and being executed from (e.g. stack trampolines.) - * - * Note, this is called with pte lock held. - */ -static void sh5_flush_cache_page(void *args) -{ - struct flusher_data *data = args; - struct vm_area_struct *vma; - unsigned long eaddr, pfn; - - vma = data->vma; - eaddr = data->addr1; - pfn = data->addr2; - - sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT); - - if (vma->vm_flags & VM_EXEC) - sh64_icache_inv_user_page(vma, eaddr); -} - -static void sh5_flush_dcache_page(void *page) -{ - sh64_dcache_purge_phy_page(page_to_phys((struct page *)page)); - wmb(); -} - -/* - * Flush the range [start,end] of kernel virtual address space from - * the I-cache. The corresponding range must be purged from the - * D-cache also because the SH-5 doesn't have cache snooping between - * the caches. The addresses will be visible through the superpage - * mapping, therefore it's guaranteed that there no cache entries for - * the range in cache sets of the wrong colour. - */ -static void sh5_flush_icache_range(void *args) -{ - struct flusher_data *data = args; - unsigned long start, end; - - start = data->addr1; - end = data->addr2; - - __flush_purge_region((void *)start, end); - wmb(); - sh64_icache_inv_kernel_range(start, end); -} - -/* - * For the address range [start,end), write back the data from the - * D-cache and invalidate the corresponding region of the I-cache for the - * current process. Used to flush signal trampolines on the stack to - * make them executable. 
- */ -static void sh5_flush_cache_sigtramp(void *vaddr) -{ - unsigned long end = (unsigned long)vaddr + L1_CACHE_BYTES; - - __flush_wback_region(vaddr, L1_CACHE_BYTES); - wmb(); - sh64_icache_inv_current_user_range((unsigned long)vaddr, end); -} - -void __init sh5_cache_init(void) -{ - local_flush_cache_all = sh5_flush_cache_all; - local_flush_cache_mm = sh5_flush_cache_mm; - local_flush_cache_dup_mm = sh5_flush_cache_mm; - local_flush_cache_page = sh5_flush_cache_page; - local_flush_cache_range = sh5_flush_cache_range; - local_flush_dcache_page = sh5_flush_dcache_page; - local_flush_icache_range = sh5_flush_icache_range; - local_flush_cache_sigtramp = sh5_flush_cache_sigtramp; - - /* Reserve a slot for dcache colouring in the DTLB */ - dtlb_cache_slot = sh64_get_wired_dtlb_entry(); - - sh4__flush_region_init(); -} diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c index 464f160a9576..3aef78ceb820 100644 --- a/arch/sh/mm/cache.c +++ b/arch/sh/mm/cache.c @@ -355,12 +355,6 @@ void __init cpu_cache_init(void) } } - if (boot_cpu_data.family == CPU_FAMILY_SH5) { - extern void __weak sh5_cache_init(void); - - sh5_cache_init(); - } - skip: emit_cache_params(); } diff --git a/arch/sh/mm/extable_64.c b/arch/sh/mm/extable_64.c deleted file mode 100644 index 7a3b4d33d2e7..000000000000 --- a/arch/sh/mm/extable_64.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * arch/sh/mm/extable_64.c - * - * Copyright (C) 2003 Richard Curnow - * Copyright (C) 2003, 2004 Paul Mundt - * - * Cloned from the 2.5 SH version.. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. 
- */ -#include -#include -#include -#include - -extern unsigned long copy_user_memcpy, copy_user_memcpy_end; -extern void __copy_user_fixup(void); - -static const struct exception_table_entry __copy_user_fixup_ex = { - .fixup = (unsigned long)&__copy_user_fixup, -}; - -/* - * Some functions that may trap due to a bad user-mode address have too - * many loads and stores in them to make it at all practical to label - * each one and put them all in the main exception table. - * - * In particular, the fast memcpy routine is like this. It's fix-up is - * just to fall back to a slow byte-at-a-time copy, which is handled the - * conventional way. So it's functionally OK to just handle any trap - * occurring in the fast memcpy with that fixup. - */ -static const struct exception_table_entry *check_exception_ranges(unsigned long addr) -{ - if ((addr >= (unsigned long)©_user_memcpy) && - (addr <= (unsigned long)©_user_memcpy_end)) - return &__copy_user_fixup_ex; - - return NULL; -} - -static int cmp_ex_search(const void *key, const void *elt) -{ - const struct exception_table_entry *_elt = elt; - unsigned long _key = *(unsigned long *)key; - - /* avoid overflow */ - if (_key > _elt->insn) - return 1; - if (_key < _elt->insn) - return -1; - return 0; -} - -/* Simple binary search */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *base, - const size_t num, - unsigned long value) -{ - const struct exception_table_entry *mid; - - mid = check_exception_ranges(value); - if (mid) - return mid; - - return bsearch(&value, base, num, - sizeof(struct exception_table_entry), cmp_ex_search); -} - -int fixup_exception(struct pt_regs *regs) -{ - const struct exception_table_entry *fixup; - - fixup = search_exception_tables(regs->pc); - if (fixup) { - regs->pc = fixup->fixup; - return 1; - } - - return 0; -} diff --git a/arch/sh/mm/tlb-sh5.c b/arch/sh/mm/tlb-sh5.c deleted file mode 100644 index e4bb2a8e0a69..000000000000 --- a/arch/sh/mm/tlb-sh5.c 
+++ /dev/null @@ -1,224 +0,0 @@ -/* - * arch/sh/mm/tlb-sh5.c - * - * Copyright (C) 2003 Paul Mundt - * Copyright (C) 2003 Richard Curnow - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include -#include -#include -#include - -/** - * sh64_tlb_init - Perform initial setup for the DTLB and ITLB. - */ -int sh64_tlb_init(void) -{ - /* Assign some sane DTLB defaults */ - cpu_data->dtlb.entries = 64; - cpu_data->dtlb.step = 0x10; - - cpu_data->dtlb.first = DTLB_FIXED | cpu_data->dtlb.step; - cpu_data->dtlb.next = cpu_data->dtlb.first; - - cpu_data->dtlb.last = DTLB_FIXED | - ((cpu_data->dtlb.entries - 1) * - cpu_data->dtlb.step); - - /* And again for the ITLB */ - cpu_data->itlb.entries = 64; - cpu_data->itlb.step = 0x10; - - cpu_data->itlb.first = ITLB_FIXED | cpu_data->itlb.step; - cpu_data->itlb.next = cpu_data->itlb.first; - cpu_data->itlb.last = ITLB_FIXED | - ((cpu_data->itlb.entries - 1) * - cpu_data->itlb.step); - - return 0; -} - -/** - * sh64_next_free_dtlb_entry - Find the next available DTLB entry - */ -unsigned long long sh64_next_free_dtlb_entry(void) -{ - return cpu_data->dtlb.next; -} - -/** - * sh64_get_wired_dtlb_entry - Allocate a wired (locked-in) entry in the DTLB - */ -unsigned long long sh64_get_wired_dtlb_entry(void) -{ - unsigned long long entry = sh64_next_free_dtlb_entry(); - - cpu_data->dtlb.first += cpu_data->dtlb.step; - cpu_data->dtlb.next += cpu_data->dtlb.step; - - return entry; -} - -/** - * sh64_put_wired_dtlb_entry - Free a wired (locked-in) entry in the DTLB. - * - * @entry: Address of TLB slot. - * - * Works like a stack, last one to allocate must be first one to free. - */ -int sh64_put_wired_dtlb_entry(unsigned long long entry) -{ - __flush_tlb_slot(entry); - - /* - * We don't do any particularly useful tracking of wired entries, - * so this approach works like a stack .. 
last one to be allocated - * has to be the first one to be freed. - * - * We could potentially load wired entries into a list and work on - * rebalancing the list periodically (which also entails moving the - * contents of a TLB entry) .. though I have a feeling that this is - * more trouble than it's worth. - */ - - /* - * Entry must be valid .. we don't want any ITLB addresses! - */ - if (entry <= DTLB_FIXED) - return -EINVAL; - - /* - * Next, check if we're within range to be freed. (ie, must be the - * entry beneath the first 'free' entry! - */ - if (entry < (cpu_data->dtlb.first - cpu_data->dtlb.step)) - return -EINVAL; - - /* If we are, then bring this entry back into the list */ - cpu_data->dtlb.first -= cpu_data->dtlb.step; - cpu_data->dtlb.next = entry; - - return 0; -} - -/** - * sh64_setup_tlb_slot - Load up a translation in a wired slot. - * - * @config_addr: Address of TLB slot. - * @eaddr: Virtual address. - * @asid: Address Space Identifier. - * @paddr: Physical address. - * - * Load up a virtual<->physical translation for @eaddr<->@paddr in the - * pre-allocated TLB slot @config_addr (see sh64_get_wired_dtlb_entry). - */ -void sh64_setup_tlb_slot(unsigned long long config_addr, unsigned long eaddr, - unsigned long asid, unsigned long paddr) -{ - unsigned long long pteh, ptel; - - pteh = neff_sign_extend(eaddr); - pteh &= PAGE_MASK; - pteh |= (asid << PTEH_ASID_SHIFT) | PTEH_VALID; - ptel = neff_sign_extend(paddr); - ptel &= PAGE_MASK; - ptel |= (_PAGE_CACHABLE | _PAGE_READ | _PAGE_WRITE); - - asm volatile("putcfg %0, 1, %1\n\t" - "putcfg %0, 0, %2\n" - : : "r" (config_addr), "r" (ptel), "r" (pteh)); -} - -/** - * sh64_teardown_tlb_slot - Teardown a translation. - * - * @config_addr: Address of TLB slot. - * - * Teardown any existing mapping in the TLB slot @config_addr. 
- */ -void sh64_teardown_tlb_slot(unsigned long long config_addr) - __attribute__ ((alias("__flush_tlb_slot"))); - -static int dtlb_entry; -static unsigned long long dtlb_entries[64]; - -void tlb_wire_entry(struct vm_area_struct *vma, unsigned long addr, pte_t pte) -{ - unsigned long long entry; - unsigned long paddr, flags; - - BUG_ON(dtlb_entry == ARRAY_SIZE(dtlb_entries)); - - local_irq_save(flags); - - entry = sh64_get_wired_dtlb_entry(); - dtlb_entries[dtlb_entry++] = entry; - - paddr = pte_val(pte) & _PAGE_FLAGS_HARDWARE_MASK; - paddr &= ~PAGE_MASK; - - sh64_setup_tlb_slot(entry, addr, get_asid(), paddr); - - local_irq_restore(flags); -} - -void tlb_unwire_entry(void) -{ - unsigned long long entry; - unsigned long flags; - - BUG_ON(!dtlb_entry); - - local_irq_save(flags); - entry = dtlb_entries[dtlb_entry--]; - - sh64_teardown_tlb_slot(entry); - sh64_put_wired_dtlb_entry(entry); - - local_irq_restore(flags); -} - -void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte) -{ - unsigned long long ptel; - unsigned long long pteh=0; - struct tlb_info *tlbp; - unsigned long long next; - unsigned int fault_code = get_thread_fault_code(); - - /* Get PTEL first */ - ptel = pte.pte_low; - - /* - * Set PTEH register - */ - pteh = neff_sign_extend(address & MMU_VPN_MASK); - - /* Set the ASID. 
*/ - pteh |= get_asid() << PTEH_ASID_SHIFT; - pteh |= PTEH_VALID; - - /* Set PTEL register, set_pte has performed the sign extension */ - ptel &= _PAGE_FLAGS_HARDWARE_MASK; /* drop software flags */ - - if (fault_code & FAULT_CODE_ITLB) - tlbp = &cpu_data->itlb; - else - tlbp = &cpu_data->dtlb; - - next = tlbp->next; - __flush_tlb_slot(next); - asm volatile ("putcfg %0,1,%2\n\n\t" - "putcfg %0,0,%1\n" - : : "r" (next), "r" (pteh), "r" (ptel) ); - - next += TLB_STEP; - if (next > tlbp->last) - next = tlbp->first; - tlbp->next = next; -} diff --git a/arch/sh/mm/tlbex_64.c b/arch/sh/mm/tlbex_64.c deleted file mode 100644 index 8ff966dd0c74..000000000000 --- a/arch/sh/mm/tlbex_64.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * The SH64 TLB miss. - * - * Original code from fault.c - * Copyright (C) 2000, 2001 Paolo Alberelli - * - * Fast PTE->TLB refill path - * Copyright (C) 2003 Richard.Curnow@superh.com - * - * IMPORTANT NOTES : - * The do_fast_page_fault function is called from a context in entry.S - * where very few registers have been saved. In particular, the code in - * this file must be compiled not to use ANY caller-save registers that - * are not part of the restricted save set. Also, it means that code in - * this file must not make calls to functions elsewhere in the kernel, or - * else the excepting context will see corruption in its caller-save - * registers. Plus, the entry.S save area is non-reentrant, so this code - * has to run with SR.BL==1, i.e. no interrupts taken inside it and panic - * on any exception. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int handle_tlbmiss(unsigned long long protection_flags, - unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - pte_t entry; - - if (is_vmalloc_addr((void *)address)) { - pgd = pgd_offset_k(address); - } else { - if (unlikely(address >= TASK_SIZE || !current->mm)) - return 1; - - pgd = pgd_offset(current->mm, address); - } - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || !pud_present(*pud)) - return 1; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || !pmd_present(*pmd)) - return 1; - - pte = pte_offset_kernel(pmd, address); - entry = *pte; - if (pte_none(entry) || !pte_present(entry)) - return 1; - - /* - * If the page doesn't have sufficient protection bits set to - * service the kind of fault being handled, there's not much - * point doing the TLB refill. Punt the fault to the general - * handler. - */ - if ((pte_val(entry) & protection_flags) != protection_flags) - return 1; - - update_mmu_cache(NULL, address, pte); - - return 0; -} - -/* - * Put all this information into one structure so that everything is just - * arithmetic relative to a single base address. This reduces the number - * of movi/shori pairs needed just to load addresses of static data. - */ -struct expevt_lookup { - unsigned short protection_flags[8]; - unsigned char is_text_access[8]; - unsigned char is_write_access[8]; -}; - -#define PRU (1<<9) -#define PRW (1<<8) -#define PRX (1<<7) -#define PRR (1<<6) - -/* Sized as 8 rather than 4 to allow checking the PTE's PRU bit against whether - the fault happened in user mode or privileged mode. 
*/ -static struct expevt_lookup expevt_lookup_table = { - .protection_flags = {PRX, PRX, 0, 0, PRR, PRR, PRW, PRW}, - .is_text_access = {1, 1, 0, 0, 0, 0, 0, 0} -}; - -static inline unsigned int -expevt_to_fault_code(unsigned long expevt) -{ - if (expevt == 0xa40) - return FAULT_CODE_ITLB; - else if (expevt == 0x060) - return FAULT_CODE_WRITE; - - return 0; -} - -/* - This routine handles page faults that can be serviced just by refilling a - TLB entry from an existing page table entry. (This case represents a very - large majority of page faults.) Return 1 if the fault was successfully - handled. Return 0 if the fault could not be handled. (This leads into the - general fault handling in fault.c which deals with mapping file-backed - pages, stack growth, segmentation faults, swapping etc etc) - */ -asmlinkage int __kprobes -do_fast_page_fault(unsigned long long ssr_md, unsigned long long expevt, - unsigned long address) -{ - unsigned long long protection_flags; - unsigned long long index; - unsigned long long expevt4; - unsigned int fault_code; - - /* The next few lines implement a way of hashing EXPEVT into a - * small array index which can be used to lookup parameters - * specific to the type of TLBMISS being handled. - * - * Note: - * ITLBMISS has EXPEVT==0xa40 - * RTLBMISS has EXPEVT==0x040 - * WTLBMISS has EXPEVT==0x060 - */ - expevt4 = (expevt >> 4); - /* TODO : xor ssr_md into this expression too. Then we can check - * that PRU is set when it needs to be. 
*/ - index = expevt4 ^ (expevt4 >> 5); - index &= 7; - - fault_code = expevt_to_fault_code(expevt); - - protection_flags = expevt_lookup_table.protection_flags[index]; - - if (expevt_lookup_table.is_text_access[index]) - fault_code |= FAULT_CODE_ITLB; - if (!ssr_md) - fault_code |= FAULT_CODE_USER; - - set_thread_fault_code(fault_code); - - return handle_tlbmiss(protection_flags, address); -} diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c deleted file mode 100644 index bd0715d5dca4..000000000000 --- a/arch/sh/mm/tlbflush_64.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * arch/sh/mm/tlb-flush_64.c - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 Richard Curnow (/proc/tlb, bug fixes) - * Copyright (C) 2003 - 2012 Paul Mundt - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -void local_flush_tlb_one(unsigned long asid, unsigned long page) -{ - unsigned long long match, pteh=0, lpage; - unsigned long tlb; - - /* - * Sign-extend based on neff. 
- */ - lpage = neff_sign_extend(page); - match = (asid << PTEH_ASID_SHIFT) | PTEH_VALID; - match |= lpage; - - for_each_itlb_entry(tlb) { - asm volatile ("getcfg %1, 0, %0" - : "=r" (pteh) - : "r" (tlb) ); - - if (pteh == match) { - __flush_tlb_slot(tlb); - break; - } - } - - for_each_dtlb_entry(tlb) { - asm volatile ("getcfg %1, 0, %0" - : "=r" (pteh) - : "r" (tlb) ); - - if (pteh == match) { - __flush_tlb_slot(tlb); - break; - } - - } -} - -void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page) -{ - unsigned long flags; - - if (vma->vm_mm) { - page &= PAGE_MASK; - local_irq_save(flags); - local_flush_tlb_one(get_asid(), page); - local_irq_restore(flags); - } -} - -void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - unsigned long flags; - unsigned long long match, pteh=0, pteh_epn, pteh_low; - unsigned long tlb; - unsigned int cpu = smp_processor_id(); - struct mm_struct *mm; - - mm = vma->vm_mm; - if (cpu_context(cpu, mm) == NO_CONTEXT) - return; - - local_irq_save(flags); - - start &= PAGE_MASK; - end &= PAGE_MASK; - - match = (cpu_asid(cpu, mm) << PTEH_ASID_SHIFT) | PTEH_VALID; - - /* Flush ITLB */ - for_each_itlb_entry(tlb) { - asm volatile ("getcfg %1, 0, %0" - : "=r" (pteh) - : "r" (tlb) ); - - pteh_epn = pteh & PAGE_MASK; - pteh_low = pteh & ~PAGE_MASK; - - if (pteh_low == match && pteh_epn >= start && pteh_epn <= end) - __flush_tlb_slot(tlb); - } - - /* Flush DTLB */ - for_each_dtlb_entry(tlb) { - asm volatile ("getcfg %1, 0, %0" - : "=r" (pteh) - : "r" (tlb) ); - - pteh_epn = pteh & PAGE_MASK; - pteh_low = pteh & ~PAGE_MASK; - - if (pteh_low == match && pteh_epn >= start && pteh_epn <= end) - __flush_tlb_slot(tlb); - } - - local_irq_restore(flags); -} - -void local_flush_tlb_mm(struct mm_struct *mm) -{ - unsigned long flags; - unsigned int cpu = smp_processor_id(); - - if (cpu_context(cpu, mm) == NO_CONTEXT) - return; - - local_irq_save(flags); - - cpu_context(cpu, mm) = NO_CONTEXT; - if 
(mm == current->mm) - activate_context(mm, cpu); - - local_irq_restore(flags); -} - -void local_flush_tlb_all(void) -{ - /* Invalidate all, including shared pages, excluding fixed TLBs */ - unsigned long flags, tlb; - - local_irq_save(flags); - - /* Flush each ITLB entry */ - for_each_itlb_entry(tlb) - __flush_tlb_slot(tlb); - - /* Flush each DTLB entry */ - for_each_dtlb_entry(tlb) - __flush_tlb_slot(tlb); - - local_irq_restore(flags); -} - -void local_flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - /* FIXME: Optimize this later.. */ - flush_tlb_all(); -} - -void __flush_tlb_global(void) -{ - flush_tlb_all(); -} diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index ec873f09c763..527957d9c6ce 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1516,7 +1516,7 @@ config RTC_DRV_GENERIC tristate "Generic RTC support" # Please consider writing a new RTC driver instead of using the generic # RTC abstraction - depends on PARISC || M68K || PPC || SUPERH32 || COMPILE_TEST + depends on PARISC || M68K || PPC || SUPERH || COMPILE_TEST help Say Y or M here to enable RTC support on systems using the generic RTC abstraction. 
If you do not know what you are doing, you should diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 62dc4f577ba1..fb6efe5210e2 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -39,7 +39,7 @@ config ARCH_BINFMT_ELF_STATE config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on (ARM || (SUPERH32 && !MMU) || C6X) + depends on (ARM || (SUPERH && !MMU) || C6X) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh index a07668a5c36b..720f2e6b176a 100755 --- a/scripts/headers_install.sh +++ b/scripts/headers_install.sh @@ -81,9 +81,6 @@ arch/ia64/include/uapi/asm/cmpxchg.h:CONFIG_IA64_DEBUG_CMPXCHG arch/m68k/include/uapi/asm/ptrace.h:CONFIG_COLDFIRE arch/nios2/include/uapi/asm/swab.h:CONFIG_NIOS2_CI_SWAB_NO arch/nios2/include/uapi/asm/swab.h:CONFIG_NIOS2_CI_SWAB_SUPPORT -arch/sh/include/uapi/asm/ptrace.h:CONFIG_CPU_SH5 -arch/sh/include/uapi/asm/sigcontext.h:CONFIG_CPU_SH5 -arch/sh/include/uapi/asm/stat.h:CONFIG_CPU_SH5 arch/x86/include/uapi/asm/auxvec.h:CONFIG_IA32_EMULATION arch/x86/include/uapi/asm/auxvec.h:CONFIG_X86_64 arch/x86/include/uapi/asm/mman.h:CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS diff --git a/tools/arch/sh/include/asm/barrier.h b/tools/arch/sh/include/asm/barrier.h index bde5221af282..7eaea27cdd67 100644 --- a/tools/arch/sh/include/asm/barrier.h +++ b/tools/arch/sh/include/asm/barrier.h @@ -22,7 +22,7 @@ * Historically we have only done this type of barrier for the MMUCR, but * it's also necessary for the CCR, so we make it generic here instead. 
*/ -#if defined(__SH4A__) || defined(__SH5__) +#if defined(__SH4A__) #define mb() __asm__ __volatile__ ("synco": : :"memory") #define rmb() mb() #define wmb() mb() From 3563a6f4683eb08f9a437e028dd027ac31092381 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 30 May 2020 17:10:16 -0500 Subject: [PATCH 276/427] smb3: minor update to compression header definitions MS-SMB2 specification was updated in March. Make minor additions and corrections to compression related definitions in smb2pdu.h Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/smb2pdu.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 10acf90f858d..3b0e6acf9d7d 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -143,8 +143,17 @@ struct smb2_transform_hdr { __u64 SessionId; } __packed; +/* See MS-SMB2 2.2.42 */ +struct smb2_compression_transform_hdr { + __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ + __le32 OriginalCompressedSegmentSize; + __le16 CompressionAlgorithm; + __le16 Flags; + __le16 Length; /* if chained it is length, else offset */ +} __packed; + /* See MS-SMB2 2.2.42.1 */ -struct compression_playload_header { +struct compression_payload_header { __le16 AlgorithmId; __le16 Reserved; __le32 Length; @@ -333,7 +342,7 @@ struct smb2_encryption_neg_context { #define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002) #define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003) /* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */ -#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) +#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */ /* Compression Flags */ #define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000) From bbbf9eafbfdaa2af75fb5ed6d5ddb01be89e6d30 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 30 May 2020 17:29:50 -0500 Subject: [PATCH 277/427] cifs: fix minor typos in comments and log messages Fix four minor typos in comments and log messages Signed-off-by: Steve French 
Reviewed-by: Aurelien Aptel --- fs/cifs/sess.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3f8b43e77539..0ae25cc77fc0 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -122,7 +122,7 @@ int cifs_try_adding_channels(struct cifs_ses *ses) tries++; if (tries > 3*ses->chan_max) { - cifs_dbg(FYI, "too many attempt at opening channels (%d channels left to open)\n", + cifs_dbg(FYI, "too many channel open attempts (%d channels left to open)\n", left); break; } @@ -200,7 +200,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) vol.UNC = unc; vol.prepath = ""; - /* Re-use same version as master connection */ + /* Reuse same version as master connection */ vol.vals = ses->server->vals; vol.ops = ses->server->ops; @@ -263,7 +263,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) goto out; /* success, put it on the list - * XXX: sharing ses between 2 tcp server is not possible, the + * XXX: sharing ses between 2 tcp servers is not possible, the * way "internal" linked lists works in linux makes element * only able to belong to one list * @@ -972,7 +972,7 @@ sess_auth_lanman(struct sess_data *sess_data) /* Calculate hash with password and copy into bcc_ptr. * Encryption Key (stored as in cryptkey) gets used if the - * security mode bit in Negottiate Protocol response states + * security mode bit in Negotiate Protocol response states * to use challenge/response method (i.e. Password bit is 1). */ rc = calc_lanman_hash(ses->password, ses->server->cryptkey, From 5f68ea4aa98bcddb5ec5229d2a0933d84ed17732 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Wed, 22 Apr 2020 15:58:57 +0200 Subject: [PATCH 278/427] cifs: multichannel: move channel selection in function This commit moves channel picking code in separate function. 
Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 1 + fs/cifs/transport.c | 38 +++++++++++++++++++++++++++----------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8036216ce434..9767f9b5d315 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -96,6 +96,7 @@ extern int cifs_call_async(struct TCP_Server_Info *server, mid_receive_t *receive, mid_callback_t *callback, mid_handle_t *handle, void *cbdata, const int flags, const struct cifs_credits *exist_credits); +extern struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses); extern int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, struct smb_rqst *rqst, int *resp_buf_type, const int flags, struct kvec *resp_iov); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c359221d6848..4d4cb26d2ae1 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -992,6 +992,32 @@ cifs_cancelled_callback(struct mid_q_entry *mid) DeleteMidQEntry(mid); } +/* + * Return a channel (master if none) of @ses that can be used to send + * regular requests. + * + * If we are currently binding a new channel (negprot/sess.setup), + * return the new incomplete channel. 
+ */ +struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) +{ + uint index = 0; + + if (!ses) + return NULL; + + if (!ses->binding) { + /* round robin */ + if (ses->chan_count > 1) { + index = (uint)atomic_inc_return(&ses->chan_seq); + index %= ses->chan_count; + } + return ses->chans[index].server; + } else { + return cifs_ses_server(ses); + } +} + int compound_send_recv(const unsigned int xid, struct cifs_ses *ses, const int flags, const int num_rqst, struct smb_rqst *rqst, @@ -1017,17 +1043,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - if (!ses->binding) { - uint index = 0; - - if (ses->chan_count > 1) { - index = (uint)atomic_inc_return(&ses->chan_seq); - index %= ses->chan_count; - } - server = ses->chans[index].server; - } else { - server = cifs_ses_server(ses); - } + server = cifs_pick_channel(ses); if (server->tcpStatus == CifsExiting) return -ENOENT; From 3247bd10a4502a3075ce8e1c3c7d31ef76f193ce Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 28 May 2020 13:57:42 -0700 Subject: [PATCH 279/427] PCI: Add ACS quirk for Intel Root Complex Integrated Endpoints All Intel platforms guarantee that all root complex implementations must send transactions up to IOMMU for address translations. Hence for Intel RCiEP devices, we can assume some ACS-type isolation even without an ACS capability. From the Intel VT-d spec, r3.1, sec 3.16 ("Root-Complex Peer to Peer Considerations"): When DMA remapping is enabled, peer-to-peer requests through the Root-Complex must be handled as follows: - The input address in the request is translated (through first-level, second-level or nested translation) to a host physical address (HPA). The address decoding for peer addresses must be done only on the translated HPA. Hardware implementations are free to further limit peer-to-peer accesses to specific host physical address regions (or to completely disallow peer-forwarding of translated requests). 
- Since address translation changes the contents (address field) of the PCI Express Transaction Layer Packet (TLP), for PCI Express peer-to-peer requests with ECRC, the Root-Complex hardware must use the new ECRC (re-computed with the translated address) if it decides to forward the TLP as a peer request. - Root-ports, and multi-function root-complex integrated endpoints, may support additional peer-to-peer control features by supporting PCI Express Access Control Services (ACS) capability. Refer to ACS capability in PCI Express specifications for details. Since Linux didn't give special treatment to allow this exception, certain RCiEP MFD devices were grouped in a single IOMMU group. This doesn't permit a single device to be assigned to a guest for instance. In one vendor system: Device 14.x were grouped in a single IOMMU group. /sys/kernel/iommu_groups/5/devices/0000:00:14.0 /sys/kernel/iommu_groups/5/devices/0000:00:14.2 /sys/kernel/iommu_groups/5/devices/0000:00:14.3 After this patch: /sys/kernel/iommu_groups/5/devices/0000:00:14.0 /sys/kernel/iommu_groups/5/devices/0000:00:14.2 /sys/kernel/iommu_groups/6/devices/0000:00:14.3 <<< new group 14.0 and 14.2 are integrated devices, but legacy end points, whereas 14.3 was a PCIe-compliant RCiEP. 00:14.3 Network controller: Intel Corporation Device 9df0 (rev 30) Capabilities: [40] Express (v2) Root Complex Integrated Endpoint, MSI 00 This permits assigning this device to a guest VM. 
[bhelgaas: drop "Fixes" tag since this doesn't fix a bug in that commit] Link: https://lore.kernel.org/r/1590699462-7131-1-git-send-email-ashok.raj@intel.com Tested-by: Darrel Goeddel Signed-off-by: Ashok Raj Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson Cc: stable@vger.kernel.org Cc: Lu Baolu Cc: Mark Scott , Cc: Romil Sharma --- drivers/pci/quirks.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 15341eacc50d..5cd2a3bd115a 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4682,6 +4682,20 @@ static int pci_quirk_mf_endpoint_acs(struct pci_dev *dev, u16 acs_flags) PCI_ACS_CR | PCI_ACS_UF | PCI_ACS_DT); } +static int pci_quirk_rciep_acs(struct pci_dev *dev, u16 acs_flags) +{ + /* + * Intel RCiEP's are required to allow p2p only on translated + * addresses. Refer to Intel VT-d specification, r3.1, sec 3.16, + * "Root-Complex Peer to Peer Considerations". + */ + if (pci_pcie_type(dev) != PCI_EXP_TYPE_RC_END) + return -ENOTTY; + + return pci_acs_ctrl_enabled(acs_flags, + PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); +} + static int pci_quirk_brcm_acs(struct pci_dev *dev, u16 acs_flags) { /* @@ -4764,6 +4778,7 @@ static const struct pci_dev_acs_enabled { /* I219 */ { PCI_VENDOR_ID_INTEL, 0x15b7, pci_quirk_mf_endpoint_acs }, { PCI_VENDOR_ID_INTEL, 0x15b8, pci_quirk_mf_endpoint_acs }, + { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_quirk_rciep_acs }, /* QCOM QDF2xxx root ports */ { PCI_VENDOR_ID_QCOM, 0x0400, pci_quirk_qcom_rp_acs }, { PCI_VENDOR_ID_QCOM, 0x0401, pci_quirk_qcom_rp_acs }, From f0d50ca045e44dd7b423c2ae69f6a598f07d93e3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:56:59 +0900 Subject: [PATCH 280/427] kbuild: refactor KBUILD_VMLINUX_{OBJS,LIBS} calculation Do not overwrite core-y or drivers-y. Remove libs-y1 and libs-y2. 
Signed-off-by: Masahiro Yamada --- Makefile | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index c0c086d06753..24cf37c21cba 100644 --- a/Makefile +++ b/Makefile @@ -1069,19 +1069,18 @@ vmlinux-alldirs := $(sort $(vmlinux-dirs) Documentation \ build-dirs := $(vmlinux-dirs) clean-dirs := $(vmlinux-alldirs) -core-y := $(patsubst %/, %/built-in.a, $(core-y)) -drivers-y := $(patsubst %/, %/built-in.a, $(drivers-y)) -libs-y2 := $(patsubst %/, %/built-in.a, $(filter %/, $(libs-y))) -ifdef CONFIG_MODULES -libs-y1 := $(filter-out %/, $(libs-y)) -libs-y2 += $(patsubst %/, %/lib.a, $(filter %/, $(libs-y))) -else -libs-y1 := $(patsubst %/, %/lib.a, $(libs-y)) -endif - # Externally visible symbols (used by link-vmlinux.sh) -export KBUILD_VMLINUX_OBJS := $(head-y) $(core-y) $(libs-y2) $(drivers-y) -export KBUILD_VMLINUX_LIBS := $(libs-y1) +KBUILD_VMLINUX_OBJS := $(head-y) $(patsubst %/,%/built-in.a, $(core-y)) +KBUILD_VMLINUX_OBJS += $(addsuffix built-in.a, $(filter %/, $(libs-y))) +ifdef CONFIG_MODULES +KBUILD_VMLINUX_OBJS += $(patsubst %/, %/lib.a, $(filter %/, $(libs-y))) +KBUILD_VMLINUX_LIBS := $(filter-out %/, $(libs-y)) +else +KBUILD_VMLINUX_LIBS := $(patsubst %/,%/lib.a, $(libs-y)) +endif +KBUILD_VMLINUX_OBJS += $(patsubst %/,%/built-in.a, $(drivers-y)) + +export KBUILD_VMLINUX_OBJS KBUILD_VMLINUX_LIBS export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds export LDFLAGS_vmlinux # used by scripts/Makefile.package From b2c88554912267483baf8b4f5ae0a1bff529f6a3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:00 +0900 Subject: [PATCH 281/427] kbuild: update modules.order only when contained modules are updated Make modules.order depend on $(obj-m), and use if_changed to build it. This will avoid unneeded update of modules.order, which will be useful to optimize the modpost stage. Currently, the second pass of modpost is always invoked. 
By checking the timestamp of modules.order, we can avoid the unneeded modpost. Signed-off-by: Masahiro Yamada --- Makefile | 14 +++++++++++--- scripts/Makefile.build | 21 +++++++++++++-------- scripts/Makefile.lib | 27 ++++++++++++++------------- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 24cf37c21cba..b0bbf8453b66 100644 --- a/Makefile +++ b/Makefile @@ -1066,6 +1066,10 @@ vmlinux-alldirs := $(sort $(vmlinux-dirs) Documentation \ $(patsubst %/,%,$(filter %/, $(core-) \ $(drivers-) $(libs-)))) +subdir-modorder := $(addsuffix modules.order,$(filter %/, \ + $(core-y) $(core-m) $(libs-y) $(libs-m) \ + $(drivers-y) $(drivers-m))) + build-dirs := $(vmlinux-dirs) clean-dirs := $(vmlinux-alldirs) @@ -1124,7 +1128,7 @@ targets := vmlinux # The actual objects are generated when descending, # make sure no implicit rule kicks in -$(sort $(vmlinux-deps)): descend ; +$(sort $(vmlinux-deps) $(subdir-modorder)): descend ; filechk_kernel.release = \ echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))" @@ -1345,8 +1349,12 @@ PHONY += modules_check modules_check: modules.order $(Q)$(CONFIG_SHELL) $(srctree)/scripts/modules-check.sh $< -modules.order: descend - $(Q)$(AWK) '!x[$$0]++' $(addsuffix /$@, $(build-dirs)) > $@ +cmd_modules_order = $(AWK) '!x[$$0]++' $(real-prereqs) > $@ + +modules.order: $(subdir-modorder) FORCE + $(call if_changed,modules_order) + +targets += modules.order # Target to prepare building external modules PHONY += modules_prepare diff --git a/scripts/Makefile.build b/scripts/Makefile.build index a1f09bec8c70..2e8810b7e5ed 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -71,7 +71,7 @@ endif # subdir-builtin and subdir-modorder may contain duplications. Use $(sort ...) 
subdir-builtin := $(sort $(filter %/built-in.a, $(real-obj-y))) -subdir-modorder := $(sort $(filter %/modules.order, $(modorder))) +subdir-modorder := $(sort $(filter %/modules.order, $(obj-m))) targets-for-builtin := $(extra-y) @@ -83,8 +83,7 @@ ifdef need-builtin targets-for-builtin += $(obj)/built-in.a endif -targets-for-modules := $(obj-m) -targets-for-modules += $(patsubst %.o, %.mod, $(obj-m)) +targets-for-modules := $(patsubst %.o, %.mod, $(filter %.o, $(obj-m))) ifdef need-modorder targets-for-modules += $(obj)/modules.order @@ -361,8 +360,9 @@ endif $(obj)/%.o: $(src)/%.S $(objtool_dep) FORCE $(call if_changed_rule,as_o_S) -targets += $(filter-out $(subdir-builtin), $(real-obj-y)) $(real-obj-m) $(lib-y) -targets += $(always-y) $(MAKECMDGOALS) +targets += $(filter-out $(subdir-builtin), $(real-obj-y)) +targets += $(filter-out $(subdir-modorder), $(real-obj-m)) +targets += $(lib-y) $(always-y) $(MAKECMDGOALS) # Linker scripts preprocessor (.lds.S -> .lds) # --------------------------------------------------------------------------- @@ -404,11 +404,16 @@ $(obj)/built-in.a: $(real-obj-y) FORCE # # Create commands to either record .ko file or cat modules.order from # a subdirectory -$(obj)/modules.order: $(subdir-modorder) FORCE - $(Q){ $(foreach m, $(modorder), \ - $(if $(filter $^, $m), cat $m, echo $m);) :; } \ +# Add $(obj-m) as the prerequisite to avoid updating the timestamp of +# modules.order unless contained modules are updated. 
+ +cmd_modules_order = { $(foreach m, $(real-prereqs), \ + $(if $(filter %/modules.order, $m), cat $m, echo $(patsubst %.o,%.ko,$m));) :; } \ | $(AWK) '!x[$$0]++' - > $@ +$(obj)/modules.order: $(obj-m) FORCE + $(call if_changed,modules_order) + # # Rule to compile a set of .o files into one .a file (with symbol table) # diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 748e44d5a1e3..e598b07e6de4 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -32,27 +32,29 @@ obj-m := $(filter-out $(obj-y),$(obj-m)) # Filter out objects already built-in lib-y := $(filter-out $(obj-y), $(sort $(lib-y) $(lib-m))) -# Determine modorder. -# Unfortunately, we don't have information about ordering between -y -# and -m subdirs. Just put -y's first. -ifdef need-modorder -modorder := $(patsubst %/,%/modules.order, $(filter %/, $(obj-y)) $(obj-m:.o=.ko)) -endif - # Subdirectories we need to descend into subdir-ym := $(sort $(subdir-y) $(subdir-m) \ $(patsubst %/,%, $(filter %/, $(obj-y) $(obj-m)))) -# Handle objects in subdirs -# --------------------------------------------------------------------------- -# o if we encounter foo/ in $(obj-y), replace it by foo/built-in.a -# o if we encounter foo/ in $(obj-m), remove it from $(obj-m) +# Handle objects in subdirs: +# - If we encounter foo/ in $(obj-y), replace it by foo/built-in.a and +# foo/modules.order +# - If we encounter foo/ in $(obj-m), replace it by foo/modules.order +# +# Generate modules.order to determine modorder. Unfortunately, we don't have +# information about ordering between -y and -m subdirs. Just put -y's first. 
+ +ifdef need-modorder +obj-m := $(patsubst %/,%/modules.order, $(filter %/, $(obj-y)) $(obj-m)) +else +obj-m := $(filter-out %/, $(obj-m)) +endif + ifdef need-builtin obj-y := $(patsubst %/, %/built-in.a, $(obj-y)) else obj-y := $(filter-out %/, $(obj-y)) endif -obj-m := $(filter-out %/, $(obj-m)) # If $(foo-objs), $(foo-y), $(foo-m), or $(foo-) exists, foo.o is a composite object multi-used-y := $(sort $(foreach m,$(obj-y), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-))), $(m)))) @@ -81,7 +83,6 @@ endif extra-y := $(addprefix $(obj)/,$(extra-y)) always-y := $(addprefix $(obj)/,$(always-y)) targets := $(addprefix $(obj)/,$(targets)) -modorder := $(addprefix $(obj)/,$(modorder)) obj-m := $(addprefix $(obj)/,$(obj-m)) lib-y := $(addprefix $(obj)/,$(lib-y)) real-obj-y := $(addprefix $(obj)/,$(real-obj-y)) From 91e6ee581270b8ae970f028b898314d73f16870b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:01 +0900 Subject: [PATCH 282/427] modpost: fix -i (--ignore-errors) MAKEFLAGS detection $(filter -i,$(MAKEFLAGS)) works only in limited use-cases. The representation of $(MAKEFLAGS) depends on various factors: - GNU Make version (version 3.8x or version 4.x) - The presence of other flags like -j In my experiments, $(MAKEFLAGS) is expanded as follows: * GNU Make 3.8x: * without -j option: --no-print-directory -Rri * with -j option: --no-print-directory -Rr --jobserver-fds=3,4 -j -i * GNU Make 4.x: * without -j option: irR --no-print-directory * with -j option: irR -j --jobserver-fds=3,4 --no-print-directory For GNU Make 4.x, the flags are grouped as 'irR', which does not work. For the single thread build with GNU Make 3.8x, the flags are grouped as '-Rri', which does not work either. To make it work for all cases, do likewise as commit 6f0fa58e4596 ("kbuild: simplify silent build (-s) detection"). 
BTW, since commit ff9b45c55b26 ("kbuild: modpost: read modules.order instead of $(MODVERDIR)/*.mod"), you also need to pass -k option to build final *.ko files. 'make -i -k' ignores compile errors in modules, and build as many remaining *.ko as possible. Please note this feature is kind of dangerous if other modules depend on the broken module because the generated modules will lack the correct module dependency or CRC. Honestly, I am not a big fan of it, but I am keeping this feature. Fixes: eed380f3f593 ("modpost: Optionally ignore secondary errors seen if a single module build fails") Cc: Guenter Roeck Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index b79bf0e30d32..cadc74c6b5a4 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -66,7 +66,7 @@ __modpost: else -MODPOST += $(subst -i,-n,$(filter -i,$(MAKEFLAGS))) -s -T - \ +MODPOST += -s -T - \ $(if $(KBUILD_NSDEPS),-d $(MODULES_NSDEPS)) ifeq ($(KBUILD_EXTMOD),) @@ -82,6 +82,11 @@ include $(if $(wildcard $(KBUILD_EXTMOD)/Kbuild), \ $(KBUILD_EXTMOD)/Kbuild, $(KBUILD_EXTMOD)/Makefile) endif +# 'make -i -k' ignores compile errors, and builds as many modules as possible. +ifneq ($(findstring i,$(filter-out --%,$(MAKEFLAGS))),) +MODPOST += -n +endif + # find all modules listed in modules.order modules := $(sort $(shell cat $(MODORDER))) From 89d6117680bd8ac6a86f58576de0bd6905421707 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:02 +0900 Subject: [PATCH 283/427] modpost: move -T option close to the modpost command The '-T -' option reads the file list from stdin. It is clearer to put it close to the piped command. 
Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index cadc74c6b5a4..ac143c085182 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -66,7 +66,7 @@ __modpost: else -MODPOST += -s -T - \ +MODPOST += -s \ $(if $(KBUILD_NSDEPS),-d $(MODULES_NSDEPS)) ifeq ($(KBUILD_EXTMOD),) @@ -93,7 +93,7 @@ modules := $(sort $(shell cat $(MODORDER))) # Read out modules.order instead of expanding $(modules) to pass in modpost. # Otherwise, allmodconfig would fail with "Argument list too long". quiet_cmd_modpost = MODPOST $(words $(modules)) modules - cmd_modpost = sed 's/ko$$/o/' $(MODORDER) | $(MODPOST) + cmd_modpost = sed 's/ko$$/o/' $(MODORDER) | $(MODPOST) -T - __modpost: $(call cmd,modpost) From 4e5ab74c3cbbe7ca2b907a86ce5140e442b340bf Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:03 +0900 Subject: [PATCH 284/427] modpost: pass -N option only for modules modpost The built-in only code is not required to have MODULE_IMPORT_NS() to use symbols. So, the namespace is not checked for vmlinux(.o). Do not pass the meaningless -N option to the first pass of modpost. 
Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index ac143c085182..3334f100a490 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -53,7 +53,6 @@ MODPOST = scripts/mod/modpost \ $(if $(KBUILD_EXTMOD),$(addprefix -e ,$(KBUILD_EXTRA_SYMBOLS))) \ $(if $(KBUILD_EXTMOD),-o $(modulesymfile)) \ $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ - $(if $(CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS)$(KBUILD_NSDEPS),-N) \ $(if $(KBUILD_MODPOST_WARN),-w) ifdef MODPOST_VMLINUX @@ -82,6 +81,10 @@ include $(if $(wildcard $(KBUILD_EXTMOD)/Kbuild), \ $(KBUILD_EXTMOD)/Kbuild, $(KBUILD_EXTMOD)/Makefile) endif +# modpost options for modules (both in-kernel and external) +MODPOST += \ + $(if $(CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS)$(KBUILD_NSDEPS),-N) + # 'make -i -k' ignores compile errors, and builds as many modules as possible. ifneq ($(findstring i,$(filter-out --%,$(MAKEFLAGS))),) MODPOST += -n From 2beee868997485a5718a349c7868260d5ee7378f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:04 +0900 Subject: [PATCH 285/427] modpost: load KBUILD_EXTRA_SYMBOLS files in order Currently, modpost reads extra symbol dump files in the reverse order. If '-e foo -e bar' is given, modpost reads bar, foo, in this order. This is probably not a big deal, but there is no good reason to reverse the order. Read files in the given order. 
Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 160139508821..5224a02edbf2 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2555,8 +2555,8 @@ int main(int argc, char **argv) int opt; int err; int n; - struct ext_sym_list *extsym_iter; struct ext_sym_list *extsym_start = NULL; + struct ext_sym_list **extsym_iter = &extsym_start; while ((opt = getopt(argc, argv, "i:e:mnsT:o:awENd:")) != -1) { switch (opt) { @@ -2566,11 +2566,9 @@ int main(int argc, char **argv) break; case 'e': external_module = 1; - extsym_iter = - NOFAIL(malloc(sizeof(*extsym_iter))); - extsym_iter->next = extsym_start; - extsym_iter->file = optarg; - extsym_start = extsym_iter; + *extsym_iter = NOFAIL(calloc(1, sizeof(**extsym_iter))); + (*extsym_iter)->file = optarg; + extsym_iter = &(*extsym_iter)->next; break; case 'm': modversions = 1; @@ -2610,10 +2608,12 @@ int main(int argc, char **argv) if (kernel_read) read_dump(kernel_read, 1); while (extsym_start) { + struct ext_sym_list *tmp; + read_dump(extsym_start->file, 0); - extsym_iter = extsym_start->next; + tmp = extsym_start->next; free(extsym_start); - extsym_start = extsym_iter; + extsym_start = tmp; } while (optind < argc) From 067a436b1b0aafa593344fddd711a755a58afb3b Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 3 Jun 2020 17:08:20 +0200 Subject: [PATCH 286/427] ima: Directly assign the ima_default_policy pointer to ima_rules This patch prevents the following oops: [ 10.771813] BUG: kernel NULL pointer dereference, address: 0000000000000 [...] [ 10.779790] RIP: 0010:ima_match_policy+0xf7/0xb80 [...] [ 10.798576] Call Trace: [ 10.798993] ? ima_lsm_policy_change+0x2b0/0x2b0 [ 10.799753] ? inode_init_owner+0x1a0/0x1a0 [ 10.800484] ? _raw_spin_lock+0x7a/0xd0 [ 10.801592] ima_must_appraise.part.0+0xb6/0xf0 [ 10.802313] ? 
ima_fix_xattr.isra.0+0xd0/0xd0 [ 10.803167] ima_must_appraise+0x4f/0x70 [ 10.804004] ima_post_path_mknod+0x2e/0x80 [ 10.804800] do_mknodat+0x396/0x3c0 It occurs when there is a failure during IMA initialization, and ima_init_policy() is not called. IMA hooks still call ima_match_policy() but ima_rules is NULL. This patch prevents the crash by directly assigning the ima_default_policy pointer to ima_rules when ima_rules is defined. This wouldn't alter the existing behavior, as ima_rules is always set at the end of ima_init_policy(). Cc: stable@vger.kernel.org # 3.7.x Fixes: 07f6a79415d7d ("ima: add appraise action keywords and default rules") Reported-by: Takashi Iwai Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_policy.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index ef7f68cc935e..e493063a3c34 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -204,7 +204,7 @@ static struct ima_rule_entry *arch_policy_entry __ro_after_init; static LIST_HEAD(ima_default_rules); static LIST_HEAD(ima_policy_rules); static LIST_HEAD(ima_temp_rules); -static struct list_head *ima_rules; +static struct list_head *ima_rules = &ima_default_rules; /* Pre-allocated buffer used for matching keyrings. */ static char *ima_keyrings; @@ -768,7 +768,6 @@ void __init ima_init_policy(void) ARRAY_SIZE(default_appraise_rules), IMA_DEFAULT_POLICY); - ima_rules = &ima_default_rules; ima_update_policy_flag(); } From 6cc7c266e5b47d3cd2b5bb7fd3aac4e6bb2dd1d2 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 3 Jun 2020 17:08:21 +0200 Subject: [PATCH 287/427] ima: Call ima_calc_boot_aggregate() in ima_eventdigest_init() If the template field 'd' is chosen and the digest to be added to the measurement entry was not calculated with SHA1 or MD5, it is recalculated with SHA1, by using the passed file descriptor. 
However, this cannot be done for boot_aggregate, because there is no file descriptor. This patch adds a call to ima_calc_boot_aggregate() in ima_eventdigest_init(), so that the digest can be recalculated also for the boot_aggregate entry. Cc: stable@vger.kernel.org # 3.13.x Fixes: 3ce1217d6cd5d ("ima: define template fields library and new helpers") Reported-by: Takashi Iwai Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima.h | 3 ++- security/integrity/ima/ima_crypto.c | 6 +++--- security/integrity/ima/ima_init.c | 2 +- security/integrity/ima/ima_template_lib.c | 18 ++++++++++++++++++ 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 02796473238b..df93ac258e01 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -57,6 +57,7 @@ extern int ima_hash_algo_idx __ro_after_init; extern int ima_extra_slots __ro_after_init; extern int ima_appraise; extern struct tpm_chip *ima_tpm_chip; +extern const char boot_aggregate_name[]; /* IMA event related data */ struct ima_event_data { @@ -144,7 +145,7 @@ int ima_calc_buffer_hash(const void *buf, loff_t len, struct ima_digest_data *hash); int ima_calc_field_array_hash(struct ima_field_data *field_data, struct ima_template_entry *entry); -int __init ima_calc_boot_aggregate(struct ima_digest_data *hash); +int ima_calc_boot_aggregate(struct ima_digest_data *hash); void ima_add_violation(struct file *file, const unsigned char *filename, struct integrity_iint_cache *iint, const char *op, const char *cause); diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 5201f5ec2ce4..002fdf6994d5 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -806,8 +806,8 @@ static void __init ima_pcrread(u32 idx, struct tpm_digest *d) * hash algorithm for reading the TPM PCRs as for calculating the boot * aggregate digest as stored in 
the measurement list. */ -static int __init ima_calc_boot_aggregate_tfm(char *digest, u16 alg_id, - struct crypto_shash *tfm) +static int ima_calc_boot_aggregate_tfm(char *digest, u16 alg_id, + struct crypto_shash *tfm) { struct tpm_digest d = { .alg_id = alg_id, .digest = {0} }; int rc; @@ -835,7 +835,7 @@ static int __init ima_calc_boot_aggregate_tfm(char *digest, u16 alg_id, return rc; } -int __init ima_calc_boot_aggregate(struct ima_digest_data *hash) +int ima_calc_boot_aggregate(struct ima_digest_data *hash) { struct crypto_shash *tfm; u16 crypto_id, alg_id; diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c index fc1e1002b48d..4902fe7bd570 100644 --- a/security/integrity/ima/ima_init.c +++ b/security/integrity/ima/ima_init.c @@ -19,7 +19,7 @@ #include "ima.h" /* name for boot aggregate entry */ -static const char boot_aggregate_name[] = "boot_aggregate"; +const char boot_aggregate_name[] = "boot_aggregate"; struct tpm_chip *ima_tpm_chip; /* Add the boot aggregate to the IMA measurement list and extend diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c index 9cd1e50f3ccc..635c6ac05050 100644 --- a/security/integrity/ima/ima_template_lib.c +++ b/security/integrity/ima/ima_template_lib.c @@ -286,6 +286,24 @@ int ima_eventdigest_init(struct ima_event_data *event_data, goto out; } + if ((const char *)event_data->filename == boot_aggregate_name) { + if (ima_tpm_chip) { + hash.hdr.algo = HASH_ALGO_SHA1; + result = ima_calc_boot_aggregate(&hash.hdr); + + /* algo can change depending on available PCR banks */ + if (!result && hash.hdr.algo != HASH_ALGO_SHA1) + result = -EINVAL; + + if (result < 0) + memset(&hash, 0, sizeof(hash)); + } + + cur_digest = hash.hdr.digest; + cur_digestsize = hash_digest_size[HASH_ALGO_SHA1]; + goto out; + } + if (!event_data->file) /* missing info to re-calculate the digest */ return -EINVAL; From 6b6aeffc932d5469c0dbb114bee59f34e8e02e65 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Carlos=20Guerrero=20=C3=81lvarez?= Date: Thu, 16 Apr 2020 16:14:56 +0200 Subject: [PATCH 288/427] ext4: fix a style issue in fs/ext4/acl.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed an if statement where braces were not needed. Link: https://lore.kernel.org/r/20200416141456.1089-1-carlosteniswarrior@gmail.com Signed-off-by: Carlos Guerrero Álvarez Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani --- fs/ext4/acl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 8c7bbf3e566d..b3eba92f38f5 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -215,9 +215,8 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, value, size, xattr_flags); kfree(value); - if (!error) { + if (!error) set_cached_acl(inode, type, acl); - } return error; } From 9e52484c713321e84e8834803a44ca0a001376d2 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Wed, 15 Apr 2020 16:31:39 -0400 Subject: [PATCH 289/427] ext4: remove EXT4_GET_BLOCKS_KEEP_SIZE flag The eofblocks code was removed in the 5.7 release by "ext4: remove EOFBLOCKS_FL and associated code" (4337ecd1fe99). The ext4_map_blocks() flag used to trigger it can now be removed as well. 
Signed-off-by: Eric Whitney Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200415203140.30349-2-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 -- fs/ext4/extents.c | 4 ---- fs/ext4/inode.c | 12 ++++-------- include/trace/events/ext4.h | 1 - 4 files changed, 4 insertions(+), 15 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 91eb4381cae5..c8d060627448 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -609,8 +609,6 @@ enum { #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 - /* Request will not result in inode size update (user for fallocate) */ -#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* Convert written extents to unwritten */ #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 /* Write zeros to newly created written extents */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f2b577b315a0..5809891eadab 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4507,8 +4507,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - if (mode & FALLOC_FL_KEEP_SIZE) - flags |= EXT4_GET_BLOCKS_KEEP_SIZE; /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); @@ -4647,8 +4645,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - if (mode & FALLOC_FL_KEEP_SIZE) - flags |= EXT4_GET_BLOCKS_KEEP_SIZE; inode_lock(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4aae6acdcb..693a3722337a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -432,11 +432,9 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + 
retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); @@ -541,11 +539,9 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + retval = ext4_ind_map_blocks(handle, inode, map, 0); } if (retval > 0) { unsigned int status; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 19c87661eeec..40ff8a2fc763 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -45,7 +45,6 @@ struct partial_cluster; { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ - { EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \ { EXT4_GET_BLOCKS_ZERO, "ZERO" }) /* From 493e83aafa02316bc79ec90041c378d7902194fa Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Wed, 15 Apr 2020 16:31:40 -0400 Subject: [PATCH 290/427] ext4: translate a few more map flags to strings in tracepoints As new ext4_map_blocks() flags have been added, not all have gotten flag bit to string translations to make tracepoint output more readable. Fix that, and go one step further by adding a translation for the EXT4_EX_NOCACHE flag as well. The EXT4_EX_FORCE_CACHE flag can never be set in a tracepoint in the current code, so there's no need to bother with a translation for it right now. 
Signed-off-by: Eric Whitney Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200415203140.30349-3-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- include/trace/events/ext4.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 40ff8a2fc763..280475c1cecc 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -45,7 +45,10 @@ struct partial_cluster; { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ - { EXT4_GET_BLOCKS_ZERO, "ZERO" }) + { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, "CONVERT_UNWRITTEN" }, \ + { EXT4_GET_BLOCKS_ZERO, "ZERO" }, \ + { EXT4_GET_BLOCKS_IO_SUBMIT, "IO_SUBMIT" }, \ + { EXT4_EX_NOCACHE, "EX_NOCACHE" }) /* * __print_flags() requires that all enum values be wrapped in the From 39c0ae163f3b3ae691e7cce226ba1984ef6456b1 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 20 Apr 2020 12:29:18 +0800 Subject: [PATCH 291/427] ext4: remove unnecessary comparisons to bool Fix the following coccicheck warning: fs/ext4/extents_status.c:1057:5-28: WARNING: Comparison to bool fs/ext4/inode.c:2314:18-24: WARNING: Comparison to bool Signed-off-by: Jason Yan Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20200420042918.19459-1-yanaijie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 2 +- fs/ext4/inode.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d996b44d2265..e75171535375 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1054,7 +1054,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, end = (end > ext4_es_end(es)) ? 
ext4_es_end(es) : end; /* record the first block of the first delonly extent seen */ - if (rc->first_do_lblk_found == false) { + if (!rc->first_do_lblk_found) { rc->first_do_lblk = i; rc->first_do_lblk_found = true; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 693a3722337a..4a3381eb1bbe 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2307,7 +2307,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) * mapping, or maybe the page was submitted for IO. * So we return to call further extent mapping. */ - if (err < 0 || map_bh == true) + if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); From c36a71b4e35ab35340facdd6964a00956b9fef0a Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Mon, 20 Apr 2020 19:39:59 -0700 Subject: [PATCH 292/427] ext4: fix EXT_MAX_EXTENT/INDEX to check for zeroed eh_max If eh->eh_max is 0, EXT_MAX_EXTENT/INDEX would evaluate to unsigned (-1) resulting in illegal memory accesses. Although there is no consistent repro, we see that generic/019 sometimes crashes because of this bug. Ran gce-xfstests smoke and verified that there were no regressions. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20200421023959.20879-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4_extents.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 1c216fcc202a..44e59881a1f0 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -170,10 +170,13 @@ struct partial_cluster { (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_LAST_INDEX(__hdr__) \ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) -#define EXT_MAX_EXTENT(__hdr__) \ - (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + ((le16_to_cpu((__hdr__)->eh_max)) ? 
\ + ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ + : 0) #define EXT_MAX_INDEX(__hdr__) \ - (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + ((le16_to_cpu((__hdr__)->eh_max)) ? \ + ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) : 0) static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) { From 3bbd0ef26098d241dc59ee77ba14b7dab0df0786 Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Thu, 23 Apr 2020 13:09:27 +0800 Subject: [PATCH 293/427] ext4: fix buffer_head refcnt leak when ext4_iget() fails ext4_orphan_get() invokes ext4_read_inode_bitmap(), which returns a reference of the specified buffer_head object to "bitmap_bh" with increased refcnt. When ext4_orphan_get() returns, local variable "bitmap_bh" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in one exception handling path of ext4_orphan_get(). When ext4_iget() fails, the function forgets to decrease the refcnt increased by ext4_read_inode_bitmap(), causing a refcnt leak. Fix this issue by calling brelse() when ext4_iget() fails. 
Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Cc: stable@kernel.org Link: https://lore.kernel.org/r/1587618568-13418-1-git-send-email-xiyuyang19@fudan.edu.cn Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4b8c9a9bdf0c..011bcb8c4770 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1246,6 +1246,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) ext4_error_err(sb, -err, "couldn't read orphan inode %lu (err %d)", ino, err); + brelse(bitmap_bh); return inode; } From 4301efa4c7cca11556dd89899eee066d49b47bf7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Apr 2020 10:54:44 +0200 Subject: [PATCH 294/427] writeback: Export inode_io_list_del() Ext4 needs to remove inode from writeback lists after it is out of visibility of its journalling machinery (which can still dirty the inode). Export inode_io_list_del() for it. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200421085445.5731-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/fs-writeback.c | 1 + fs/internal.h | 2 -- include/linux/writeback.h | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76ac9c7d32ec..e58bd5f758d0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1126,6 +1126,7 @@ void inode_io_list_del(struct inode *inode) inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); } +EXPORT_SYMBOL(inode_io_list_del); /* * mark an inode as under writeback on the sb diff --git a/fs/internal.h b/fs/internal.h index aa5d45524e87..8819d0d58b03 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -143,8 +143,6 @@ extern int dentry_needs_remove_privs(struct dentry *dentry); /* * fs-writeback.c */ -extern void inode_io_list_del(struct inode *inode); - extern long get_nr_dirty_inodes(void); extern int invalidate_inodes(struct super_block *, bool); diff --git a/include/linux/writeback.h 
b/include/linux/writeback.h index a19d845dd7eb..902aa317621b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -197,6 +197,7 @@ void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); +void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) From ceff86fddae8748fe00d4f2d249cb02cae62ad84 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Apr 2020 10:54:45 +0200 Subject: [PATCH 295/427] ext4: Avoid freeing inodes on dirty list When we are evicting inode with journalled data, we may race with transaction commit in the following way: CPU0 CPU1 jbd2_journal_commit_transaction() evict(inode) inode_io_list_del() inode_wait_for_writeback() process BJ_Forget list __jbd2_journal_insert_checkpoint() __jbd2_journal_refile_buffer() __jbd2_journal_unfile_buffer() if (test_clear_buffer_jbddirty(bh)) mark_buffer_dirty(bh) __mark_inode_dirty(inode) ext4_evict_inode(inode) frees the inode This results in use-after-free issues in the writeback code (or the assertion added in the previous commit triggering). Fix the problem by removing inode from writeback lists once all the page cache is evicted and so inode cannot be added to writeback lists again. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200421085445.5731-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4a3381eb1bbe..a7087ff533bb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -220,6 +220,16 @@ void ext4_evict_inode(struct inode *inode) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); + /* + * For inodes with journalled data, transaction commit could have + * dirtied the inode. 
Flush worker is ignoring it because of I_FREEING + * flag but we still need to remove the inode from the writeback lists. + */ + if (!list_empty_careful(&inode->i_io_list)) { + WARN_ON_ONCE(!ext4_should_journal_data(inode)); + inode_io_list_del(inode); + } + /* * Protect us against freezing - iput() caller didn't have to have any * protection against it From 8418897f1bf87da0cb6936489d57a4320c32c0af Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 23 Apr 2020 15:46:44 +0800 Subject: [PATCH 296/427] ext4: fix error pointer dereference Don't pass error pointers to brelse(). commit 7159a986b420 ("ext4: fix some error pointer dereferences") has fixed some cases, fix the remaining one case. Once ext4_xattr_block_find()->ext4_sb_bread() failed, error pointer is stored in @bs->bh, which will be passed to brelse() in the cleanup routine of ext4_xattr_set_handle(). This will then cause a NULL panic crash in __brelse(). BUG: unable to handle kernel NULL pointer dereference at 000000000000005b RIP: 0010:__brelse+0x1b/0x50 Call Trace: ext4_xattr_set_handle+0x163/0x5d0 ext4_xattr_set+0x95/0x110 __vfs_setxattr+0x6b/0x80 __vfs_setxattr_noperm+0x68/0x1b0 vfs_setxattr+0xa0/0xb0 setxattr+0x12c/0x1a0 path_setxattr+0x8d/0xc0 __x64_sys_setxattr+0x27/0x30 do_syscall_64+0x60/0x250 entry_SYSCALL_64_after_hwframe+0x49/0xbe In this case, @bs->bh stores '-EIO' actually. 
Fixes: fb265c9cb49e ("ext4: add ext4_sb_bread() to disambiguate ENOMEM cases") Signed-off-by: Jeffle Xu Reviewed-by: Joseph Qi Cc: stable@kernel.org # 2.6.19 Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/1587628004-95123-1-git-send-email-jefflexu@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 21df43a25328..01ba66373e97 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1800,8 +1800,11 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); - if (IS_ERR(bs->bh)) - return PTR_ERR(bs->bh); + if (IS_ERR(bs->bh)) { + error = PTR_ERR(bs->bh); + bs->bh = NULL; + return error; + } ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); From 4209ae12b12265d475bba28634184423149bd14f Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Sun, 26 Apr 2020 18:34:37 -0700 Subject: [PATCH 297/427] ext4: handle ext4_mark_inode_dirty errors ext4_mark_inode_dirty() can fail for real reasons. Ignoring its return value may lead ext4 to ignore real failures that would result in corruption / crashes. Harden ext4_mark_inode_dirty error paths to fail as soon as possible and return errors to the caller whenever appropriate. One of the possible scenarios in which this bug could have an effect is that, while creating a new inode, its directory entry gets added successfully, but while writing the inode itself mark_inode_dirty returns an error which is ignored. This would result in an inconsistency where the directory entry points to a non-existent inode. Ran gce-xfstests smoke tests and verified that there were no regressions. 
Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20200427013438.219117-1-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/acl.c | 2 +- fs/ext4/ext4.h | 2 +- fs/ext4/ext4_jbd2.h | 5 ++- fs/ext4/extents.c | 34 +++++++++++--------- fs/ext4/file.c | 11 +++++-- fs/ext4/indirect.c | 4 ++- fs/ext4/inline.c | 6 ++-- fs/ext4/inode.c | 38 ++++++++++++++++------- fs/ext4/migrate.c | 12 ++++--- fs/ext4/namei.c | 76 +++++++++++++++++++++++++++++---------------- fs/ext4/super.c | 16 ++++++---- fs/ext4/xattr.c | 6 ++-- 12 files changed, 139 insertions(+), 73 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index b3eba92f38f5..76f634d185f1 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -255,7 +255,7 @@ retry: if (!error && update_mode) { inode->i_mode = mode; inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + error = ext4_mark_inode_dirty(handle, inode); } out_stop: ext4_journal_stop(handle); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c8d060627448..884ce3086486 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3352,7 +3352,7 @@ struct ext4_extent; */ #define EXT_MAX_BLOCKS 0xffffffff -extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 4b9002f0e84c..3bacf76d2609 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -222,7 +222,10 @@ ext4_mark_iloc_dirty(handle_t *handle, int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); -int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); +#define ext4_mark_inode_dirty(__h, __i) \ + __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__) +int 
__ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, + const char *func, unsigned int line); int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5809891eadab..49ea6973d65f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -816,7 +816,7 @@ ext4_ext_binsearch(struct inode *inode, } -int ext4_ext_tree_init(handle_t *handle, struct inode *inode) +void ext4_ext_tree_init(handle_t *handle, struct inode *inode) { struct ext4_extent_header *eh; @@ -826,7 +826,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); ext4_mark_inode_dirty(handle, inode); - return 0; } struct ext4_ext_path * @@ -1319,7 +1318,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_idx_pblock(EXT_FIRST_INDEX(neh))); le16_add_cpu(&neh->eh_depth, 1); - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); out: brelse(bh); @@ -4363,7 +4362,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, struct inode *inode = file_inode(file); handle_t *handle; int ret = 0; - int ret2 = 0; + int ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; struct ext4_map_blocks map; @@ -4423,10 +4422,11 @@ retry: if (ext4_update_inode_size(inode, epos) & 0x1) inode->i_mtime = inode->i_ctime; } - ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); - ret2 = ext4_journal_stop(handle); - if (ret2) + ret3 = ext4_journal_stop(handle); + ret2 = ret3 ? 
ret3 : ret2; + if (unlikely(ret2)) break; } if (ret == -ENOSPC && @@ -4575,7 +4575,9 @@ static long ext4_zero_range(struct file *file, loff_t offset, inode->i_mtime = inode->i_ctime = current_time(inode); if (new_size) ext4_update_inode_size(inode, new_size); - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); @@ -4585,6 +4587,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (file->f_flags & O_SYNC) ext4_handle_sync(handle); +out_handle: ext4_journal_stop(handle); out_mutex: inode_unlock(inode); @@ -4696,8 +4699,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len) { unsigned int max_blocks; - int ret = 0; - int ret2 = 0; + int ret = 0, ret2 = 0, ret3 = 0; struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; unsigned int credits = 0; @@ -4730,9 +4732,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); - ext4_mark_inode_dirty(handle, inode); - if (credits) - ret2 = ext4_journal_stop(handle); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (credits) { + ret3 = ext4_journal_stop(handle); + if (unlikely(ret3)) + ret2 = ret3; + } + if (ret <= 0 || ret2) break; } @@ -5269,7 +5275,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (IS_SYNC(inode)) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0d624250a62b..b8e69f9e3858 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -287,6 +287,7 @@ static ssize_t 
ext4_handle_inode_extension(struct inode *inode, loff_t offset, bool truncate = false; u8 blkbits = inode->i_blkbits; ext4_lblk_t written_blk, end_blk; + int ret; /* * Note that EXT4_I(inode)->i_disksize can get extended up to @@ -327,8 +328,14 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, goto truncate; } - if (ext4_update_inode_size(inode, offset + written)) - ext4_mark_inode_dirty(handle, inode); + if (ext4_update_inode_size(inode, offset + written)) { + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) { + written = ret; + ext4_journal_stop(handle); + goto truncate; + } + } /* * We may need to truncate allocated but not written blocks beyond EOF. diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 107f0043f67f..be2b66eb65f7 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -467,7 +467,9 @@ static int ext4_splice_branch(handle_t *handle, /* * OK, we spliced it into the inode itself on a direct block. */ - ext4_mark_inode_dirty(handle, ar->inode); + err = ext4_mark_inode_dirty(handle, ar->inode); + if (unlikely(err)) + goto err_out; jbd_debug(5, "splicing direct\n"); } return err; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f35e289e17aa..c3a1ad2db122 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1260,7 +1260,7 @@ out: int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { - int ret, inline_size, no_expand; + int ret, ret2, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; @@ -1314,7 +1314,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, out: ext4_write_unlock_xattr(dir, &no_expand); - ext4_mark_inode_dirty(handle, dir); + ret2 = ext4_mark_inode_dirty(handle, dir); + if (unlikely(ret2 && !ret)) + ret = ret2; brelse(iloc.bh); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a7087ff533bb..456e8a6b4809 100644 --- a/fs/ext4/inode.c +++ 
b/fs/ext4/inode.c @@ -1302,7 +1302,7 @@ static int ext4_write_end(struct file *file, * filesystems. */ if (i_size_changed || inline_data) - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied @@ -3083,7 +3083,7 @@ static int ext4_da_write_end(struct file *file, * new_i_size is less that inode->i_size * bu greater than i_disksize.(hint delalloc) */ - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); } } @@ -3100,7 +3100,7 @@ static int ext4_da_write_end(struct file *file, if (ret2 < 0) ret = ret2; ret2 = ext4_journal_stop(handle); - if (!ret) + if (unlikely(ret2 && !ret)) ret = ret2; return ret ? ret : copied; @@ -3892,6 +3892,8 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) { handle_t *handle; + int ret; + loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); @@ -3905,10 +3907,10 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, if (IS_ERR(handle)) return PTR_ERR(handle); ext4_update_i_disksize(inode, size); - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); - return 0; + return ret; } static void ext4_wait_dax_page(struct ext4_inode_info *ei) @@ -3960,7 +3962,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) loff_t first_block_offset, last_block_offset; handle_t *handle; unsigned int credits; - int ret = 0; + int ret = 0, ret2 = 0; trace_ext4_punch_hole(inode, offset, length, 0); @@ -4083,7 +4085,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret2)) + ret = ret2; if (ret >= 0) 
ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: @@ -4152,7 +4156,7 @@ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; - int err = 0; + int err = 0, err2; handle_t *handle; struct address_space *mapping = inode->i_mapping; @@ -4240,7 +4244,9 @@ out_stop: ext4_orphan_del(handle, inode); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2 && !err)) + err = err2; ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); @@ -5298,6 +5304,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) inode->i_gid = attr->ia_gid; error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); + if (unlikely(error)) + return error; } if (attr->ia_valid & ATTR_SIZE) { @@ -5783,7 +5791,8 @@ out_unlock: * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. */ -int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) +int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, + const char *func, unsigned int line) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -5793,13 +5802,18 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) - return err; + goto out; if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); - return ext4_mark_iloc_dirty(handle, inode, &iloc); + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +out: + if (unlikely(err)) + ext4_error_inode_err(inode, func, line, 0, err, + "mark_inode_dirty error"); + return err; } /* diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index fb6520f37135..c5e3fc998211 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -287,7 +287,7 @@ 
static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, struct inode *tmp_inode) { - int retval; + int retval, retval2 = 0; __le32 i_data[3]; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); @@ -342,7 +342,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * i_blocks when freeing the indirect meta-data blocks */ retval = free_ind_block(handle, inode, i_data); - ext4_mark_inode_dirty(handle, inode); + retval2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(retval2 && !retval)) + retval = retval2; err_out: return retval; @@ -601,7 +603,7 @@ int ext4_ind_migrate(struct inode *inode) ext4_lblk_t start, end; ext4_fsblk_t blk; handle_t *handle; - int ret; + int ret, ret2 = 0; if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) @@ -655,7 +657,9 @@ int ext4_ind_migrate(struct inode *inode) memset(ei->i_data, 0, sizeof(ei->i_data)); for (i = start; i <= end; i++) ei->i_data[i] = cpu_to_le32(blk++); - ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret2 && !ret)) + ret = ret2; errout: ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a8aca4772aaa..56738b538ddf 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1993,7 +1993,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, { unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; - int err; + int err, err2; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); @@ -2028,12 +2028,12 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); - ext4_mark_inode_dirty(handle, dir); + err2 = 
ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); - return 0; + return err ? err : err2; } /* @@ -2223,7 +2223,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; - ext4_mark_inode_dirty(handle, dir); + retval = ext4_mark_inode_dirty(handle, dir); + if (unlikely(retval)) + goto out; } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { @@ -2576,12 +2578,12 @@ static int ext4_add_nondir(handle_t *handle, struct inode *inode = *inodep; int err = ext4_add_entry(handle, dentry, inode); if (!err) { - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); d_instantiate_new(dentry, inode); *inodep = NULL; - return 0; + return err; } drop_nlink(inode); ext4_orphan_add(handle, inode); @@ -2775,7 +2777,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; - int err, credits, retries = 0; + int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; @@ -2808,7 +2810,9 @@ out_clear_inode: clear_nlink(inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); - ext4_mark_inode_dirty(handle, inode); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2)) + err = err2; ext4_journal_stop(handle); iput(inode); goto out_retry; @@ -3148,10 +3152,12 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) inode->i_size = 0; ext4_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + retval = ext4_mark_inode_dirty(handle, inode); + if (retval) + goto end_rmdir; ext4_dec_count(handle, dir); ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); + retval = 
ext4_mark_inode_dirty(handle, dir); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and @@ -3221,7 +3227,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) goto end_unlink; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); + retval = ext4_mark_inode_dirty(handle, dir); + if (retval) + goto end_unlink; if (inode->i_nlink == 0) ext4_warning_inode(inode, "Deleting file '%.*s' with no links", dentry->d_name.len, dentry->d_name.name); @@ -3230,7 +3238,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + retval = ext4_mark_inode_dirty(handle, inode); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and @@ -3419,7 +3427,7 @@ retry: err = ext4_add_entry(handle, dentry, inode); if (!err) { - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being * linked the first time */ @@ -3531,7 +3539,7 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, static int ext4_setent(handle_t *handle, struct ext4_renament *ent, unsigned ino, unsigned file_type) { - int retval; + int retval, retval2; BUFFER_TRACE(ent->bh, "get write access"); retval = ext4_journal_get_write_access(handle, ent->bh); @@ -3543,19 +3551,19 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, inode_inc_iversion(ent->dir); ent->dir->i_ctime = ent->dir->i_mtime = current_time(ent->dir); - ext4_mark_inode_dirty(handle, ent->dir); + retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { - retval = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh); - if (unlikely(retval)) { - ext4_std_error(ent->dir->i_sb, retval); - return retval; + retval2 
= ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh); + if (unlikely(retval2)) { + ext4_std_error(ent->dir->i_sb, retval2); + return retval2; } } brelse(ent->bh); ent->bh = NULL; - return 0; + return retval; } static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, @@ -3790,7 +3798,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_FT_CHRDEV); if (retval) goto end_rename; - ext4_mark_inode_dirty(handle, whiteout); + retval = ext4_mark_inode_dirty(handle, whiteout); + if (unlikely(retval)) + goto end_rename; } if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); @@ -3811,7 +3821,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * rename. */ old.inode->i_ctime = current_time(old.inode); - ext4_mark_inode_dirty(handle, old.inode); + retval = ext4_mark_inode_dirty(handle, old.inode); + if (unlikely(retval)) + goto end_rename; if (!whiteout) { /* @@ -3840,12 +3852,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, } else { ext4_inc_count(handle, new.dir); ext4_update_dx_flag(new.dir); - ext4_mark_inode_dirty(handle, new.dir); + retval = ext4_mark_inode_dirty(handle, new.dir); + if (unlikely(retval)) + goto end_rename; } } - ext4_mark_inode_dirty(handle, old.dir); + retval = ext4_mark_inode_dirty(handle, old.dir); + if (unlikely(retval)) + goto end_rename; if (new.inode) { - ext4_mark_inode_dirty(handle, new.inode); + retval = ext4_mark_inode_dirty(handle, new.inode); + if (unlikely(retval)) + goto end_rename; if (!new.inode->i_nlink) ext4_orphan_add(handle, new.inode); } @@ -3979,8 +3997,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, ctime = current_time(old.inode); old.inode->i_ctime = ctime; new.inode->i_ctime = ctime; - ext4_mark_inode_dirty(handle, old.inode); - ext4_mark_inode_dirty(handle, new.inode); + retval = ext4_mark_inode_dirty(handle, old.inode); + if (unlikely(retval)) + goto end_rename; + retval 
= ext4_mark_inode_dirty(handle, new.inode); + if (unlikely(retval)) + goto end_rename; if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 49821d8a0910..c3983f87587d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5880,7 +5880,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); @@ -5982,12 +5982,14 @@ static int ext4_quota_off(struct super_block *sb, int type) * this is not a hard failure and quotas are already disabled. */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + err = PTR_ERR(handle); goto out_unlock; + } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: inode_unlock(inode); @@ -6045,7 +6047,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err, offset = off & (sb->s_blocksize - 1); + int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -6093,9 +6095,11 @@ out: if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2 && !err)) + err = err2; } - return len; + return err ? 
err : len; } #endif diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 01ba66373e97..9b29a40738ac 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1327,7 +1327,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, int blocksize = ea_inode->i_sb->s_blocksize; int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; int csize, wsize = 0; - int ret = 0; + int ret = 0, ret2 = 0; int retries = 0; retry: @@ -1385,7 +1385,9 @@ retry: ext4_update_i_disksize(ea_inode, wsize); inode_unlock(ea_inode); - ext4_mark_inode_dirty(handle, ea_inode); + ret2 = ext4_mark_inode_dirty(handle, ea_inode); + if (unlikely(ret2 && !ret)) + ret = ret2; out: brelse(bh); From b60ca3343e78761c6ebe6ff52c628893c8297b73 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Sun, 26 Apr 2020 18:34:38 -0700 Subject: [PATCH 298/427] ext4: don't ignore return values from ext4_ext_dirty() Don't ignore return values from ext4_ext_dirty, since the errors indicate valid failures below Ext4. In all of the other instances of ext4_ext_dirty calls, the error return value is handled in some way. This patch makes the remaining couple of call sites handle ext4_ext_dirty errors as well. In the case of ext4_split_extent_at(), ignoring the return value is intentional. The reason is that we are already in an error path and there isn't much we can do if ext4_ext_dirty returns an error. This patch adds a comment for that case explaining why we ignore the return value. In the longer run, we probably should make sure that errors from other mark_dirty routines are handled as well. Ran gce-xfstests smoke tests and verified that there were no regressions.
Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200427013438.219117-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 49ea6973d65f..6befdecf977a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3243,6 +3243,10 @@ out: fix_extent_len: ex->ee_len = orig_ex.ee_len; + /* + * Ignore ext4_ext_dirty return value since we are already in error path + * and err is a non-zero error code. + */ ext4_ext_dirty(handle, inode, path + path->p_depth); return err; } @@ -3502,7 +3506,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } if (allocated) { /* Mark the block containing both extents as dirty */ - ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ path[depth].p_ext = abut_ex; From ee802f86899341d835c629a4ad9607135f1b71a2 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 30 Apr 2020 14:53:17 -0400 Subject: [PATCH 299/427] ext4: remove dead GET_BLOCKS_ZERO code There's no call to ext4_map_blocks() in the current ext4 code with a flags argument that combines EXT4_GET_BLOCKS_CONVERT and EXT4_GET_BLOCKS_ZERO. Remove the code that corresponds to this case from ext4_ext_handle_unwritten_extents(). 
Signed-off-by: Eric Whitney Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20200430185320.23001-2-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6befdecf977a..728f4a620fde 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3829,14 +3829,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { - if (flags & EXT4_GET_BLOCKS_ZERO) { - if (allocated > map->m_len) - allocated = map->m_len; - err = ext4_issue_zeroout(inode, map->m_lblk, newblock, - allocated); - if (err < 0) - goto out2; - } ret = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); if (ret >= 0) From bee6cf00c7f17df27c842c169db31d53bdd775ba Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 30 Apr 2020 14:53:18 -0400 Subject: [PATCH 300/427] ext4: remove redundant GET_BLOCKS_CONVERT code Remove the redundant code assigning values to ext4_map_blocks components in ext4_ext_handle_unwritten_extents() for the EXT4_GET_BLOCKS_CONVERT case, using the code at the function exit instead. Clean up and reorder that code to eliminate more redundancy and improve readability. 
Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20200430185320.23001-3-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 728f4a620fde..57f02e2d2f52 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3829,20 +3829,14 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { - ret = ext4_convert_unwritten_extents_endio(handle, inode, map, + err = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); - else - err = ret; - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = newblock; - if (allocated > map->m_len) - allocated = map->m_len; - map->m_len = allocated; - goto out2; + if (err < 0) + goto out2; + ext4_update_inode_fsync_trans(handle, inode, 1); + goto map_out; } - /* buffered IO case */ + /* buffered IO cases */ /* * repeat fallocate creation request * we already have an unwritten extent @@ -3876,18 +3870,14 @@ out: } else allocated = ret; map->m_flags |= EXT4_MAP_NEW; - if (allocated > map->m_len) - allocated = map->m_len; - map->m_len = allocated; - map_out: map->m_flags |= EXT4_MAP_MAPPED; out1: + map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; - ext4_ext_show_leaf(inode, path); - map->m_pblk = newblock; map->m_len = allocated; + ext4_ext_show_leaf(inode, path); out2: return err ? 
err : allocated; } From 779e26517b3600830fe58933d5f97627711c9435 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 30 Apr 2020 14:53:19 -0400 Subject: [PATCH 301/427] ext4: clean up GET_BLOCKS_PRE_IO error handling If the call to ext4_split_convert_extents() fails in the EXT4_GET_BLOCKS_PRE_IO case within ext4_ext_handle_unwritten_extents(), error out through the exit point at function end rather than jumping through an intermediate point. Fix the error handling in the event ext4_split_convert_extents() returns 0, which it shouldn't do when splitting an existing extent. The current code returns the passed in value of allocated (which is likely non-zero) while failing to set m_flags, m_pblk, and m_len. Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20200430185320.23001-4-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 57f02e2d2f52..c63bc13f9a72 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3818,12 +3818,25 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, trace_ext4_ext_handle_unwritten_extents(inode, map, flags, allocated, newblock); - /* get_block() before submit the IO, split the extent */ + /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { ret = ext4_split_convert_extents(handle, inode, map, ppath, flags | EXT4_GET_BLOCKS_CONVERT); - if (ret <= 0) - goto out; + if (ret < 0) { + err = ret; + goto out2; + } + /* + * shouldn't get a 0 return when splitting an extent unless + * m_len is 0 (bug) or extent has been corrupted + */ + if (unlikely(ret == 0)) { + EXT4_ERROR_INODE(inode, + "unexpected ret == 0, m_len = %u", + map->m_len); + err = -EFSCORRUPTED; + goto out2; + } map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } @@ -3863,12 +3876,13 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode 
*inode, ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); -out: + if (ret <= 0) { err = ret; goto out2; - } else - allocated = ret; + } +out: + allocated = ret; map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; From be809e1274ebc043640eeeb287accb7b4a4bcbff Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 30 Apr 2020 14:53:20 -0400 Subject: [PATCH 302/427] ext4: clean up ext4_ext_convert_to_initialized() error handling If ext4_ext_convert_to_initialized() fails when called within ext4_ext_handle_unwritten_extents(), immediately error out through the exit point at function end. Fix the error handling in the event ext4_ext_convert_to_initialized() returns 0, which it shouldn't do when converting an existing extent. The current code returns the passed in value of allocated (which is likely non-zero) while failing to set m_flags, m_pblk, and m_len. Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20200430185320.23001-5-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c63bc13f9a72..03b94c6e90cc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3872,15 +3872,28 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, goto out1; } - /* buffered write, writepage time, convert*/ + /* + * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1. + * For buffered writes, at writepage time, etc. Convert a + * discovered unwritten extent to written. 
+ */ ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); - - if (ret <= 0) { + if (ret < 0) { err = ret; goto out2; } + ext4_update_inode_fsync_trans(handle, inode, 1); + /* + * shouldn't get a 0 return when converting an unwritten extent + * unless m_len is 0 (bug) or extent has been corrupted + */ + if (unlikely(ret == 0)) { + EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", + map->m_len); + err = -EFSCORRUPTED; + goto out2; + } + out: allocated = ret; map->m_flags |= EXT4_MAP_NEW; From 80dd4978ddd78b2fe5aec2d52b69cbc2d06b08d8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 3 May 2020 22:06:47 +0200 Subject: [PATCH 303/427] ext4: fix a typo in a comment s/extnets/extents/ Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/20200503200647.154701-1-christophe.jaillet@wanadoo.fr Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 03b94c6e90cc..3ca797faa86b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4503,7 +4503,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, inode_lock(inode); /* - * Indirect files do not support unwritten extnets + * Indirect files do not support unwritten extents */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; From 08adf452e628b0e2ce9a01048cfbec52353703d7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 6 May 2020 11:31:40 -0700 Subject: [PATCH 304/427] ext4: fix race between ext4_sync_parent() and rename() 'igrab(d_inode(dentry->d_parent))' without holding dentry->d_lock is broken because without d_lock, d_parent can be concurrently changed due to a rename(). Then if the old directory is immediately deleted, old d_parent->inode can be NULL. That causes a NULL dereference in igrab(). 
To fix this, use dget_parent() to safely grab a reference to the parent dentry, which pins the inode. This also eliminates the need to use d_find_any_alias() other than for the initial inode, as we no longer throw away the dentry at each step. This is an extremely hard race to hit, but it is possible. Adding a udelay() in between the reads of ->d_parent and its ->d_inode makes it reproducible on a no-journal filesystem using the following program: #include #include int main() { if (fork()) { for (;;) { mkdir("dir1", 0700); int fd = open("dir1/file", O_RDWR|O_CREAT|O_SYNC); write(fd, "X", 1); close(fd); } } else { mkdir("dir2", 0700); for (;;) { rename("dir1/file", "dir2/file"); rmdir("dir1"); } } } Fixes: d59729f4e794 ("ext4: fix races in ext4_sync_parent()") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20200506183140.541194-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fsync.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e10206e7f4bb..093c359952cd 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -44,30 +44,28 @@ */ static int ext4_sync_parent(struct inode *inode) { - struct dentry *dentry = NULL; - struct inode *next; + struct dentry *dentry, *next; int ret = 0; if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) return 0; - inode = igrab(inode); + dentry = d_find_any_alias(inode); + if (!dentry) + return 0; while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); - dentry = d_find_any_alias(inode); - if (!dentry) - break; - next = igrab(d_inode(dentry->d_parent)); + + next = dget_parent(dentry); dput(dentry); - if (!next) - break; - iput(inode); - inode = next; + dentry = next; + inode = dentry->d_inode; + /* * The directory inode may have gone through rmdir by now. 
But * the inode itself and its blocks are still allocated (we hold - * a reference to the inode so it didn't go through - * ext4_evict_inode()) and so we are safe to flush metadata - * blocks and the inode. + * a reference to the inode via its dentry), so it didn't go + * through ext4_evict_inode()) and so we are safe to flush + * metadata blocks and the inode. */ ret = sync_mapping_buffers(inode->i_mapping); if (ret) @@ -76,7 +74,7 @@ static int ext4_sync_parent(struct inode *inode) if (ret) break; } - iput(inode); + dput(dentry); return ret; } From 73c384c0cdaa8ea9ca9ef2d0cff6a25930f1648e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 7 May 2020 10:50:28 -0700 Subject: [PATCH 305/427] ext4: avoid ext4_error()'s caused by ENOMEM in the truncate path We can't fail in the truncate path without requiring an fsck. Add a workaround for this by using a combination of retry loops and the __GFP_NOFAIL flag. From: Theodore Ts'o Signed-off-by: Theodore Ts'o Signed-off-by: Anna Pendleton Reviewed-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20200507175028.15061-1-pendleton@google.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 43 +++++++++++++++++++++++++++++++++---------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 884ce3086486..5d901bf92ce9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -630,6 +630,7 @@ enum { */ #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 +#define EXT4_EX_NOFAIL 0x10000000 /* * Flags used by ext4_free_blocks diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3ca797faa86b..ff7eeb5a77ef 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -297,11 +297,14 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode, { struct ext4_ext_path *path = *ppath; int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); + int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; + + if (nofail) + flags |=
EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, - EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | - (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); + flags); } static int @@ -487,8 +490,12 @@ __read_extent_tree_block(const char *function, unsigned int line, { struct buffer_head *bh; int err; + gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; - bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS); + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; + + bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -837,6 +844,10 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, struct ext4_ext_path *path = orig_path ? *orig_path : NULL; short int depth, i, ppos = 0; int ret; + gfp_t gfp_flags = GFP_NOFS; + + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; eh = ext_inode_hdr(inode); depth = ext_depth(inode); @@ -857,7 +868,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, if (!path) { /* account possible depth increase */ path = kcalloc(depth + 2, sizeof(struct ext4_ext_path), - GFP_NOFS); + gfp_flags); if (unlikely(!path)) return ERR_PTR(-ENOMEM); path[0].p_maxdepth = depth + 1; @@ -1007,9 +1018,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock, oldblock; __le32 border; ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ + gfp_t gfp_flags = GFP_NOFS; int err = 0; size_t ext_size = 0; + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; + /* make decision: where to split? */ /* FIXME: now decision is simplest: at current extent */ @@ -1043,7 +1058,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, * We need this to handle errors and free blocks * upon them. 
*/ - ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), GFP_NOFS); + ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags); if (!ablocks) return -ENOMEM; @@ -2019,7 +2034,7 @@ prepend: if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_find_extent(inode, next, NULL, 0); + npath = ext4_find_extent(inode, next, NULL, gb_flags); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); @@ -2792,7 +2807,8 @@ again: ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ - path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, end, NULL, + EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2878,7 +2894,7 @@ again: le16_to_cpu(path[k].p_hdr->eh_entries)+1; } else { path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), - GFP_NOFS); + GFP_NOFS | __GFP_NOFAIL); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; @@ -3303,7 +3319,7 @@ static int ext4_split_extent(handle_t *handle, * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. 
*/ - path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + path = ext4_find_extent(inode, map->m_lblk, ppath, flags); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -4365,7 +4381,14 @@ retry: } if (err) return err; - return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); +retry_remove_space: + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + if (err == -ENOMEM) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_remove_space; + } + return err; } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, From 212da3ec6fafe458cd54c3a365cbcb4f9bd794e1 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:41 +0530 Subject: [PATCH 306/427] ext4: mballoc: print bb_free info even when it is 0 Improve the debugging msg by also printing even if bb_free is 0. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/c894f1d1d30f86ae38f4e3a861949665b6dc61cd.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 30d5d97548c4..bcfaaad62167 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4170,8 +4170,6 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) } ext4_unlock_group(sb, i); - if (grp->bb_free == 0) - continue; printk(KERN_ERR "%u: %d/%d \n", i, grp->bb_free, grp->bb_fragments); } From e68cf40c0d098a63bc571bc5981dee6c2013c494 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:42 +0530 Subject: [PATCH 307/427] ext4: mballoc: refactor ext4_mb_show_ac() This factors out ext4_mb_show_pa() function to show all the group's preallocation info. This could be useful info to be added in later patches. There should be no functionality change in this patch. 
Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/8f07d890b0038dcc935e9c10e6043ec9f3792721.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 65 +++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bcfaaad62167..d1464d9110ef 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4120,38 +4120,16 @@ repeat: } #ifdef CONFIG_EXT4_DEBUG -static void ext4_mb_show_ac(struct ext4_allocation_context *ac) +static inline void ext4_mb_show_pa(struct super_block *sb) { - struct super_block *sb = ac->ac_sb; - ext4_group_t ngroups, i; + ext4_group_t i, ngroups; if (!ext4_mballoc_debug || (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; - ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" - " Allocation context details:"); - ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", - ac->ac_status, ac->ac_flags); - ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " - "goal %lu/%lu/%lu@%lu, " - "best %lu/%lu/%lu@%lu cr %d", - (unsigned long)ac->ac_o_ex.fe_group, - (unsigned long)ac->ac_o_ex.fe_start, - (unsigned long)ac->ac_o_ex.fe_len, - (unsigned long)ac->ac_o_ex.fe_logical, - (unsigned long)ac->ac_g_ex.fe_group, - (unsigned long)ac->ac_g_ex.fe_start, - (unsigned long)ac->ac_g_ex.fe_len, - (unsigned long)ac->ac_g_ex.fe_logical, - (unsigned long)ac->ac_b_ex.fe_group, - (unsigned long)ac->ac_b_ex.fe_start, - (unsigned long)ac->ac_b_ex.fe_len, - (unsigned long)ac->ac_b_ex.fe_logical, - (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); - ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); + ext4_msg(sb, KERN_ERR, "groups: "); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; @@ -4175,9 +4153,46 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) } printk(KERN_ERR "\n"); } + 
+static void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + + if (!ext4_mballoc_debug || + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + return; + + ext4_msg(sb, KERN_ERR, "Can't allocate:" + " Allocation context details:"); + ext4_msg(sb, KERN_ERR, "status %d flags %d", + ac->ac_status, ac->ac_flags); + ext4_msg(sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", + (unsigned long)ac->ac_o_ex.fe_group, + (unsigned long)ac->ac_o_ex.fe_start, + (unsigned long)ac->ac_o_ex.fe_len, + (unsigned long)ac->ac_o_ex.fe_logical, + (unsigned long)ac->ac_g_ex.fe_group, + (unsigned long)ac->ac_g_ex.fe_start, + (unsigned long)ac->ac_g_ex.fe_len, + (unsigned long)ac->ac_g_ex.fe_logical, + (unsigned long)ac->ac_b_ex.fe_group, + (unsigned long)ac->ac_b_ex.fe_start, + (unsigned long)ac->ac_b_ex.fe_len, + (unsigned long)ac->ac_b_ex.fe_logical, + (int)ac->ac_criteria); + ext4_msg(sb, KERN_ERR, "%d found", ac->ac_found); + ext4_mb_show_pa(sb); +} #else +static inline void ext4_mb_show_pa(struct super_block *sb) +{ + return; +} static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { + ext4_mb_show_pa(ac->ac_sb); return; } #endif From bbc4ec77e9f9c7ac71aee15c6adbd1674fe66c60 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:43 +0530 Subject: [PATCH 308/427] ext4: mballoc: add more mb_debug() msgs This patch adds some more debugging mb_debug() msgs to help improve mballoc code debugging. Other than adding more mb_debug() msgs at few more places, there should be no other functionality change in this patch. 
Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/5fc8e7788b924e211fcfa4a4c1d2f8503511661a.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d1464d9110ef..2e4697e7b945 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2108,7 +2108,7 @@ static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t ngroups, group, i; - int cr; + int cr = -1; int err = 0, first_err = 0; struct ext4_sb_info *sbi; struct super_block *sb; @@ -2260,6 +2260,10 @@ repeat: out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; + + mb_debug(1, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", + ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, + ac->ac_flags, cr, err); return err; } @@ -3918,7 +3922,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, mb_debug(1, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) - return 0; + goto out_dbg; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { @@ -3926,7 +3930,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); - return 0; + goto out_dbg; } err = ext4_mb_load_buddy(sb, group, &e4b); @@ -3934,7 +3938,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_warning(sb, "Error %d loading buddy information for %u", err, group); put_bh(bitmap_bh); - return 0; + goto out_dbg; } if (needed == 0) @@ -3979,6 +3983,8 @@ repeat: /* found anything to free? 
*/ if (list_empty(&list)) { BUG_ON(free != 0); + mb_debug(1, "Someone else may have freed PA for this group %u\n", + group); goto out; } @@ -4003,6 +4009,9 @@ out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); +out_dbg: + mb_debug(1, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", + free, group, grp->bb_free); return free; } @@ -4538,6 +4547,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ar->len = ar->len >> 1; } if (!ar->len) { + ext4_mb_show_pa(sb); *errp = -ENOSPC; return 0; } From 36bad4233cc50e29022c1095666935cee1c978ad Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:44 +0530 Subject: [PATCH 309/427] ext4: mballoc: correct the mb_debug() format specifier for pa_len var pa->pa_len is an integer. Fix all of the format specifier used in mb_debug() for pa_len to %d instead of %u. As such no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/af4987f643c586f62bcc9961e43f0a67151d5551.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2e4697e7b945..49de715d04f9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3720,7 +3720,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; - mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); @@ -3780,7 +3780,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; - mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); @@ -3862,10 +3862,10 @@ 
ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, } if (free != pa->pa_free) { ext4_msg(e4b->bd_sb, KERN_CRIT, - "pa %p: logic %lu, phys. %lu, len %lu", + "pa %p: logic %lu, phys. %lu, len %d", pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, - (unsigned long) pa->pa_len); + pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* @@ -4152,7 +4152,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); - printk(KERN_ERR "PA:%u:%d:%u \n", i, + printk(KERN_ERR "PA:%u:%d:%d \n", i, start, pa->pa_len); } ext4_unlock_group(sb, i); From 004379d0b02ab8e8efbee1f2f878d5f578bed72c Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:45 +0530 Subject: [PATCH 310/427] ext4: mballoc: fix few other format specifier in mb_debug() Fix few other format specifiers in mb_debug() msgs. As such no other functionality change in this patch. 
Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/574fa7f833abf2dbf3b53a2fea3195e71f6cdbd8.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 49de715d04f9..4ada63cf425f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3280,8 +3280,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } - mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, - (unsigned) orig_size, (unsigned) start); + mb_debug(1, "goal: %lld(was %lld) blocks at %u\n", size, orig_size, + start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) @@ -3370,7 +3370,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(pa->pa_free < len); pa->pa_free -= len; - mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); + mb_debug(1, "use %llu/%d from inode pa %p\n", start, len, pa); } /* @@ -3577,7 +3577,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_set_bits(bitmap, start, len); preallocated += len; } - mb_debug(1, "preallocated %u for group %u\n", preallocated, group); + mb_debug(1, "preallocated %d for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -4173,7 +4173,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) ext4_msg(sb, KERN_ERR, "Can't allocate:" " Allocation context details:"); - ext4_msg(sb, KERN_ERR, "status %d flags %d", + ext4_msg(sb, KERN_ERR, "status %u flags 0x%x", ac->ac_status, ac->ac_flags); ext4_msg(sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " @@ -4191,7 +4191,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(sb, KERN_ERR, "%d found", ac->ac_found); + 
ext4_msg(sb, KERN_ERR, "%u found", ac->ac_found); ext4_mb_show_pa(sb); } #else From f283529abac45d8c2b4d4b69d356cca9e6a2de43 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:46 +0530 Subject: [PATCH 311/427] ext4: mballoc: simplify error handling in ext4_init_mballoc() This patch simplifies error handling logic in ext4_init_mballoc(), by adding all the cleanups at one place at the end of that function. There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/8621a7bc68f7107a9ac4292afeb784515333bd25.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4ada63cf425f..aaf43c6c08e1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2913,23 +2913,26 @@ int __init ext4_init_mballoc(void) ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) - return -ENOMEM; + goto out; ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, SLAB_RECLAIM_ACCOUNT); - if (ext4_ac_cachep == NULL) { - kmem_cache_destroy(ext4_pspace_cachep); - return -ENOMEM; - } + if (ext4_ac_cachep == NULL) + goto out_pa_free; ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT); - if (ext4_free_data_cachep == NULL) { - kmem_cache_destroy(ext4_pspace_cachep); - kmem_cache_destroy(ext4_ac_cachep); - return -ENOMEM; - } + if (ext4_free_data_cachep == NULL) + goto out_ac_free; + return 0; + +out_ac_free: + kmem_cache_destroy(ext4_ac_cachep); +out_pa_free: + kmem_cache_destroy(ext4_pspace_cachep); +out: + return -ENOMEM; } void ext4_exit_mballoc(void) From 4fca8f07790a62c2b3da028ae423cf4d71c1bacd Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:47 +0530 Subject: [PATCH 312/427] ext4: mballoc: make ext4_mb_use_preallocated() return type as bool Change return 
type of function ext4_mb_use_preallocated() to bool to better reflect what this function can return. There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/7880cb6ef911465beafefcd7e9c3ea214688744b.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index aaf43c6c08e1..262a53f1d283 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3432,7 +3432,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, /* * search goal blocks in preallocated space */ -static noinline_for_stack int +static noinline_for_stack bool ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); @@ -3444,7 +3444,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) - return 0; + return false; /* first, try per-file preallocation */ rcu_read_lock(); @@ -3471,7 +3471,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) spin_unlock(&pa->pa_lock); ac->ac_criteria = 10; rcu_read_unlock(); - return 1; + return true; } spin_unlock(&pa->pa_lock); } @@ -3479,12 +3479,12 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* can we use group allocation? 
*/ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) - return 0; + return false; /* inode may have no locality group for some reason */ lg = ac->ac_lg; if (lg == NULL) - return 0; + return false; order = fls(ac->ac_o_ex.fe_len) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ @@ -3513,9 +3513,9 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) if (cpa) { ext4_mb_use_group_pa(ac, cpa); ac->ac_criteria = 20; - return 1; + return true; } - return 0; + return false; } /* From a345021553f7e6343b05b1ad1c25ed931140b47c Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:48 +0530 Subject: [PATCH 313/427] ext4: mballoc: refactor code inside DOUBLE_CHECK into separate function This patch implements mb_group_bb_bitmap_alloc() and mb_group_bb_bitmap_free() functions to remove #ifdef DOUBLE_CHECK macro and its related code from inside ext4_mb_add_groupinfo()/ext4_mb_release(). There should be no functionality change in this patch. 
Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/8c2095d74b779f0254a19b24982490dc6f07c4f9.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 50 +++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 262a53f1d283..3555e72f149c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -511,6 +511,26 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) } } +static void mb_group_bb_bitmap_alloc(struct super_block *sb, + struct ext4_group_info *grp, ext4_group_t group) +{ + struct buffer_head *bh; + + grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); + BUG_ON(grp->bb_bitmap == NULL); + + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(IS_ERR_OR_NULL(bh)); + + memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); + put_bh(bh); +} + +static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) +{ + kfree(grp->bb_bitmap); +} + #else static inline void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) @@ -526,6 +546,17 @@ static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { return; } + +static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, + struct ext4_group_info *grp, ext4_group_t group) +{ + return; +} + +static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) +{ + return; +} #endif #ifdef AGGRESSIVE_CHECK @@ -2456,20 +2487,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i]->bb_free_root = RB_ROOT; meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ -#ifdef DOUBLE_CHECK - { - struct buffer_head *bh; - meta_group_info[i]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_NOFS); - BUG_ON(meta_group_info[i]->bb_bitmap == NULL); - bh = ext4_read_block_bitmap(sb, group); - BUG_ON(IS_ERR_OR_NULL(bh)); - memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, - sb->s_blocksize); - 
put_bh(bh); - } -#endif - + mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; exit_group_info: @@ -2736,9 +2754,7 @@ int ext4_mb_release(struct super_block *sb) for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); -#ifdef DOUBLE_CHECK - kfree(grinfo->bb_bitmap); -#endif + mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); ext4_mb_cleanup_pa(grinfo); ext4_unlock_group(sb, i); From eb2b8ebb867fa895d5c4768310998bc940f6506c Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:49 +0530 Subject: [PATCH 314/427] ext4: mballoc: fix possible NULL ptr & remove BUG_ONs from DOUBLE_CHECK Make sure to check for e4b->bd_info->bb_bitmap == NULL, in mb_cmp_bitmaps() and return if NULL, to avoid possible NULL ptr dereference. Similar to how we do this in other ifdef DOUBLE_CHECK functions. Also remove the BUG_ON() logic if kmalloc() or ext4_read_block_bitmap() fails. We should simply mark grp->bb_bitmap as NULL if above happens. In fact ext4_read_block_bitmap() may even return an error in case of resize ioctl. Hence remove this BUG_ON logic (fstests ext4/032 may trigger this). 
Link: https://lore.kernel.org/r/9a54f8a696ff17c057cd571be3d15ac3ec1407f1.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3555e72f149c..c713d06e70b7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -493,6 +493,8 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsigned char *b1, *b2; int i; @@ -517,10 +519,15 @@ static void mb_group_bb_bitmap_alloc(struct super_block *sb, struct buffer_head *bh; grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); - BUG_ON(grp->bb_bitmap == NULL); + if (!grp->bb_bitmap) + return; bh = ext4_read_block_bitmap(sb, group); - BUG_ON(IS_ERR_OR_NULL(bh)); + if (IS_ERR_OR_NULL(bh)) { + kfree(grp->bb_bitmap); + grp->bb_bitmap = NULL; + return; + } memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); From 9bee5779ee26d6debc84e0f1e4e54daa93f13ebc Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:50 +0530 Subject: [PATCH 315/427] ext4: balloc: use task_pid_nr() helper Use task_pid_nr() function instead of current->pid. There should be no functionality change in this patch. 
Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/4b58403e15e9c8deb34a1b93deb3fc9cd153ab84.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a32e5f7b5385..1ba46d87cdf1 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -903,10 +903,11 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) return bg_start; if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * + colour = (task_pid_nr(current) % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); + colour = (task_pid_nr(current) % 16) * + ((last_block - bg_start) / 16); return bg_start + colour; } From 6db074618969dc6fac4978e8043945fd440b310a Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:51 +0530 Subject: [PATCH 316/427] ext4: use BIT() macro for BH_** state bits Simply use BIT() macro for all BH_** state bits instead of open coding it. There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/57667689f51a3f9dba2fcef7d3425187fa3ba69f.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 8 ++++---- fs/ext4/inode.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5d901bf92ce9..89cac4e32018 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -171,10 +171,10 @@ struct ext4_allocation_request { * well as to store the information returned by ext4_map_blocks(). It * takes less room on the stack than a struct buffer_head. 
*/ -#define EXT4_MAP_NEW (1 << BH_New) -#define EXT4_MAP_MAPPED (1 << BH_Mapped) -#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) -#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_NEW BIT(BH_New) +#define EXT4_MAP_MAPPED BIT(BH_Mapped) +#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) +#define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 456e8a6b4809..043ee7efce5f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2084,7 +2084,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) return err; } -#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) +#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... @@ -2364,7 +2364,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (map->m_flags & (1 << BH_Delay)) + if (map->m_flags & BIT(BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); From ec8c60be96d6de74be601fbca56342efb9a1e039 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:52 +0530 Subject: [PATCH 317/427] ext4: improve ext_debug() msg in case of block allocation failure ext4_map_blocks() has ext_debug msg early at the start of function. We also get ext_debug msg if we could allocate a block from ext4_ext_map_blocks(). But there is no ext_debug() msg in case of block allocation failure. So add one along with error code. Also add more info in ext_debug() msg like how many blocks were allocated v/s how many were requested in ext4_ext_map_blocks(). 
Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/1610ec2aa932396be00f9d552fe29da473ead176.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 4 ++-- fs/ext4/inode.c | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ff7eeb5a77ef..7b4b0c0110ac 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4246,10 +4246,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; - ext_debug("allocate new block: goal %llu, found %llu/%u\n", - ar.goal, newblock, allocated); allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; + ext_debug("allocate new block: goal %llu, found %llu/%u, requested %u\n", + ar.goal, newblock, ar.len, allocated); if (ar.len > allocated) ar.len = allocated; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 043ee7efce5f..0a52f98512d7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -732,6 +732,10 @@ out_sem: return ret; } } + + if (retval < 0) + ext_debug("failed for inode %lu with err %d\n", + inode->i_ino, retval); return retval; } From 8ec2d31b27f683e3deeaf5d562b534a695052de3 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:53 +0530 Subject: [PATCH 318/427] ext4: replace EXT_DEBUG with __maybe_unused in ext4_ext_handle_unwritten_extents() Replace EXT_DEBUG with __maybe_unused from inside ext4_ext_handle_unwritten_extents() function. There should be no functionality change in this patch. 
Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/ae335b94506cd9db9d2648c1f4dd25a80f9f3ce2.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7b4b0c0110ac..2f711cc3cdce 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3813,9 +3813,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { -#ifdef EXT_DEBUG - struct ext4_ext_path *path = *ppath; -#endif + struct ext4_ext_path __maybe_unused *path = *ppath; int ret = 0; int err = 0; From d3df14535f4a5b5af58ef12b4263202df3155356 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:54 +0530 Subject: [PATCH 319/427] ext4: mballoc: make mb_debug() implementation to use pr_debug() mb_debug() msg had only 1 control level for all type of msgs. And if we enable mballoc_debug then all of those msgs would be enabled. Instead of adding multiple debug levels for mb_debug() msgs, use pr_debug() with which we could have finer control to print msgs at all of different levels (i.e. at file, func, line no.). Also add process name/pid, superblk id, and other info in mb_debug() msg. This also kills the mballoc_debug module parameter, since it is not needed any more. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/f0c660cbde9e2edbe95c67942ca9ad80dd2231eb.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/Kconfig | 3 +- fs/ext4/mballoc.c | 104 +++++++++++++++++++++------------------------- fs/ext4/mballoc.h | 16 +++---- 3 files changed, 55 insertions(+), 68 deletions(-) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 2a592e38cdfe..02376ddb0cb5 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -99,8 +99,7 @@ config EXT4_DEBUG Enables run-time debugging support for the ext4 filesystem. 
If you select Y here, then you will be able to turn on debugging - with a command such as: - echo 1 > /sys/module/ext4/parameters/mballoc_debug + using dynamic debug control for mb_debug() msgs. config EXT4_KUNIT_TESTS tristate "KUnit tests for ext4" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c713d06e70b7..33a69424942c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -18,13 +18,6 @@ #include #include -#ifdef CONFIG_EXT4_DEBUG -ushort ext4_mballoc_debug __read_mostly; - -module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); -MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); -#endif - /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -858,14 +851,14 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) char *bitmap; struct ext4_group_info *grinfo; - mb_debug(1, "init page %lu\n", page->index); - inode = page->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; + mb_debug(sb, "init page %lu\n", page->index); + groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) groups_per_page = 1; @@ -905,7 +898,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) bh[i] = NULL; goto out; } - mb_debug(1, "read bitmap for group %u\n", group); + mb_debug(sb, "read bitmap for group %u\n", group); } /* wait for I/O completion */ @@ -950,7 +943,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug(1, "put buddy for group %u in page %lu/%x\n", + mb_debug(sb, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo = ext4_get_group_info(sb, group); @@ -970,7 +963,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else { /* this is block of 
bitmap */ BUG_ON(incore != NULL); - mb_debug(1, "put bitmap for group %u in page %lu/%x\n", + mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); @@ -1076,7 +1069,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) int ret = 0; might_sleep(); - mb_debug(1, "init group %u\n", group); + mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); /* * This ensures that we don't reinit the buddy cache @@ -1148,7 +1141,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct inode *inode = sbi->s_buddy_cache; might_sleep(); - mb_debug(1, "load group %u\n", group); + mb_debug(sb, "load group %u\n", group); blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); @@ -2299,7 +2292,7 @@ out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; - mb_debug(1, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", + mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, ac->ac_flags, cr, err); return err; @@ -2731,7 +2724,7 @@ out: } /* need to called with the ext4 group lock held */ -static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) +static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; struct list_head *cur, *tmp; @@ -2743,9 +2736,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) count++; kmem_cache_free(ext4_pspace_cachep, pa); } - if (count) - mb_debug(1, "mballoc: %u PAs left\n", count); - + return count; } int ext4_mb_release(struct super_block *sb) @@ -2756,6 +2747,7 @@ int ext4_mb_release(struct super_block *sb) struct ext4_group_info *grinfo, ***group_info; struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + int count; if 
(sbi->s_group_info) { for (i = 0; i < ngroups; i++) { @@ -2763,7 +2755,10 @@ int ext4_mb_release(struct super_block *sb) grinfo = ext4_get_group_info(sb, i); mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); - ext4_mb_cleanup_pa(grinfo); + count = ext4_mb_cleanup_pa(grinfo); + if (count) + mb_debug(sb, "mballoc: %d PAs left\n", + count); ext4_unlock_group(sb, i); kmem_cache_free(cachep, grinfo); } @@ -2836,7 +2831,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, struct ext4_group_info *db; int err, count = 0, count2 = 0; - mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); @@ -2876,7 +2871,8 @@ static void ext4_free_data_in_buddy(struct super_block *sb, kmem_cache_free(ext4_free_data_cachep, entry); ext4_mb_unload_buddy(&e4b); - mb_debug(1, "freed %u blocks in %u structures\n", count, count2); + mb_debug(sb, "freed %d blocks in %d structures\n", count, + count2); } /* @@ -3107,8 +3103,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) BUG_ON(lg == NULL); ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; - mb_debug(1, "#%u: goal %u blocks for locality group\n", - current->pid, ac->ac_g_ex.fe_len); + mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); } /* @@ -3306,8 +3301,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } - mb_debug(1, "goal: %lld(was %lld) blocks at %u\n", size, orig_size, - start); + mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, + orig_size, start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) @@ -3396,7 +3391,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(pa->pa_free < len); pa->pa_free -= len; - mb_debug(1, "use %llu/%d from inode pa %p\n", start, len, pa); + 
mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); } /* @@ -3420,7 +3415,8 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ - mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); + mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", + pa->pa_lstart-len, len, pa); } /* @@ -3603,7 +3599,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_set_bits(bitmap, start, len); preallocated += len; } - mb_debug(1, "preallocated %d for group %u\n", preallocated, group); + mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -3746,8 +3742,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; - mb_debug(1, "new inode pa %p: %llu/%d for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); + mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, + pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); @@ -3806,8 +3802,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; - mb_debug(1, "new group pa %p: %llu/%d for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); + mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, + pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); @@ -3874,7 +3870,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - mb_debug(1, " free preallocated %u/%u in group %u\n", + mb_debug(sb, "free preallocated %u/%u in group %u\n", (unsigned) ext4_group_first_block_no(sb, group) + bit, (unsigned) next - bit, (unsigned) group); free += next - bit; @@ 
-3945,8 +3941,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, int busy = 0; int free = 0; - mb_debug(1, "discard preallocation for group %u\n", group); - + mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) goto out_dbg; @@ -4009,7 +4004,7 @@ repeat: /* found anything to free? */ if (list_empty(&list)) { BUG_ON(free != 0); - mb_debug(1, "Someone else may have freed PA for this group %u\n", + mb_debug(sb, "Someone else may have freed PA for this group %u\n", group); goto out; } @@ -4036,7 +4031,7 @@ out: ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); out_dbg: - mb_debug(1, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", + mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", free, group, grp->bb_free); return free; } @@ -4066,7 +4061,8 @@ void ext4_discard_preallocations(struct inode *inode) return; } - mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); + mb_debug(sb, "discard preallocation for inode %lu\n", + inode->i_ino); trace_ext4_discard_preallocations(inode); INIT_LIST_HEAD(&list); @@ -4159,12 +4155,11 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (!ext4_mballoc_debug || - (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) return; ngroups = ext4_get_groups_count(sb); - ext4_msg(sb, KERN_ERR, "groups: "); + mb_debug(sb, "groups: "); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; @@ -4178,30 +4173,27 @@ static inline void ext4_mb_show_pa(struct super_block *sb) ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); - printk(KERN_ERR "PA:%u:%d:%d \n", i, - start, pa->pa_len); + mb_debug(sb, "PA:%u:%d:%d\n", i, start, + pa->pa_len); } ext4_unlock_group(sb, i); - - printk(KERN_ERR "%u: %d/%d \n", - i, grp->bb_free, 
grp->bb_fragments); + mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, + grp->bb_fragments); } - printk(KERN_ERR "\n"); } static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (!ext4_mballoc_debug || - (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) return; - ext4_msg(sb, KERN_ERR, "Can't allocate:" + mb_debug(sb, "Can't allocate:" " Allocation context details:"); - ext4_msg(sb, KERN_ERR, "status %u flags 0x%x", + mb_debug(sb, "status %u flags 0x%x", ac->ac_status, ac->ac_flags); - ext4_msg(sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " + mb_debug(sb, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, @@ -4217,7 +4209,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(sb, KERN_ERR, "%u found", ac->ac_found); + mb_debug(sb, "%u found", ac->ac_found); ext4_mb_show_pa(sb); } #else @@ -4330,7 +4322,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, * locality group. 
this is a policy, actually */ ext4_mb_group_or_file(ac); - mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, @@ -4351,7 +4343,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, struct list_head discard_list; struct ext4_prealloc_space *pa, *tmp; - mb_debug(1, "discard locality group preallocation\n"); + mb_debug(sb, "discard locality group preallocation\n"); INIT_LIST_HEAD(&discard_list); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 88c98f17e3d9..6b4d17c2935d 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -24,19 +24,15 @@ #include "ext4.h" /* + * mb_debug() dynamic printk msgs could be used to debug mballoc code. */ #ifdef CONFIG_EXT4_DEBUG -extern ushort ext4_mballoc_debug; - -#define mb_debug(n, fmt, ...) \ -do { \ - if ((n) <= ext4_mballoc_debug) { \ - printk(KERN_DEBUG "(%s, %d): %s: " fmt, \ - __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ - } \ -} while (0) +#define mb_debug(sb, fmt, ...) \ + pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt, \ + current->comm, task_pid_nr(current), sb->s_id, \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) #else -#define mb_debug(n, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#define mb_debug(sb, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ From 70aa1554b01474ab08d08e5a18b0215a7ff1e8dc Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:55 +0530 Subject: [PATCH 320/427] ext4: make ext_debug() implementation to use pr_debug() ext_debug() msgs could be helpful, provided those could be enabled without recompiling kernel and also if we could selectively enable only required prints for case by case debugging. So make ext_debug() implementation use pr_debug(). 
Also change ext_debug() to be defined with CONFIG_EXT4_DEBUG. So the EXT_DEBUG macro now mostly remains for the below 3 functions: ext4_ext_show_path/leaf/move() (whose print msgs use ext_debug() which again could be dynamically enabled using pr_debug()). This also changes the ext_debug() to take inode as a parameter to add inode no. in all of its msgs. Prints additional info like process name / pid, superblock id etc. This also removes any explicit function names passed in ext_debug(), since ext_debug() on its own prints file, func and line no. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/d31dc189b0aeda9384fe7665e36da7cd8c61571f.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/Kconfig | 2 +- fs/ext4/ext4.h | 20 +++++-- fs/ext4/extents.c | 144 ++++++++++++++++++++++------------------------ fs/ext4/inode.c | 11 ++-- 4 files changed, 88 insertions(+), 89 deletions(-) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 02376ddb0cb5..cf9e430514c4 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -99,7 +99,7 @@ config EXT4_DEBUG Enables run-time debugging support for the ext4 filesystem. If you select Y here, then you will be able to turn on debugging - using dynamic debug control for mb_debug() msgs. + using dynamic debug control for mb_debug() / ext_debug() msgs. config EXT4_KUNIT_TESTS tristate "KUnit tests for ext4" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 89cac4e32018..80866f124b9a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -80,14 +80,22 @@ #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -/* - * Turn on EXT_DEBUG to get lots of info about extents operations. - */ + /* + * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c + */ #define EXT_DEBUG__ -#ifdef EXT_DEBUG -#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) + +/* + * Dynamic printk for controlled extents debugging. + */ +#ifdef CONFIG_EXT4_DEBUG +#define ext_debug(ino, fmt, ...) 
\ + pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \ + current->comm, task_pid_nr(current), \ + ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ + __func__, ##__VA_ARGS__) #else -#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* data type for block offset of block group */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2f711cc3cdce..969f4c030cf0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -607,22 +607,22 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { int k, l = path->p_depth; - ext_debug("path:"); + ext_debug(inode, "path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { - ext_debug(" %d->%llu", + ext_debug(inode, " %d->%llu", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { - ext_debug(" %d:[%d]%d:%llu ", + ext_debug(inode, " %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else - ext_debug(" []"); + ext_debug(inode, " []"); } - ext_debug("\n"); + ext_debug(inode, "\n"); } static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) @@ -638,14 +638,14 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); - ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); + ext_debug(inode, "Displaying leaf extents\n"); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { - ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } - ext_debug("\n"); + ext_debug(inode, "\n"); } static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, @@ -658,10 +658,9 @@ static void 
ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent_idx *idx; idx = path[level].p_idx; while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { - ext_debug("%d: move %d:%llu in new index %llu\n", level, - le32_to_cpu(idx->ei_block), - ext4_idx_pblock(idx), - newblock); + ext_debug(inode, "%d: move %d:%llu in new index %llu\n", + level, le32_to_cpu(idx->ei_block), + ext4_idx_pblock(idx), newblock); idx++; } @@ -670,7 +669,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_is_unwritten(ex), @@ -714,7 +713,7 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_extent_idx *r, *l, *m; - ext_debug("binsearch for %u(idx): ", block); + ext_debug(inode, "binsearch for %u(idx): ", block); l = EXT_FIRST_INDEX(eh) + 1; r = EXT_LAST_INDEX(eh); @@ -724,13 +723,13 @@ ext4_ext_binsearch_idx(struct inode *inode, r = m - 1; else l = m + 1; - ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), - m, le32_to_cpu(m->ei_block), - r, le32_to_cpu(r->ei_block)); + ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, + le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block), + r, le32_to_cpu(r->ei_block)); } path->p_idx = l - 1; - ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH @@ -781,7 +780,7 @@ ext4_ext_binsearch(struct inode *inode, return; } - ext_debug("binsearch for %u: ", block); + ext_debug(inode, "binsearch for %u: ", block); l = EXT_FIRST_EXTENT(eh) + 1; r = EXT_LAST_EXTENT(eh); @@ -792,13 +791,13 @@ ext4_ext_binsearch(struct inode *inode, r = m - 1; else l = m + 1; - ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), - 
m, le32_to_cpu(m->ee_block), - r, le32_to_cpu(r->ee_block)); + ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, + le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block), + r, le32_to_cpu(r->ee_block)); } path->p_ext = l - 1; - ext_debug(" -> %d:%llu:[%d]%d ", + ext_debug(inode, " -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), ext4_ext_is_unwritten(path->p_ext), @@ -881,7 +880,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { - ext_debug("depth %d: num %d, max %d\n", + ext_debug(inode, "depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_ext_binsearch_idx(inode, path + ppos, block); @@ -958,18 +957,20 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ - ext_debug("insert new index %d after: %llu\n", logical, ptr); + ext_debug(inode, "insert new index %d after: %llu\n", + logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ - ext_debug("insert new index %d before: %llu\n", logical, ptr); + ext_debug(inode, "insert new index %d before: %llu\n", + logical, ptr); ix = curp->p_idx; } len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; BUG_ON(len < 0); if (len > 0) { - ext_debug("insert new index %d: " + ext_debug(inode, "insert new index %d: " "move %d indices from 0x%p to 0x%p\n", logical, len, ix, ix + 1); memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); @@ -1036,12 +1037,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; - ext_debug("leaf will be split." + ext_debug(inode, "leaf will be split." " next leaf starts at %d\n", le32_to_cpu(border)); } else { border = newext->ee_block; - ext_debug("leaf will be added." + ext_debug(inode, "leaf will be added." 
" next leaf starts at %d\n", le32_to_cpu(border)); } @@ -1063,7 +1064,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, return -ENOMEM; /* allocate all needed blocks */ - ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); + ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err, flags); @@ -1149,7 +1150,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, goto cleanup; } if (k) - ext_debug("create %d intermediate indices\n", k); + ext_debug(inode, "create %d intermediate indices\n", k); /* insert new index into current index block */ /* current depth stored in i var */ i = depth - 1; @@ -1176,7 +1177,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, fidx->ei_block = border; ext4_idx_store_pblock(fidx, oldblock); - ext_debug("int.index at %d (block %llu): %u -> %llu\n", + ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); /* move remainder of path[i] to the new index block */ @@ -1190,7 +1191,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } /* start copy indexes */ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; - ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); ext4_ext_show_move(inode, path, newblock, i); if (m) { @@ -1327,7 +1328,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, EXT_FIRST_INDEX(neh)->ei_block = EXT_FIRST_EXTENT(neh)->ee_block; } - ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", + ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); @@ -1969,7 +1970,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct 
inode *inode, /* Try to append newex to the ex */ if (ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %u:[%d]%d" + ext_debug(inode, "append [%d]%d block to %u:[%d]%d" "(from %llu)\n", ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), @@ -1994,7 +1995,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, prepend: /* Try to prepend newex to the ex */ if (ext4_can_extents_be_merged(inode, newext, ex)) { - ext_debug("prepend %u[%d]%d block to %u:[%d]%d" + ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), ext4_ext_is_unwritten(newext), @@ -2032,7 +2033,7 @@ prepend: if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { - ext_debug("next leaf block - %u\n", next); + ext_debug(inode, "next leaf block - %u\n", next); BUG_ON(npath != NULL); npath = ext4_find_extent(inode, next, NULL, gb_flags); if (IS_ERR(npath)) @@ -2040,12 +2041,12 @@ prepend: BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { - ext_debug("next leaf isn't full(%d)\n", + ext_debug(inode, "next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; goto has_space; } - ext_debug("next leaf has no free space(%d,%d)\n", + ext_debug(inode, "next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); } @@ -2071,7 +2072,7 @@ has_space: if (!nearex) { /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", + ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), @@ -2081,7 +2082,7 @@ has_space: if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { /* Insert after */ - ext_debug("insert %u:%llu:[%d]%d before: " + ext_debug(inode, 
"insert %u:%llu:[%d]%d before: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), @@ -2092,7 +2093,7 @@ has_space: } else { /* Insert before */ BUG_ON(newext->ee_block == nearex->ee_block); - ext_debug("insert %u:%llu:[%d]%d after: " + ext_debug(inode, "insert %u:%llu:[%d]%d after: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), @@ -2102,7 +2103,7 @@ has_space: } len = EXT_LAST_EXTENT(eh) - nearex + 1; if (len > 0) { - ext_debug("insert %u:%llu:[%d]%d: " + ext_debug(inode, "insert %u:%llu:[%d]%d: " "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), @@ -2246,7 +2247,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, return; hole_len = min(es.es_lblk - hole_start, hole_len); } - ext_debug(" -> %u:%u\n", hole_start, hole_len); + ext_debug(inode, " -> %u:%u\n", hole_start, hole_len); ext4_es_insert_extent(inode, hole_start, hole_len, ~0, EXTENT_STATUS_HOLE); } @@ -2283,7 +2284,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, err = ext4_ext_dirty(handle, inode, path); if (err) return err; - ext_debug("index is empty, remove it, free block %llu\n", leaf); + ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); trace_ext4_ext_rm_idx(inode, leaf); ext4_free_blocks(handle, inode, NULL, leaf, 1, @@ -2562,7 +2563,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ - ext_debug("truncate since %u in leaf to %u\n", start, end); + ext_debug(inode, "truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; @@ -2588,7 +2589,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, else unwritten = 0; - ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, + ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block, unwritten, ex_ee_len); 
path[depth].p_ext = ex; @@ -2596,7 +2597,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, b = ex_ee_block+ex_ee_len - 1 < end ? ex_ee_block+ex_ee_len - 1 : end; - ext_debug(" border %u:%u\n", a, b); + ext_debug(inode, " border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ if (end < ex_ee_block) { @@ -2705,7 +2706,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, + ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); @@ -2782,7 +2783,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, partial.lblk = 0; partial.state = initial; - ext_debug("truncate since %u to %u\n", start, end); + ext_debug(inode, "truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, @@ -2924,7 +2925,7 @@ again: /* this is index block */ if (!path[i].p_hdr) { - ext_debug("initialize header\n"); + ext_debug(inode, "initialize header\n"); path[i].p_hdr = ext_block_hdr(path[i].p_bh); } @@ -2932,7 +2933,7 @@ again: /* this level hasn't been touched yet */ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; - ext_debug("init index ptr: hdr 0x%p, num %d\n", + ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", path[i].p_hdr, le16_to_cpu(path[i].p_hdr->eh_entries)); } else { @@ -2940,13 +2941,13 @@ again: path[i].p_idx--; } - ext_debug("level %d - index, first 0x%p, cur 0x%p\n", + ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", i, EXT_FIRST_INDEX(path[i].p_hdr), path[i].p_idx); if (ext4_ext_more_to_rm(path + i)) { struct buffer_head *bh; /* go to the next level */ - ext_debug("move to level %d (block %llu)\n", + ext_debug(inode, "move to level %d (block %llu)\n", i + 1, 
ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, @@ -2982,7 +2983,7 @@ again: brelse(path[i].p_bh); path[i].p_bh = NULL; i--; - ext_debug("return to level %d\n", i); + ext_debug(inode, "return to level %d\n", i); } } @@ -3150,8 +3151,7 @@ static int ext4_split_extent_at(handle_t *handle, BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); - ext_debug("ext4_split_extents_at: inode %lu, logical" - "block %llu\n", inode->i_ino, (unsigned long long)split); + ext_debug(inode, "logical block %llu\n", (unsigned long long)split); ext4_ext_show_leaf(inode, path); @@ -3388,9 +3388,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; - ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map_len); + ext_debug(inode, "logical block %llu, max_blocks %u\n", + (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) @@ -3642,8 +3641,7 @@ static int ext4_split_convert_extents(handle_t *handle, unsigned int ee_len; int split_flag = 0, depth; - ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", - __func__, inode->i_ino, + ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) @@ -3689,8 +3687,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, + ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); /* If extent is larger than requested it is a 
clear sign that we still @@ -3760,8 +3757,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - ext_debug("%s: inode %lu, logical" - "block %llu, max_blocks %u\n", __func__, inode->i_ino, + ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { @@ -3817,10 +3813,9 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, int ret = 0; int err = 0; - ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " - "block %llu, max_blocks %u, flags %x, allocated %u\n", - inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, - flags, allocated); + ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", + (unsigned long long)map->m_lblk, map->m_len, flags, + allocated); ext4_ext_show_leaf(inode, path); /* @@ -4057,8 +4052,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_allocation_request ar; ext4_lblk_t cluster_offset; - ext_debug("blocks %u/%u requested for inode %lu\n", - map->m_lblk, map->m_len, inode->i_ino); + ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ @@ -4105,8 +4099,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); - ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, - ee_block, ee_len, newblock); + ext_debug(inode, "%u fit into %u:%d -> %llu\n", + map->m_lblk, ee_block, ee_len, newblock); /* * If the extent is initialized check whether the @@ -4246,7 +4240,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; - 
ext_debug("allocate new block: goal %llu, found %llu/%u, requested %u\n", + ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", ar.goal, newblock, ar.len, allocated); if (ar.len > allocated) ar.len = allocated; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0a52f98512d7..e7bf9388538b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -501,9 +501,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, #endif map->m_flags = 0; - ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," - "logical block %lu\n", inode->i_ino, flags, map->m_len, - (unsigned long) map->m_lblk); + ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", + flags, map->m_len, (unsigned long) map->m_lblk); /* * ext4_map_blocks returns an int, and m_len is an unsigned int @@ -734,8 +733,7 @@ out_sem: } if (retval < 0) - ext_debug("failed for inode %lu with err %d\n", - inode->i_ino, retval); + ext_debug(inode, "failed with err %d\n", retval); return retval; } @@ -1691,8 +1689,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, invalid_block = ~0; map->m_flags = 0; - ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," - "logical block %lu\n", inode->i_ino, map->m_len, + ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ From 8ad8d710035edf8d14f8ecc5fa15f8e1a27ecb59 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Sun, 10 May 2020 11:58:05 -0400 Subject: [PATCH 321/427] ext4: rework map struct instantiation in ext4_ext_map_blocks() The path performing block allocations in ext4_ext_map_blocks() contains code trimming the length of a new extent that is repeated later in the function. This code is both redundant and unnecessary as the exact length of the new extent has already been calculated. 
Rewrite the instantiation of the map struct in this case to use the available values, avoiding the overhead of unnecessary conversions and improving clarity. Add another map struct instantiation tailored specifically to the separate case for an existing written extent. Remove an old comment that no longer appears applicable to the current code. Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20200510155805.18808-1-enwlinux@gmail.com Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani --- fs/ext4/extents.c | 50 +++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 969f4c030cf0..ce394706c61a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4045,7 +4045,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_ext_path *path = NULL; struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_fsblk_t newblock = 0; + ext4_fsblk_t newblock = 0, pblk; int err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; @@ -4060,7 +4060,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; - goto out2; + goto out; } depth = ext_depth(inode); @@ -4076,7 +4076,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, (unsigned long) map->m_lblk, depth, path[depth].p_block); err = -EFSCORRUPTED; - goto out2; + goto out; } ex = path[depth].p_ext; @@ -4110,8 +4110,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { err = convert_initialized_extent(handle, inode, map, &path, &allocated); - goto out2; + goto out; } else if (!ext4_ext_is_unwritten(ex)) { + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + ext4_ext_show_leaf(inode, path); 
goto out; } @@ -4122,7 +4128,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, err = ret; else allocated = ret; - goto out2; + goto out; } } @@ -4147,7 +4153,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, hole_len); - goto out2; + goto out; } /* @@ -4171,12 +4177,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.lleft = map->m_lblk; err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); if (err) - goto out2; + goto out; ar.lright = map->m_lblk; ex2 = NULL; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); if (err) - goto out2; + goto out; /* Check if the extent after searching to the right implies a * cluster we can use. */ @@ -4237,7 +4243,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) - goto out2; + goto out; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", @@ -4247,7 +4253,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, got_allocated_blocks: /* try to insert new extent into found leaf and return */ - ext4_ext_store_pblock(&newex, newblock + offset); + pblk = newblock + offset; + ext4_ext_store_pblock(&newex, pblk); newex.ee_len = cpu_to_le16(ar.len); /* Mark unwritten */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { @@ -4272,16 +4279,9 @@ got_allocated_blocks: EXT4_C2B(sbi, allocated_clusters), fb_flags); } - goto out2; + goto out; } - /* previous routine could use block we allocated */ - newblock = ext4_ext_pblock(&newex); - allocated = ext4_ext_get_actual_len(&newex); - if (allocated > map->m_len) - allocated = map->m_len; - map->m_flags |= EXT4_MAP_NEW; - /* * Reduce the reserved cluster count to reflect successful deferred * allocation of delayed allocated clusters or direct 
allocation of @@ -4327,14 +4327,14 @@ got_allocated_blocks: ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); -out: - if (allocated > map->m_len) - allocated = map->m_len; + + map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED); + map->m_pblk = pblk; + map->m_len = ar.len; + allocated = map->m_len; ext4_ext_show_leaf(inode, path); - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = newblock; - map->m_len = allocated; -out2: + +out: ext4_ext_drop_refs(path); kfree(path); From de8ff14cab998f51a3a289d2b58d6d440782f77e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 May 2020 14:52:52 -0700 Subject: [PATCH 322/427] ext4: add casefold flag to EXT4_INODE_* flags No one currently needs EXT4_INODE_CASEFOLD, but add it to keep the EXT4_INODE_* definitions in sync with the EXT4_*_FL definitions. Also make it clearer that the casefold flag is only for directories. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20200510215252.87833-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 80866f124b9a..af60be906528 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -425,7 +425,7 @@ struct flex_groups { /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ -#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ @@ -498,6 +498,7 @@ enum { /* 22 was formerly EXT4_INODE_EOFBLOCKS */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. 
*/ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ + EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; @@ -543,6 +544,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(PROJINHERIT); + CHECK_FLAG_VALUE(CASEFOLD); CHECK_FLAG_VALUE(RESERVED); } From 53f86b170dfa8d50b8b3fb1c5cf17c33b2327db2 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 20 May 2020 12:10:32 +0530 Subject: [PATCH 323/427] ext4: mballoc: add blocks to PA list under same spinlock after allocating blocks ext4_mb_discard_preallocations() only checks for grp->bb_prealloc_list of every group to discard the group's PA to free up the space if allocation request fails. Consider below race:- Process A Process B 1. allocate blocks 1. Fails block allocation from ext4_mb_regular_allocator() ext4_lock_group() allocated blocks more than ac_o_ex.fe_len ext4_unlock_group() 2. Scans the grp->bb_prealloc_list (under ext4_lock_group()) and find nothing and thus return -ENOSPC. 2. Add the additional blocks to PA list ext4_lock_group() add blocks to grp->bb_prealloc_list ext4_unlock_group() Above race could be avoided if we add those additional blocks to grp->bb_prealloc_list at the same time with block allocation when ext4_lock_group() was still held. 
With this discard-PA will know if there are actually any blocks which could be freed from the PA Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/a2217dd782585b42328981832e6d396abaaccb80.1589955723.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 97 ++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 35 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 33a69424942c..decc5168d126 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -349,6 +349,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); +static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -1701,6 +1702,14 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, sbi->s_mb_last_start = ac->ac_f_ex.fe_start; spin_unlock(&sbi->s_md_lock); } + /* + * As we've just preallocated more space than + * user requested originally, we store allocated + * space in a special descriptor. 
+ */ + if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + ext4_mb_new_preallocation(ac); + } /* @@ -1949,7 +1958,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); - BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); + BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); if (EXT4_SB(sb)->s_mb_stats) atomic_inc(&EXT4_SB(sb)->s_bal_2orders); @@ -3675,7 +3684,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, /* * creates new preallocated space for given inode */ -static noinline_for_stack int +static noinline_for_stack void ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; @@ -3688,10 +3697,9 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + BUG_ON(ac->ac_pa == NULL); - pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); - if (pa == NULL) - return -ENOMEM; + pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { int winl; @@ -3735,7 +3743,6 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; - atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); @@ -3755,21 +3762,17 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_obj_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; - ext4_lock_group(sb, ac->ac_b_ex.fe_group); list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); spin_lock(pa->pa_obj_lock); list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); spin_unlock(pa->pa_obj_lock); - - return 0; } /* * creates new preallocated space for locality group inodes belongs to */ -static noinline_for_stack int +static noinline_for_stack void 
ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; @@ -3781,11 +3784,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + BUG_ON(ac->ac_pa == NULL); - BUG_ON(ext4_pspace_cachep == NULL); - pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); - if (pa == NULL) - return -ENOMEM; + pa = ac->ac_pa; /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ @@ -3795,7 +3796,6 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; - atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); @@ -3816,26 +3816,20 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_obj_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; - ext4_lock_group(sb, ac->ac_b_ex.fe_group); list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); /* * We will later add the new pa to the right bucket * after updating the pa_free in ext4_mb_release_context */ - return 0; } -static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) +static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) { - int err; - if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) - err = ext4_mb_new_group_pa(ac); + ext4_mb_new_group_pa(ac); else - err = ext4_mb_new_inode_pa(ac); - return err; + ext4_mb_new_inode_pa(ac); } /* @@ -4150,6 +4144,29 @@ repeat: } } +static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa; + + BUG_ON(ext4_pspace_cachep == NULL); + pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); + if (!pa) + return -ENOMEM; + atomic_set(&pa->pa_count, 1); + ac->ac_pa = pa; + return 0; +} + +static void 
ext4_mb_pa_free(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + + BUG_ON(!pa); + ac->ac_pa = NULL; + WARN_ON(!atomic_dec_and_test(&pa->pa_count)); + kmem_cache_free(ext4_pspace_cachep, pa); +} + #ifdef CONFIG_EXT4_DEBUG static inline void ext4_mb_show_pa(struct super_block *sb) { @@ -4606,23 +4623,28 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); + + *errp = ext4_mb_pa_alloc(ac); + if (*errp) + goto errout; repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); - if (*errp) - goto discard_and_exit; - - /* as we've just preallocated more space than - * user requested originally, we store allocated - * space in a special descriptor */ - if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) - *errp = ext4_mb_new_preallocation(ac); + /* + * pa allocated above is added to grp->bb_prealloc_list only + * when we were able to allocate some block i.e. when + * ac->ac_status == AC_STATUS_FOUND. + * And error from above mean ac->ac_status != AC_STATUS_FOUND + * So we have to free this pa here itself. + */ if (*errp) { - discard_and_exit: + ext4_mb_pa_free(ac); ext4_discard_allocated_blocks(ac); goto errout; } + if (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) + ext4_mb_pa_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); @@ -4637,6 +4659,11 @@ repeat: freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) goto repeat; + /* + * If block allocation fails then the pa allocated above + * needs to be freed here itself. 
+ */ + ext4_mb_pa_free(ac); *errp = -ENOSPC; } From cf5e2ca6c99077d128e971149f0c262e808ca831 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 20 May 2020 12:10:33 +0530 Subject: [PATCH 324/427] ext4: mballoc: refactor ext4_mb_discard_preallocations() Implement ext4_mb_discard_preallocations_should_retry() which we will need in later patches to add more logic like check for sequence number match to see if we should retry for block allocation or not. There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/1cfae0098d2aa9afbeb59331401258182868c8f2.1589955723.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index decc5168d126..b75408d72773 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4543,6 +4543,17 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) return freed; } +static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, + struct ext4_allocation_context *ac) +{ + int freed; + + freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); + if (freed) + return true; + return false; +} + /* * Main entry point into mballoc to allocate blocks * it tries to use preallocation first, then falls back @@ -4551,7 +4562,6 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { - int freed; struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; @@ -4656,8 +4666,7 @@ repeat: ar->len = ac->ac_b_ex.fe_len; } } else { - freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); - if (freed) + if (ext4_mb_discard_preallocations_should_retry(sb, ac)) goto repeat; /* * If block allocation fails then the pa allocated above From 
07b5b8e1ac4004b7db1065a301df65cd434c31c9 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 20 May 2020 12:10:34 +0530 Subject: [PATCH 325/427] ext4: mballoc: introduce pcpu seqcnt for freeing PA to improve ENOSPC handling There could be a race in function ext4_mb_discard_group_preallocations() where the 1st thread may iterate through group's bb_prealloc_list and remove all the PAs and add to function's local list head. Now if the 2nd thread comes in to discard the group preallocations, it will see that the group->bb_prealloc_list is empty and will return 0. Consider for a case where we have less number of groups (for e.g. just group 0), this may even return an -ENOSPC error from ext4_mb_new_blocks() (where we call for ext4_mb_discard_group_preallocations()). But that is wrong, since 2nd thread should have waited for 1st thread to release all the PAs and should have retried for allocation. Since 1st thread was anyway going to discard the PAs. The algorithm using this percpu seq counter goes below: 1. We sample the percpu discard_pa_seq counter before trying for block allocation in ext4_mb_new_blocks(). 2. We increment this percpu discard_pa_seq counter when we either allocate or free these blocks i.e. while marking those blocks as used/free in mb_mark_used()/mb_free_blocks(). 3. We also increment this percpu seq counter when we successfully identify that the bb_prealloc_list is not empty and hence proceed for discarding of those PAs inside ext4_mb_discard_group_preallocations(). Now to make sure that the regular fast path of block allocation is not affected, as a small optimization we only sample the percpu seq counter on that cpu. Only when the block allocation fails and when freed blocks found were 0, that is when we sample percpu seq counter for all cpus using below function ext4_get_discard_pa_seq_sum(). This happens after making sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. 
It can well be argued: why not just check grp->bb_free to see if there are any free blocks to be allocated. So here are the two concerns which were discussed:- 1. If for some reason the blocks available in the group are not appropriate for allocation logic (say for e.g. EXT4_MB_HINT_GOAL_ONLY, although this is not yet implemented), then the retry logic may result in infinite looping since grp->bb_free is non-zero. 2. Also before preallocation was clubbed with block allocation with the same ext4_lock_group() held, there were a lot of races where grp->bb_free could not be reliably relied upon. Due to the above, this patch considers discard_pa_seq logic to determine if we should retry for block allocation. Say if there are n threads trying for block allocation and none of those could allocate or discard any of the blocks, then all of those n threads will fail the block allocation and return -ENOSPC error. (Since the seq counter for all of those will match as no block allocation/discard was done during that duration). Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/7f254686903b87c419d798742fd9a1be34f0657b.1589955723.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 56 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b75408d72773..754ff9f65199 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -351,6 +351,35 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); +/* + * The algorithm using this percpu seq counter goes below: + * 1. We sample the percpu discard_pa_seq counter before trying for block + * allocation in ext4_mb_new_blocks(). + * 2. We increment this percpu discard_pa_seq counter when we either allocate + * or free these blocks i.e.
while marking those blocks as used/free in + * mb_mark_used()/mb_free_blocks(). + * 3. We also increment this percpu seq counter when we successfully identify + * that the bb_prealloc_list is not empty and hence proceed for discarding + * of those PAs inside ext4_mb_discard_group_preallocations(). + * + * Now to make sure that the regular fast path of block allocation is not + * affected, as a small optimization we only sample the percpu seq counter + * on that cpu. Only when the block allocation fails and when freed blocks + * found were 0, that is when we sample percpu seq counter for all cpus using + * below function ext4_get_discard_pa_seq_sum(). This happens after making + * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. + */ +static DEFINE_PER_CPU(u64, discard_pa_seq); +static inline u64 ext4_get_discard_pa_seq_sum(void) +{ + int __cpu; + u64 __seq = 0; + + for_each_possible_cpu(__cpu) + __seq += per_cpu(discard_pa_seq, __cpu); + return __seq; +} + static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 @@ -1462,6 +1491,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); + this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free += count; if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; @@ -1603,6 +1633,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); + this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free -= len; if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; @@ -3962,6 +3993,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, INIT_LIST_HEAD(&list); repeat: ext4_lock_group(sb, group); + this_cpu_inc(discard_pa_seq); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); @@ -4544,14 +4576,26 @@ static 
int ext4_mb_discard_preallocations(struct super_block *sb, int needed) } static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, - struct ext4_allocation_context *ac) + struct ext4_allocation_context *ac, u64 *seq) { int freed; + u64 seq_retry = 0; + bool ret = false; freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); - if (freed) - return true; - return false; + if (freed) { + ret = true; + goto out_dbg; + } + seq_retry = ext4_get_discard_pa_seq_sum(); + if (seq_retry != *seq) { + *seq = seq_retry; + ret = true; + } + +out_dbg: + mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); + return ret; } /* @@ -4568,6 +4612,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; + u64 seq; might_sleep(); sb = ar->inode->i_sb; @@ -4630,6 +4675,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; + seq = *this_cpu_ptr(&discard_pa_seq); if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); @@ -4666,7 +4712,7 @@ repeat: ar->len = ac->ac_b_ex.fe_len; } } else { - if (ext4_mb_discard_preallocations_should_retry(sb, ac)) + if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) goto repeat; /* * If block allocation fails then the pa allocated above From 8ef123fe02ca0923b01b57bdf639800a23a2faa8 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 20 May 2020 12:10:35 +0530 Subject: [PATCH 326/427] ext4: mballoc: refactor ext4_mb_good_group() ext4_mb_good_group() definition was changed some time back and now it even initializes the buddy cache (via ext4_mb_init_group()), if in case the EXT4_MB_GRP_NEED_INIT() is true for a group. Note that ext4_mb_init_group() could sleep and so should not be called under a spinlock held. 
This is fine as of now because ext4_mb_good_group() is called before loading the buddy bitmap without ext4_lock_group() held and again called after loading the bitmap, only this time with ext4_lock_group() held. But still this whole thing is confusing. So this patch refactors out ext4_mb_good_group_nolock() which should be called without holding ext4_lock_group(). Also in further patches we hold the spinlock (ext4_lock_group()) while doing any calculations which involve grp->bb_free or grp->bb_fragments. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/d9f7d031a5fbe1c943fae6bf1ff5cdf0604ae722.1589955723.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 78 ++++++++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 754ff9f65199..c9297c878a90 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2106,15 +2106,14 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } /* - * This is now called BEFORE we load the buddy bitmap. + * This is also called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable - * for the allocation or not. In addition it can also return negative - * error code when something goes wrong. + * for the allocation or not.
*/ -static int ext4_mb_good_group(struct ext4_allocation_context *ac, +static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { - unsigned free, fragments; + ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); @@ -2122,23 +2121,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) - return 0; + return false; if (cr <= 2 && free < ac->ac_g_ex.fe_len) - return 0; + return false; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) - return 0; - - /* We only do this if the grp has never been initialized */ - if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); - if (ret) - return ret; - } + return false; fragments = grp->bb_fragments; if (fragments == 0) - return 0; + return false; switch (cr) { case 0: @@ -2148,31 +2140,63 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) - return 0; + return false; if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || (free / fragments) >= ac->ac_g_ex.fe_len) - return 1; + return true; if (grp->bb_largest_free_order < ac->ac_2order) - return 0; + return false; - return 1; + return true; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) - return 1; + return true; break; case 2: if (free >= ac->ac_g_ex.fe_len) - return 1; + return true; break; case 3: - return 1; + return true; default: BUG(); } - return 0; + return false; +} + +/* + * This could return negative error code if something goes wrong + * during ext4_mb_init_group(). This should not be called with + * ext4_lock_group() held. 
+ */ +static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, + ext4_group_t group, int cr) +{ + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + ext4_grpblk_t free; + int ret = 0; + + free = grp->bb_free; + if (free == 0) + goto out; + if (cr <= 2 && free < ac->ac_g_ex.fe_len) + goto out; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + goto out; + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); + if (ret) + return ret; + } + + ret = ext4_mb_good_group(ac, group, cr); +out: + return ret; } static noinline_for_stack int @@ -2260,7 +2284,7 @@ repeat: group = 0; /* This now checks without needing the buddy page */ - ret = ext4_mb_good_group(ac, group, cr); + ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { if (!first_err) first_err = ret; @@ -2278,11 +2302,9 @@ repeat: * block group */ ret = ext4_mb_good_group(ac, group, cr); - if (ret <= 0) { + if (ret == 0) { ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); - if (!first_err) - first_err = ret; continue; } From 993778306e7901a7286322f25c7c681dd47bede6 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 20 May 2020 12:10:36 +0530 Subject: [PATCH 327/427] ext4: mballoc: use lock for checking free blocks while retrying Currently while doing block allocation grp->bb_free may be getting modified if discard is happening in parallel. For e.g. consider a case where there are lot of threads who have preallocated lot of blocks and there is a thread which is trying to discard all of this group's PA. Now it could happen that we see all of those group's bb_free is zero and fail the allocation while there is sufficient space if we free up all the PA. 
So this patch adds another flag "EXT4_MB_STRICT_CHECK" which will be set if we are unable to allocate any blocks in the first try (since we may not have considered blocks about to be discarded from PA lists). So during retry attempt to allocate blocks we will use ext4_lock_group() for checking if the group is good or not. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/9cb740a117c958c36596f167b12af1beae9a68b7.1589955723.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/mballoc.c | 13 ++++++++++++- include/trace/events/ext4.h | 3 ++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index af60be906528..dbc36e377eb0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -150,6 +150,8 @@ enum SHIFT_DIRECTION { #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 /* Use blocks from reserved pool */ #define EXT4_MB_USE_RESERVED 0x2000 +/* Do strict check for free blocks while retrying block allocation */ +#define EXT4_MB_STRICT_CHECK 0x4000 struct ext4_allocation_request { /* target inode for block we're allocating */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c9297c878a90..a9083113a8c0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2176,9 +2176,13 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + struct super_block *sb = ac->ac_sb; + bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; ext4_grpblk_t free; int ret = 0; + if (should_lock) + ext4_lock_group(sb, group); free = grp->bb_free; if (free == 0) goto out; @@ -2186,6 +2190,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; + if (should_lock) + ext4_unlock_group(sb, group); /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { @@ -2194,8 
+2200,12 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, return ret; } + if (should_lock) + ext4_lock_group(sb, group); ret = ext4_mb_good_group(ac, group, cr); out: + if (should_lock) + ext4_unlock_group(sb, group); return ret; } @@ -4610,7 +4620,8 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, goto out_dbg; } seq_retry = ext4_get_discard_pa_seq_sum(); - if (seq_retry != *seq) { + if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { + ac->ac_flags |= EXT4_MB_STRICT_CHECK; *seq = seq_retry; ret = true; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 280475c1cecc..cc41d692ae8e 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -35,7 +35,8 @@ struct partial_cluster; { EXT4_MB_DELALLOC_RESERVED, "DELALLOC_RESV" }, \ { EXT4_MB_STREAM_ALLOC, "STREAM_ALLOC" }, \ { EXT4_MB_USE_ROOT_BLOCKS, "USE_ROOT_BLKS" }, \ - { EXT4_MB_USE_RESERVED, "USE_RESV" }) + { EXT4_MB_USE_RESERVED, "USE_RESV" }, \ + { EXT4_MB_STRICT_CHECK, "STRICT_CHECK" }) #define show_map_flags(flags) __print_flags(flags, "|", \ { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ From dfcd4489e270282d984cd06c00f3a45d52a3f0a7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 20 May 2020 15:31:18 +0200 Subject: [PATCH 328/427] ext4: drop ext4_journal_free_reserved() Remove ext4_journal_free_reserved() function. It is never used. 
Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20200520133119.1383-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 3bacf76d2609..00dc668e052b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -338,12 +338,6 @@ static inline handle_t *__ext4_journal_start(struct inode *inode, handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type); -static inline void ext4_journal_free_reserved(handle_t *handle) -{ - if (ext4_handle_valid(handle)) - jbd2_journal_free_reserved(handle); -} - static inline handle_t *ext4_journal_current_handle(void) { return journal_current_handle(); From 14ff6286309e2853aed50083c9a83328423fdd8c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 20 May 2020 15:31:19 +0200 Subject: [PATCH 329/427] jbd2: avoid leaking transaction credits when unreserving handle When reserved transaction handle is unused, we subtract its reserved credits in __jbd2_journal_unreserve_handle() called from jbd2_journal_stop(). However this function forgets to remove reserved credits from transaction->t_outstanding_credits and thus the transaction space that was reserved remains effectively leaked. The leaked transaction space can be quite significant in some cases and leads to unnecessarily small transactions and thus reducing throughput of the journalling machinery. E.g. fsmark workload creating lots of 4k files was observed to have about 20% lower throughput due to this when ext4 is mounted with dioread_nolock mount option. Subtract reserved credits from t_outstanding_credits as well. 
CC: stable@vger.kernel.org Fixes: 8f7d89f36829 ("jbd2: transaction reservation support") Reviewed-by: Andreas Dilger Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200520133119.1383-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3dccc23cf010..e91aad3637a2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -541,17 +541,24 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) } EXPORT_SYMBOL(jbd2_journal_start); -static void __jbd2_journal_unreserve_handle(handle_t *handle) +static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t) { journal_t *journal = handle->h_journal; WARN_ON(!handle->h_reserved); sub_reserved_credits(journal, handle->h_total_credits); + if (t) + atomic_sub(handle->h_total_credits, &t->t_outstanding_credits); } void jbd2_journal_free_reserved(handle_t *handle) { - __jbd2_journal_unreserve_handle(handle); + journal_t *journal = handle->h_journal; + + /* Get j_state_lock to pin running transaction if it exists */ + read_lock(&journal->j_state_lock); + __jbd2_journal_unreserve_handle(handle, journal->j_running_transaction); + read_unlock(&journal->j_state_lock); jbd2_free_handle(handle); } EXPORT_SYMBOL(jbd2_journal_free_reserved); @@ -722,7 +729,8 @@ static void stop_this_handle(handle_t *handle) atomic_sub(handle->h_total_credits, &transaction->t_outstanding_credits); if (handle->h_rsv_handle) - __jbd2_journal_unreserve_handle(handle->h_rsv_handle); + __jbd2_journal_unreserve_handle(handle->h_rsv_handle, + transaction); if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); From 9f364e1d9537d44fdf58deeeddb51277d8327ce5 Mon Sep 17 00:00:00 2001 From: Jonathan Grant Date: Fri, 22 May 2020 16:07:58 +0100 Subject: [PATCH 330/427] add comment for ext4_dir_entry_2 file_type member Signed-off-by: Jonathan Grant 
Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/ad3290d5-86af-99c1-f9d5-cd1bab710429@jguk.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index dbc36e377eb0..a94e7aaea6e6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2061,7 +2061,7 @@ struct ext4_dir_entry_2 { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ - __u8 file_type; + __u8 file_type; /* See file type macros EXT4_FT_* below */ char name[EXT4_NAME_LEN]; /* File name */ }; From 175efa81feb8405676e0136d97b10380179c92e0 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Tue, 5 May 2020 17:43:14 +0200 Subject: [PATCH 331/427] ext4: fix EXT4_MAX_LOGICAL_BLOCK macro ext4 supports max number of logical blocks in a file to be 0xffffffff. (This is since ext4_extent's ee_block is __le32). This means that EXT4_MAX_LOGICAL_BLOCK should be 0xfffffffe (starting from 0 logical offset). This patch fixes this. The issue was seen when ext4 moved to iomap_fiemap API and when overlayfs was mounted on top of ext4. Since overlayfs was missing fiemap_check_ranges(), it could pass an arbitrarily huge length, which led to an overflow in the map.m_len logic. This patch fixes that.
Fixes: d3b6f23f7167 ("ext4: move ext4_fiemap to use iomap framework") Reported-by: syzbot+77fa5bdb65cc39711820@syzkaller.appspotmail.com Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20200505154324.3226743-2-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a94e7aaea6e6..1eb07ca91fca 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -733,7 +733,7 @@ enum { #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* Max logical block we can support */ -#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE /* * Structure of an inode on the disk From 328e24ae14aeb8ef624ec181e0d546b05c34f031 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 May 2020 17:43:15 +0200 Subject: [PATCH 332/427] ext4: fix fiemap size checks for bitmap files Add an extra validation of the len parameter, as for ext4 some files might have smaller file size limits than others. This also means the redundant size check in ext4_ioctl_get_es_cache can go away, as all size checking is done in the shared fiemap handler. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200505154324.3226743-3-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 31 +++++++++++++++++++++++++++++++ fs/ext4/ioctl.c | 33 ++------------------------------- 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ce394706c61a..844773d3b64b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4862,6 +4862,28 @@ static const struct iomap_ops ext4_iomap_xattr_ops = { .iomap_begin = ext4_iomap_xattr_begin, }; +static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) +{ + u64 maxbytes; + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + maxbytes = inode->i_sb->s_maxbytes; + else + maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + + if (*len == 0) + return -EINVAL; + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (*len > maxbytes || (maxbytes - *len) < start) + *len = maxbytes - start; + return 0; +} + static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, bool from_es_cache) { @@ -4882,6 +4904,15 @@ static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) return -EBADR; + /* + * For bitmap files the maximum size limit could be smaller than + * s_maxbytes, so check len here manually instead of just relying on the + * generic check. 
+ */ + error = ext4_fiemap_check_ranges(inode, start, &len); + if (error) + return error; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; error = iomap_fiemap(inode, fieinfo, start, len, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bfc1281fc4cb..0746532ba463 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -733,29 +733,6 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); } -/* copied from fs/ioctl.c */ -static int fiemap_check_ranges(struct super_block *sb, - u64 start, u64 len, u64 *new_len) -{ - u64 maxbytes = (u64) sb->s_maxbytes; - - *new_len = len; - - if (len == 0) - return -EINVAL; - - if (start > maxbytes) - return -EFBIG; - - /* - * Shrink request scope to what the fs can actually handle. - */ - if (len > maxbytes || (maxbytes - len) < start) - *new_len = maxbytes - start; - - return 0; -} - /* So that the fiemap access checks can't overflow on 32 bit machines. 
*/ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) @@ -765,8 +742,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); - struct super_block *sb = inode->i_sb; - u64 len; int error; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) @@ -775,11 +750,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; - error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, - &len); - if (error) - return error; - fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; @@ -792,7 +762,8 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) filemap_write_and_wait(inode->i_mapping); - error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len); + error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, + fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) From 03a5ed24c9b8f0180a59ba7b7b9b9517fcf4335b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:08 +0200 Subject: [PATCH 333/427] ext4: split _ext4_fiemap The fiemap and EXT4_IOC_GET_ES_CACHE cases share almost no code, so split them into entirely separate functions. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200523073016.2944131-2-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 72 +++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 844773d3b64b..9e90f324d75f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4884,11 +4884,9 @@ static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) return 0; } -static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, bool from_es_cache) +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) { - ext4_lblk_t start_blk; - u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR; int error = 0; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { @@ -4898,10 +4896,7 @@ static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - if (from_es_cache) - ext4_fiemap_flags &= FIEMAP_FLAG_XATTR; - - if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) + if (fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)) return -EBADR; /* @@ -4915,40 +4910,20 @@ static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; - error = iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_xattr_ops); - } else if (!from_es_cache) { - error = iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_report_ops); - } else { - ext4_lblk_t len_blks; - __u64 last_blk; - - start_blk = start >> inode->i_sb->s_blocksize_bits; - last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; - if (last_blk >= EXT_MAX_BLOCKS) - last_blk = EXT_MAX_BLOCKS-1; - len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; - - /* - * Walk the extent tree 
gathering extent information - * and pushing extents back to the user. - */ - error = ext4_fill_es_cache_info(inode, start_blk, len_blks, - fieinfo); + return iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); } - return error; -} -int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) -{ - return _ext4_fiemap(inode, fieinfo, start, len, false); + return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { + ext4_lblk_t start_blk, len_blks; + __u64 last_blk; + int error = 0; + if (ext4_has_inline_data(inode)) { int has_inline; @@ -4959,9 +4934,32 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, return 0; } - return _ext4_fiemap(inode, fieinfo, start, len, true); -} + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + return error; + fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; + } + if (fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)) + return -EBADR; + + error = ext4_fiemap_check_ranges(inode, start, &len); + if (error) + return error; + + start_blk = start >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; + + /* + * Walk the extent tree gathering extent information + * and pushing extents back to the user. + */ + return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo); +} /* * ext4_access_path: From da565e792be540a5726af7f8cd50b282af0358b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:09 +0200 Subject: [PATCH 334/427] ext4: remove the call to fiemap_check_flags in ext4_fiemap iomap_fiemap already calls fiemap_check_flags first thing, so this additional check is redundant. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200523073016.2944131-3-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9e90f324d75f..b365c8b407c4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4896,9 +4896,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - if (fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)) - return -EBADR; - /* * For bitmap files the maximum size limit could be smaller than * s_maxbytes, so check len here manually instead of just relying on the From 44ebcd06bbb3ab3ee446b933800aca32fc4ca9b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:10 +0200 Subject: [PATCH 335/427] fs: mark __generic_block_fiemap static There is no caller left outside of ioctl.c. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-4-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ioctl.c | 4 +--- include/linux/fs.h | 4 ---- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/ioctl.c b/fs/ioctl.c index 5e80b40bc1b5..8fe5131b1dee 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -307,8 +307,7 @@ static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) * If you use this function directly, you need to do your own locking. Use * generic_block_fiemap if you want the locking done for you. 
*/ - -int __generic_block_fiemap(struct inode *inode, +static int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, get_block_t *get_block) { @@ -453,7 +452,6 @@ int __generic_block_fiemap(struct inode *inode, return ret; } -EXPORT_SYMBOL(__generic_block_fiemap); /** * generic_block_fiemap - FIEMAP for block based inodes diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..3104c6f7527b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3299,10 +3299,6 @@ static inline int vfs_fstat(int fd, struct kstat *stat) extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); -extern int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - loff_t start, loff_t len, - get_block_t *get_block); extern int generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, get_block_t *get_block); From 10c5db286452b8c60e8f58e9a4c1cbc5a91e4e5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:11 +0200 Subject: [PATCH 336/427] fs: move the fiemap definitions out of fs.h No need to pull the fiemap definitions into almost every file in the kernel build. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20200523073016.2944131-5-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/bad_inode.c | 1 + fs/btrfs/extent_io.h | 1 + fs/cifs/inode.c | 1 + fs/cifs/smb2ops.c | 1 + fs/ext2/inode.c | 1 + fs/ext4/ext4.h | 1 + fs/f2fs/data.c | 1 + fs/f2fs/inline.c | 1 + fs/gfs2/inode.c | 1 + fs/hpfs/file.c | 1 + fs/ioctl.c | 1 + fs/iomap/fiemap.c | 1 + fs/nilfs2/inode.c | 1 + fs/overlayfs/inode.c | 1 + fs/xfs/xfs_iops.c | 1 + include/linux/fiemap.h | 24 ++++++++++++++++++++++++ include/linux/fs.h | 19 +------------------ include/uapi/linux/fiemap.h | 6 +++--- 18 files changed, 43 insertions(+), 21 deletions(-) create mode 100644 include/linux/fiemap.h diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 8035d2a44561..54f0ce444272 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -15,6 +15,7 @@ #include #include #include +#include static int bad_file_open(struct inode *inode, struct file *filp) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2ed65bd0760e..817698bc0669 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -5,6 +5,7 @@ #include #include +#include #include "ulist.h" /* diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 390d2b15ef6e..3f276eb8ca68 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "cifsfs.h" diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f829f4165d38..09047f1ddfb6 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "cifsfs.h" #include "cifsglob.h" #include "smb2pdu.h" diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c885cf7d724b..0f12a0e8a8d9 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "ext2.h" #include "acl.h" #include "xattr.h" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1eb07ca91fca..9e5c332a2b94 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -36,6 +36,7 @@ 
#include #include #include +#include #ifdef __KERNEL__ #include #endif diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cdf2f626bea7..25abbbb65ba0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4167e5408151..9686ffea177e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -8,6 +8,7 @@ #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 70b2d3a1e866..4842f313a808 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "gfs2.h" diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b36abf9cb345..62959a8e43ad 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -9,6 +9,7 @@ #include "hpfs_fn.h" #include +#include #define BLOCKS(size) (((size) + 511) >> 9) diff --git a/fs/ioctl.c b/fs/ioctl.c index 8fe5131b1dee..3f300cc07dee 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index d55e8f491a5e..0a807bbb2b4a 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -6,6 +6,7 @@ #include #include #include +#include struct fiemap_ctx { struct fiemap_extent_info *fi; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 671085512e0f..6e1aca38931f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b0d42ece4d7c..b5fec3410556 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "overlayfs.h" diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f7a99b3bbcf7..44c353998ac5 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ 
-25,6 +25,7 @@ #include #include #include +#include /* * Directories have different lock order w.r.t. mmap_sem compared to regular diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h new file mode 100644 index 000000000000..240d4f7d9116 --- /dev/null +++ b/include/linux/fiemap.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FIEMAP_H +#define _LINUX_FIEMAP_H 1 + +#include +#include + +struct fiemap_extent_info { + unsigned int fi_flags; /* Flags as passed from user */ + unsigned int fi_extents_mapped; /* Number of mapped extents */ + unsigned int fi_extents_max; /* Size of fiemap_extent array */ + struct fiemap_extent __user *fi_extents_start; /* Start of + fiemap_extent array */ +}; + +int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, + u64 phys, u64 len, u32 flags); +int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); + +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, u64 len, + get_block_t *get_block); + +#endif /* _LINUX_FIEMAP_H 1 */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3104c6f7527b..09bcd329c062 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ struct backing_dev_info; struct bdi_writeback; struct bio; struct export_operations; +struct fiemap_extent_info; struct hd_geometry; struct iovec; struct kiocb; @@ -1745,19 +1745,6 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, extern void inode_init_owner(struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); -/* - * VFS FS_IOC_FIEMAP helper definitions. 
- */ -struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent __user *fi_extents_start; /* Start of - fiemap_extent array */ -}; -int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, - u64 phys, u64 len, u32 flags); -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); /* * This is the "filldir" function type, used by readdir() to let @@ -3299,10 +3286,6 @@ static inline int vfs_fstat(int fd, struct kstat *stat) extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); -extern int generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block); - extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 7a900b2377b6..24ca0c00cae3 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -9,8 +9,8 @@ * Andreas Dilger */ -#ifndef _LINUX_FIEMAP_H -#define _LINUX_FIEMAP_H +#ifndef _UAPI_LINUX_FIEMAP_H +#define _UAPI_LINUX_FIEMAP_H #include @@ -67,4 +67,4 @@ struct fiemap { #define FIEMAP_EXTENT_SHARED 0x00002000 /* Space shared with other * files. */ -#endif /* _LINUX_FIEMAP_H */ +#endif /* _UAPI_LINUX_FIEMAP_H */ From 2732881894714f545ffac42dad7ba7730069874d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:12 +0200 Subject: [PATCH 337/427] iomap: fix the iomap_fiemap prototype iomap_fiemap should take u64 start and len arguments, just like the ->fiemap prototype. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20200523073016.2944131-6-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/iomap/fiemap.c | 2 +- include/linux/iomap.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 0a807bbb2b4a..449705575acf 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -66,7 +66,7 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, - loff_t start, loff_t len, const struct iomap_ops *ops) + u64 start, u64 len, const struct iomap_ops *ops) { struct fiemap_ctx ctx; loff_t ret; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b09463dae0d..63db02528b70 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -178,7 +178,7 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - loff_t start, loff_t len, const struct iomap_ops *ops); + u64 start, u64 len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops); loff_t iomap_seek_data(struct inode *inode, loff_t offset, From cddf8a2c4a8286ae60fc866eab59c8bc524e93a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:13 +0200 Subject: [PATCH 338/427] fs: move fiemap range validation into the file systems instances Replace fiemap_check_flags with a fiemap_prep helper that also takes the inode and mapped range, and performs the sanity check and truncation previously done in fiemap_check_range. This way the validation is inside the file system itself and thus properly works for the stacked overlayfs case as well. Signed-off-by: Christoph Hellwig Reviewed-by: Amir Goldstein Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20200523073016.2944131-7-hch@lst.de Signed-off-by: Theodore Ts'o --- Documentation/filesystems/fiemap.txt | 12 +++--- fs/btrfs/inode.c | 2 +- fs/cifs/smb2ops.c | 6 ++- fs/ext4/extents.c | 5 ++- fs/f2fs/data.c | 3 +- fs/ioctl.c | 63 +++++++++++----------------- fs/iomap/fiemap.c | 2 +- fs/nilfs2/inode.c | 2 +- fs/ocfs2/extent_map.c | 3 +- include/linux/fiemap.h | 3 +- 10 files changed, 47 insertions(+), 54 deletions(-) diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt index ac87e6fda842..35c8571eccb6 100644 --- a/Documentation/filesystems/fiemap.txt +++ b/Documentation/filesystems/fiemap.txt @@ -203,16 +203,18 @@ EINTR once fatal signal received. Flag checking should be done at the beginning of the ->fiemap callback via the -fiemap_check_flags() helper: +fiemap_prep() helper: -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); +int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags); The struct fieinfo should be passed in as received from ioctl_fiemap(). The set of fiemap flags which the fs understands should be passed via fs_flags. If -fiemap_check_flags finds invalid user flags, it will place the bad values in +fiemap_prep finds invalid user flags, it will place the bad values in fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from -fiemap_check_flags(), it should immediately exit, returning that error back to -ioctl_fiemap(). +fiemap_prep(), it should immediately exit, returning that error back to +ioctl_fiemap(). Additionally the range is validated against the supported +maximum file size.
For each extent in the request range, the file system should call diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 320d1062068d..1f1ec361089b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8250,7 +8250,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { int ret; - ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, start, &len, BTRFS_FIEMAP_FLAGS); if (ret) return ret; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 09047f1ddfb6..828e53e795c6 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3408,8 +3408,10 @@ static int smb3_fiemap(struct cifs_tcon *tcon, int i, num, rc, flags, last_blob; u64 next; - if (fiemap_check_flags(fei, FIEMAP_FLAG_SYNC)) - return -EBADR; + rc = fiemap_prep(d_inode(cfile->dentry), fei, start, &len, + FIEMAP_FLAG_SYNC); + if (rc) + return rc; xid = get_xid(); again: diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b365c8b407c4..cea083efb650 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4938,8 +4938,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - if (fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)) - return -EBADR; + error = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); + if (error) + return error; error = ext4_fiemap_check_ranges(inode, start, &len); if (error) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 25abbbb65ba0..03faafc591b1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1825,7 +1825,8 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); + ret = fiemap_prep(inode, fieinfo, start, &len, + FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); if (ret) return ret; diff --git a/fs/ioctl.c b/fs/ioctl.c index 3f300cc07dee..56bbf02209ae 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -149,61 +149,50 @@ int 
fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, EXPORT_SYMBOL(fiemap_fill_next_extent); /** - * fiemap_check_flags - check validity of requested flags for fiemap + * fiemap_prep - check validity of requested flags for fiemap + * @inode: Inode to operate on * @fieinfo: Fiemap context passed into ->fiemap - * @fs_flags: Set of fiemap flags that the file system understands + * @start: Start of the mapped range + * @len: Length of the mapped range, can be truncated by this function. + * @supported_flags: Set of fiemap flags that the file system understands * - * Called from file system ->fiemap callback. This will compute the - * intersection of valid fiemap flags and those that the fs supports. That - * value is then compared against the user supplied flags. In case of bad user - * flags, the invalid values will be written into the fieinfo structure, and - * -EBADR is returned, which tells ioctl_fiemap() to return those values to - * userspace. For this reason, a return code of -EBADR should be preserved. + * This function must be called from each ->fiemap instance to validate the + * fiemap request against the file system parameters. * - * Returns 0 on success, -EBADR on bad flags. + * Returns 0 on success, or a negative error on failure. 
*/ -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags) +int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags) { + u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; - incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags); - if (incompat_flags) { - fieinfo->fi_flags = incompat_flags; - return -EBADR; - } - return 0; -} -EXPORT_SYMBOL(fiemap_check_flags); - -static int fiemap_check_ranges(struct super_block *sb, - u64 start, u64 len, u64 *new_len) -{ - u64 maxbytes = (u64) sb->s_maxbytes; - - *new_len = len; - - if (len == 0) + if (*len == 0) return -EINVAL; - if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ - if (len > maxbytes || (maxbytes - len) < start) - *new_len = maxbytes - start; + if (*len > maxbytes || (maxbytes - *len) < start) + *len = maxbytes - start; + supported_flags &= FIEMAP_FLAGS_COMPAT; + incompat_flags = fieinfo->fi_flags & ~supported_flags; + if (incompat_flags) { + fieinfo->fi_flags = incompat_flags; + return -EBADR; + } return 0; } +EXPORT_SYMBOL(fiemap_prep); static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); - struct super_block *sb = inode->i_sb; - u64 len; int error; if (!inode->i_op->fiemap) @@ -215,11 +204,6 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; - error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, - &len); - if (error) - return error; - fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; @@ -232,7 +216,8 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) 
filemap_write_and_wait(inode->i_mapping); - error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); + error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, + fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) @@ -320,7 +305,7 @@ static int __generic_block_fiemap(struct inode *inode, bool past_eof = false, whole_file = false; int ret = 0; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 449705575acf..89dca4a97e4a 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -75,7 +75,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, ctx.fi = fi; ctx.prev.type = IOMAP_HOLE; - ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fi, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6e1aca38931f..052c2da11e4d 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1006,7 +1006,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, unsigned int blkbits = inode->i_blkbits; int ret, n; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e3e2d1b2af51..3744179b73fa 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -746,7 +746,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; - ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, map_start, &map_len, + OCFS2_FIEMAP_FLAGS); if (ret) return ret; diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h index 
240d4f7d9116..4e624c466583 100644 --- a/include/linux/fiemap.h +++ b/include/linux/fiemap.h @@ -13,9 +13,10 @@ struct fiemap_extent_info { fiemap_extent array */ }; +int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags); int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, u64 phys, u64 len, u32 flags); -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); int generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, From 45dd052e67ad17c7a24874a783f41aeab15bc294 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:14 +0200 Subject: [PATCH 339/427] fs: handle FIEMAP_FLAG_SYNC in fiemap_prep By moving FIEMAP_FLAG_SYNC handling to fiemap_prep we ensure it is handled once instead of duplicated, but can still be done under fs locks, like xfs/iomap intended with its duplicate handling. Also make sure the error value of filemap_write_and_wait is propagated to user space. Signed-off-by: Christoph Hellwig Reviewed-by: Amir Goldstein Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20200523073016.2944131-8-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/btrfs/inode.c | 4 +--- fs/cifs/smb2ops.c | 3 +-- fs/ext4/extents.c | 2 +- fs/ext4/ioctl.c | 3 --- fs/f2fs/data.c | 3 +-- fs/ioctl.c | 10 ++++++---- fs/iomap/fiemap.c | 8 +------- fs/nilfs2/inode.c | 2 +- fs/ocfs2/extent_map.c | 5 +---- fs/overlayfs/inode.c | 4 ---- 10 files changed, 13 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f1ec361089b..529ffa5e7b45 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8243,14 +8243,12 @@ out: return ret; } -#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) - static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { int ret; - ret = fiemap_prep(inode, fieinfo, start, &len, BTRFS_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 828e53e795c6..300ade2acc41 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3408,8 +3408,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon, int i, num, rc, flags, last_blob; u64 next; - rc = fiemap_prep(d_inode(cfile->dentry), fei, start, &len, - FIEMAP_FLAG_SYNC); + rc = fiemap_prep(d_inode(cfile->dentry), fei, start, &len, 0); if (rc) return rc; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cea083efb650..7d088ff1e902 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4938,7 +4938,7 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - error = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); + error = fiemap_prep(inode, fieinfo, start, &len, 0); if (error) return error; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0746532ba463..f81acbbb1b12 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -759,9 +759,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) 
fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) return -EFAULT; - if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) - filemap_write_and_wait(inode->i_mapping); - error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 03faafc591b1..9de7dc476ed1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1825,8 +1825,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - ret = fiemap_prep(inode, fieinfo, start, &len, - FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_XATTR); if (ret) return ret; diff --git a/fs/ioctl.c b/fs/ioctl.c index 56bbf02209ae..b16e962340db 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -166,6 +166,7 @@ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, { u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; + int ret = 0; if (*len == 0) return -EINVAL; @@ -178,13 +179,17 @@ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; + supported_flags |= FIEMAP_FLAG_SYNC; supported_flags &= FIEMAP_FLAGS_COMPAT; incompat_flags = fieinfo->fi_flags & ~supported_flags; if (incompat_flags) { fieinfo->fi_flags = incompat_flags; return -EBADR; } - return 0; + + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) + ret = filemap_write_and_wait(inode->i_mapping); + return ret; } EXPORT_SYMBOL(fiemap_prep); @@ -213,9 +218,6 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) return -EFAULT; - if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) - filemap_write_and_wait(inode->i_mapping); - error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 89dca4a97e4a..aab070df4a21 100644 
--- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -75,16 +75,10 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, ctx.fi = fi; ctx.prev.type = IOMAP_HOLE; - ret = fiemap_prep(inode, fi, start, &len, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fi, start, &len, 0); if (ret) return ret; - if (fi->fi_flags & FIEMAP_FLAG_SYNC) { - ret = filemap_write_and_wait(inode->i_mapping); - if (ret) - return ret; - } - while (len > 0) { ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, iomap_fiemap_actor); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 052c2da11e4d..25b0d368ecdb 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1006,7 +1006,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, unsigned int blkbits = inode->i_blkbits; int ret, n; - ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 3744179b73fa..a94852af5510 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -733,8 +733,6 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, return 0; } -#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) - int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len) { @@ -746,8 +744,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; - ret = fiemap_prep(inode, fieinfo, map_start, &map_len, - OCFS2_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, map_start, &map_len, 0); if (ret) return ret; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b5fec3410556..c7cb883c47b8 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -462,10 +462,6 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -EOPNOTSUPP; old_cred = 
ovl_override_creds(inode->i_sb); - - if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) - filemap_write_and_wait(realinode->i_mapping); - err = realinode->i_op->fiemap(realinode, fieinfo, start, len); revert_creds(old_cred); From c7d216e8c44cfc0b680d8c0de4f9bdafd92f7ef6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:15 +0200 Subject: [PATCH 340/427] fs: remove the access_ok() check in ioctl_fiemap access_ok just checks we are fed a proper user pointer. We also do that in copy_to_user itself, so no need to do this early. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-9-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ioctl.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ioctl.c b/fs/ioctl.c index b16e962340db..d69786d1dd91 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -213,13 +213,9 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; - if (fiemap.fm_extent_count != 0 && - !access_ok(fieinfo.fi_extents_start, - fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) - return -EFAULT; - error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); + fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) From ba988903937c1b1ce5d54567b50f2ad9604b3bfe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:16 +0200 Subject: [PATCH 341/427] ext4: remove the access_ok() check in ext4_ioctl_get_es_cache access_ok just checks we are fed a proper user pointer. We also do that in copy_to_user itself, so no need to do this early. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200523073016.2944131-10-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f81acbbb1b12..2162db0c747d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -754,11 +754,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; - if (fiemap.fm_extent_count != 0 && - !access_ok(fieinfo.fi_extents_start, - fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) - return -EFAULT; - error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; From 6e014c621e7271649f0d51e54dbe1db4c10486c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 24 May 2020 16:53:16 -0600 Subject: [PATCH 342/427] ext4: don't block for O_DIRECT if IOCB_NOWAIT is set Running with some debug patches to detect illegal blocking triggered the extend/unaligned condition in ext4. If ext4 needs to extend the file (and hence go to buffered IO), or if the app is doing unaligned IO, then ext4 asks the iomap code to wait for IO completion. If the caller asked for no-wait semantics by setting IOCB_NOWAIT, then ext4 should return -EAGAIN instead. 
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/76152096-2bbb-7682-8fce-4cb498bcd909@kernel.dk Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b8e69f9e3858..2a01e31a032c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -502,6 +502,12 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) return ret; + /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */ + if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) { + ret = -EAGAIN; + goto out; + } + offset = iocb->ki_pos; count = ret; From 6b8ed62008a49751fc71fefd2a4f89202a7c2d4d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 25 May 2020 10:12:15 +0200 Subject: [PATCH 343/427] ext4: avoid unnecessary transaction starts during writeback ext4_writepages() currently works in a loop like: start a transaction scan inode for pages to write map and submit these pages stop the transaction This loop results in starting a transaction once more than is needed because in the last iteration we start a transaction only to scan the inode and find there are no pages to write. This can be a significant increase in the number of transaction starts for single-extent files or files that have all blocks already mapped. Furthermore we already know from the previous iteration whether there are more pages to write or not. So propagate the information from mpage_prepare_extent_to_map() and avoid unnecessary looping in case there are no more pages to write. 
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200525081215.29451-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e7bf9388538b..6694f0c8e0f7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1534,6 +1534,7 @@ struct mpage_da_data { struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; + unsigned int scanned_until_end:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, @@ -1549,6 +1550,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, if (mpd->first_page >= mpd->next_page) return; + mpd->scanned_until_end = 0; index = mpd->first_page; end = mpd->next_page - 1; if (invalidate) { @@ -2195,7 +2197,11 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, if (err < 0) return err; } - return lblk < blocks; + if (lblk >= blocks) { + mpd->scanned_until_end = 1; + return 0; + } + return 1; } /* @@ -2553,7 +2559,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, tag); if (nr_pages == 0) - goto out; + break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -2608,6 +2614,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) pagevec_release(&pvec); cond_resched(); } + mpd->scanned_until_end = 1; return 0; out: pagevec_release(&pvec); @@ -2626,7 +2633,6 @@ static int ext4_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); - bool done; struct blk_plug plug; bool give_up_on_write = false; @@ -2712,7 +2718,6 @@ static int ext4_writepages(struct address_space *mapping, retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 
tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); - done = false; blk_start_plug(&plug); /* @@ -2722,6 +2727,7 @@ retry: * started. */ mpd.do_map = 0; + mpd.scanned_until_end = 0; mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd.io_submit.io_end) { ret = -ENOMEM; @@ -2737,7 +2743,7 @@ retry: if (ret < 0) goto unplug; - while (!done && mpd.first_page <= mpd.last_page) { + while (!mpd.scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd.io_submit.io_end) { @@ -2772,20 +2778,9 @@ retry: trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); ret = mpage_prepare_extent_to_map(&mpd); - if (!ret) { - if (mpd.map.m_len) - ret = mpage_map_and_submit_extent(handle, &mpd, + if (!ret && mpd.map.m_len) + ret = mpage_map_and_submit_extent(handle, &mpd, &give_up_on_write); - else { - /* - * We scanned the whole range (or exhausted - * nr_to_write), submitted what was mapped and - * didn't find anything needing mapping. We are - * done. - */ - done = true; - } - } /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit From 1ee0e6d47d08ef309e0975a96d643972855511d0 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 2 Jun 2020 23:17:16 -0500 Subject: [PATCH 344/427] smb3: default to minimum of two channels when multichannel specified When "multichannel" is specified on mount, make sure to default to at least two channels. 
Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/connect.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index daf90f988de1..fdfd7cf4c720 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1964,9 +1964,13 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, break; case Opt_multichannel: vol->multichannel = true; + /* if number of channels not specified, default to 2 */ + if (vol->max_channels < 2) + vol->max_channels = 2; break; case Opt_nomultichannel: vol->multichannel = false; + vol->max_channels = 1; break; case Opt_compress: vol->compression = UNKNOWN_TYPE; From e80ddeb2f70ebd0786aa7cdba3e58bc931fa0bb5 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 3 Jun 2020 01:33:58 -0500 Subject: [PATCH 345/427] smb3: fix incorrect number of credits when ioctl MaxOutputResponse > 64K We were not checking to see if ioctl requests asked for more than 64K (ie when CIFSMaxBufSize was > 64K) so when setting larger CIFSMaxBufSize then ioctls would fail with invalid parameter errors. When requests ask for more than 64K in MaxOutputResponse then we need to ask for more than 1 credit. Signed-off-by: Steve French CC: Stable Reviewed-by: Aurelien Aptel --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 06463f386a60..12de0af12f75 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2938,7 +2938,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, * response size smaller. 
*/ req->MaxOutputResponse = cpu_to_le32(max_response_size); - + req->sync_hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(max_response_size, SMB2_MAX_BUFFER_SIZE)); if (is_fsctl) req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); else From 8d7e33d6811fbd24d3a1476a1b481b704975352a Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 14 May 2020 21:03:21 +0900 Subject: [PATCH 346/427] PCI: uniphier: Add Socionext UniPhier Pro5 PCIe endpoint controller driver Add driver for the Socionext UniPhier Pro5 SoC endpoint controller. This controller is based on the DesignWare PCIe core. And add "host" to existing controller descriptions for the host controller in Kconfig. Link: https://lore.kernel.org/r/1589457801-12796-3-git-send-email-hayashi.kunihiko@socionext.com Signed-off-by: Kunihiko Hayashi Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- MAINTAINERS | 2 +- drivers/pci/controller/dwc/Kconfig | 14 +- drivers/pci/controller/dwc/Makefile | 1 + drivers/pci/controller/dwc/pcie-uniphier-ep.c | 383 ++++++++++++++++++ 4 files changed, 397 insertions(+), 3 deletions(-) create mode 100644 drivers/pci/controller/dwc/pcie-uniphier-ep.c diff --git a/MAINTAINERS b/MAINTAINERS index dc7b42d1d2ce..12c0b2fe9c32 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13143,7 +13143,7 @@ M: Kunihiko Hayashi L: linux-pci@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/pci/uniphier-pcie* -F: drivers/pci/controller/dwc/pcie-uniphier.c +F: drivers/pci/controller/dwc/pcie-uniphier* PCIE DRIVER FOR ST SPEAR13XX M: Pratyush Anand diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 03dcaf65d159..426ef5cbae45 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -281,15 +281,25 @@ config PCIE_TEGRA194_EP selected. This uses the DesignWare core. 
config PCIE_UNIPHIER - bool "Socionext UniPhier PCIe controllers" + bool "Socionext UniPhier PCIe host controllers" depends on ARCH_UNIPHIER || COMPILE_TEST depends on OF && HAS_IOMEM depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST help - Say Y here if you want PCIe controller support on UniPhier SoCs. + Say Y here if you want PCIe host controller support on UniPhier SoCs. This driver supports LD20 and PXs3 SoCs. +config PCIE_UNIPHIER_EP + bool "Socionext UniPhier PCIe endpoint controllers" + depends on ARCH_UNIPHIER || COMPILE_TEST + depends on OF && HAS_IOMEM + depends on PCI_ENDPOINT + select PCIE_DW_EP + help + Say Y here if you want PCIe endpoint controller support on + UniPhier SoCs. This driver supports Pro5 SoC. + config PCIE_AL bool "Amazon Annapurna Labs PCIe controller" depends on OF && (ARM64 || COMPILE_TEST) diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index 8a637cfcf6e9..a751553fa0db 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_PCIE_HISI_STB) += pcie-histb.o obj-$(CONFIG_PCI_MESON) += pci-meson.o obj-$(CONFIG_PCIE_TEGRA194) += pcie-tegra194.o obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o +obj-$(CONFIG_PCIE_UNIPHIER_EP) += pcie-uniphier-ep.o # The following drivers are for devices that use the generic ACPI # pci_root.c driver but don't support standard ECAM config access. diff --git a/drivers/pci/controller/dwc/pcie-uniphier-ep.c b/drivers/pci/controller/dwc/pcie-uniphier-ep.c new file mode 100644 index 000000000000..148355960061 --- /dev/null +++ b/drivers/pci/controller/dwc/pcie-uniphier-ep.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCIe endpoint controller driver for UniPhier SoCs + * Copyright 2018 Socionext Inc. 
+ * Author: Kunihiko Hayashi + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pcie-designware.h" + +/* Link Glue registers */ +#define PCL_RSTCTRL0 0x0010 +#define PCL_RSTCTRL_AXI_REG BIT(3) +#define PCL_RSTCTRL_AXI_SLAVE BIT(2) +#define PCL_RSTCTRL_AXI_MASTER BIT(1) +#define PCL_RSTCTRL_PIPE3 BIT(0) + +#define PCL_RSTCTRL1 0x0020 +#define PCL_RSTCTRL_PERST BIT(0) + +#define PCL_RSTCTRL2 0x0024 +#define PCL_RSTCTRL_PHY_RESET BIT(0) + +#define PCL_MODE 0x8000 +#define PCL_MODE_REGEN BIT(8) +#define PCL_MODE_REGVAL BIT(0) + +#define PCL_APP_CLK_CTRL 0x8004 +#define PCL_APP_CLK_REQ BIT(0) + +#define PCL_APP_READY_CTRL 0x8008 +#define PCL_APP_LTSSM_ENABLE BIT(0) + +#define PCL_APP_MSI0 0x8040 +#define PCL_APP_VEN_MSI_TC_MASK GENMASK(10, 8) +#define PCL_APP_VEN_MSI_VECTOR_MASK GENMASK(4, 0) + +#define PCL_APP_MSI1 0x8044 +#define PCL_APP_MSI_REQ BIT(0) + +#define PCL_APP_INTX 0x8074 +#define PCL_APP_INTX_SYS_INT BIT(0) + +/* assertion time of INTx in usec */ +#define PCL_INTX_WIDTH_USEC 30 + +struct uniphier_pcie_ep_priv { + void __iomem *base; + struct dw_pcie pci; + struct clk *clk, *clk_gio; + struct reset_control *rst, *rst_gio; + struct phy *phy; + const struct pci_epc_features *features; +}; + +#define to_uniphier_pcie(x) dev_get_drvdata((x)->dev) + +static void uniphier_pcie_ltssm_enable(struct uniphier_pcie_ep_priv *priv, + bool enable) +{ + u32 val; + + val = readl(priv->base + PCL_APP_READY_CTRL); + if (enable) + val |= PCL_APP_LTSSM_ENABLE; + else + val &= ~PCL_APP_LTSSM_ENABLE; + writel(val, priv->base + PCL_APP_READY_CTRL); +} + +static void uniphier_pcie_phy_reset(struct uniphier_pcie_ep_priv *priv, + bool assert) +{ + u32 val; + + val = readl(priv->base + PCL_RSTCTRL2); + if (assert) + val |= PCL_RSTCTRL_PHY_RESET; + else + val &= ~PCL_RSTCTRL_PHY_RESET; + writel(val, priv->base + PCL_RSTCTRL2); +} + +static void uniphier_pcie_init_ep(struct uniphier_pcie_ep_priv *priv) +{ + u32 val; + 
+ /* set EP mode */ + val = readl(priv->base + PCL_MODE); + val |= PCL_MODE_REGEN | PCL_MODE_REGVAL; + writel(val, priv->base + PCL_MODE); + + /* clock request */ + val = readl(priv->base + PCL_APP_CLK_CTRL); + val &= ~PCL_APP_CLK_REQ; + writel(val, priv->base + PCL_APP_CLK_CTRL); + + /* deassert PIPE3 and AXI reset */ + val = readl(priv->base + PCL_RSTCTRL0); + val |= PCL_RSTCTRL_AXI_REG | PCL_RSTCTRL_AXI_SLAVE + | PCL_RSTCTRL_AXI_MASTER | PCL_RSTCTRL_PIPE3; + writel(val, priv->base + PCL_RSTCTRL0); + + uniphier_pcie_ltssm_enable(priv, false); + + msleep(100); +} + +static int uniphier_pcie_start_link(struct dw_pcie *pci) +{ + struct uniphier_pcie_ep_priv *priv = to_uniphier_pcie(pci); + + uniphier_pcie_ltssm_enable(priv, true); + + return 0; +} + +static void uniphier_pcie_stop_link(struct dw_pcie *pci) +{ + struct uniphier_pcie_ep_priv *priv = to_uniphier_pcie(pci); + + uniphier_pcie_ltssm_enable(priv, false); +} + +static void uniphier_pcie_ep_init(struct dw_pcie_ep *ep) +{ + struct dw_pcie *pci = to_dw_pcie_from_ep(ep); + enum pci_barno bar; + + for (bar = BAR_0; bar <= BAR_5; bar++) + dw_pcie_ep_reset_bar(pci, bar); +} + +static int uniphier_pcie_ep_raise_legacy_irq(struct dw_pcie_ep *ep) +{ + struct dw_pcie *pci = to_dw_pcie_from_ep(ep); + struct uniphier_pcie_ep_priv *priv = to_uniphier_pcie(pci); + u32 val; + + /* + * This makes pulse signal to send INTx to the RC, so this should + * be cleared as soon as possible. This sequence is covered with + * mutex in pci_epc_raise_irq(). 
+ */ + /* assert INTx */ + val = readl(priv->base + PCL_APP_INTX); + val |= PCL_APP_INTX_SYS_INT; + writel(val, priv->base + PCL_APP_INTX); + + udelay(PCL_INTX_WIDTH_USEC); + + /* deassert INTx */ + val &= ~PCL_APP_INTX_SYS_INT; + writel(val, priv->base + PCL_APP_INTX); + + return 0; +} + +static int uniphier_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, + u8 func_no, u16 interrupt_num) +{ + struct dw_pcie *pci = to_dw_pcie_from_ep(ep); + struct uniphier_pcie_ep_priv *priv = to_uniphier_pcie(pci); + u32 val; + + val = FIELD_PREP(PCL_APP_VEN_MSI_TC_MASK, func_no) + | FIELD_PREP(PCL_APP_VEN_MSI_VECTOR_MASK, interrupt_num - 1); + writel(val, priv->base + PCL_APP_MSI0); + + val = readl(priv->base + PCL_APP_MSI1); + val |= PCL_APP_MSI_REQ; + writel(val, priv->base + PCL_APP_MSI1); + + return 0; +} + +static int uniphier_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no, + enum pci_epc_irq_type type, + u16 interrupt_num) +{ + struct dw_pcie *pci = to_dw_pcie_from_ep(ep); + + switch (type) { + case PCI_EPC_IRQ_LEGACY: + return uniphier_pcie_ep_raise_legacy_irq(ep); + case PCI_EPC_IRQ_MSI: + return uniphier_pcie_ep_raise_msi_irq(ep, func_no, + interrupt_num); + default: + dev_err(pci->dev, "UNKNOWN IRQ type (%d)\n", type); + } + + return 0; +} + +static const struct pci_epc_features* +uniphier_pcie_get_features(struct dw_pcie_ep *ep) +{ + struct dw_pcie *pci = to_dw_pcie_from_ep(ep); + struct uniphier_pcie_ep_priv *priv = to_uniphier_pcie(pci); + + return priv->features; +} + +static const struct dw_pcie_ep_ops uniphier_pcie_ep_ops = { + .ep_init = uniphier_pcie_ep_init, + .raise_irq = uniphier_pcie_ep_raise_irq, + .get_features = uniphier_pcie_get_features, +}; + +static int uniphier_add_pcie_ep(struct uniphier_pcie_ep_priv *priv, + struct platform_device *pdev) +{ + struct dw_pcie *pci = &priv->pci; + struct dw_pcie_ep *ep = &pci->ep; + struct device *dev = &pdev->dev; + struct resource *res; + int ret; + + ep->ops = &uniphier_pcie_ep_ops; + + pci->dbi_base2 = 
devm_platform_ioremap_resource_byname(pdev, "dbi2"); + if (IS_ERR(pci->dbi_base2)) + return PTR_ERR(pci->dbi_base2); + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "addr_space"); + if (!res) + return -EINVAL; + + ep->phys_base = res->start; + ep->addr_size = resource_size(res); + + ret = dw_pcie_ep_init(ep); + if (ret) + dev_err(dev, "Failed to initialize endpoint (%d)\n", ret); + + return ret; +} + +static int uniphier_pcie_ep_enable(struct uniphier_pcie_ep_priv *priv) +{ + int ret; + + ret = clk_prepare_enable(priv->clk); + if (ret) + return ret; + + ret = clk_prepare_enable(priv->clk_gio); + if (ret) + goto out_clk_disable; + + ret = reset_control_deassert(priv->rst); + if (ret) + goto out_clk_gio_disable; + + ret = reset_control_deassert(priv->rst_gio); + if (ret) + goto out_rst_assert; + + uniphier_pcie_init_ep(priv); + + uniphier_pcie_phy_reset(priv, true); + + ret = phy_init(priv->phy); + if (ret) + goto out_rst_gio_assert; + + uniphier_pcie_phy_reset(priv, false); + + return 0; + +out_rst_gio_assert: + reset_control_assert(priv->rst_gio); +out_rst_assert: + reset_control_assert(priv->rst); +out_clk_gio_disable: + clk_disable_unprepare(priv->clk_gio); +out_clk_disable: + clk_disable_unprepare(priv->clk); + + return ret; +} + +static const struct dw_pcie_ops dw_pcie_ops = { + .start_link = uniphier_pcie_start_link, + .stop_link = uniphier_pcie_stop_link, +}; + +static int uniphier_pcie_ep_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct uniphier_pcie_ep_priv *priv; + struct resource *res; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->features = of_device_get_match_data(dev); + if (WARN_ON(!priv->features)) + return -EINVAL; + + priv->pci.dev = dev; + priv->pci.ops = &dw_pcie_ops; + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); + priv->pci.dbi_base = devm_pci_remap_cfg_resource(dev, res); + if (IS_ERR(priv->pci.dbi_base)) + return 
PTR_ERR(priv->pci.dbi_base); + + priv->base = devm_platform_ioremap_resource_byname(pdev, "link"); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + priv->clk_gio = devm_clk_get(dev, "gio"); + if (IS_ERR(priv->clk_gio)) + return PTR_ERR(priv->clk_gio); + + priv->rst_gio = devm_reset_control_get_shared(dev, "gio"); + if (IS_ERR(priv->rst_gio)) + return PTR_ERR(priv->rst_gio); + + priv->clk = devm_clk_get(dev, "link"); + if (IS_ERR(priv->clk)) + return PTR_ERR(priv->clk); + + priv->rst = devm_reset_control_get_shared(dev, "link"); + if (IS_ERR(priv->rst)) + return PTR_ERR(priv->rst); + + priv->phy = devm_phy_optional_get(dev, "pcie-phy"); + if (IS_ERR(priv->phy)) { + ret = PTR_ERR(priv->phy); + dev_err(dev, "Failed to get phy (%d)\n", ret); + return ret; + } + + platform_set_drvdata(pdev, priv); + + ret = uniphier_pcie_ep_enable(priv); + if (ret) + return ret; + + return uniphier_add_pcie_ep(priv, pdev); +} + +static const struct pci_epc_features uniphier_pro5_data = { + .linkup_notifier = false, + .msi_capable = true, + .msix_capable = false, + .align = 1 << 16, + .bar_fixed_64bit = BIT(BAR_0) | BIT(BAR_2) | BIT(BAR_4), + .reserved_bar = BIT(BAR_4), +}; + +static const struct of_device_id uniphier_pcie_ep_match[] = { + { + .compatible = "socionext,uniphier-pro5-pcie-ep", + .data = &uniphier_pro5_data, + }, + { /* sentinel */ }, +}; + +static struct platform_driver uniphier_pcie_ep_driver = { + .probe = uniphier_pcie_ep_probe, + .driver = { + .name = "uniphier-pcie-ep", + .of_match_table = uniphier_pcie_ep_match, + .suppress_bind_attrs = true, + }, +}; +builtin_platform_driver(uniphier_pcie_ep_driver); From e49c7b2f6de7ff81ca34c56e4eeb4fa740c099f2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 10 Apr 2020 20:51:51 +0100 Subject: [PATCH 347/427] afs: Build an abstraction around an "operation" concept Turn the afs_operation struct into the main way that most fileserver operations are managed. 
Various things are added to the struct, including the following: (1) All the parameters and results of the relevant operations are moved into it, removing corresponding fields from the afs_call struct. afs_call gets a pointer to the op. (2) The target volume is made the main focus of the operation, rather than the target vnode(s), and a bunch of op->vnode->volume are made op->volume instead. (3) Two vnode records are defined (op->file[]) for the vnode(s) involved in most operations. The vnode record (struct afs_vnode_param) contains: - The vnode pointer. - The fid of the vnode to be included in the parameters or that was returned in the reply (e.g. FS.MakeDir). - The status and callback information that may be returned in the reply about the vnode. - Callback break and data version tracking for detecting simultaneous third-party changes. (4) Pointers to dentries to be updated with new inodes. (5) An operations table pointer. The table includes pointers to functions for issuing AFS and YFS-variant RPCs, handling the success and abort of an operation and handling post-I/O-lock local editing of a directory. To make this work, the following function restructuring is made: (A) The rotation loop that issues calls to fileservers that can be found in each function that wants to issue an RPC (such as afs_mkdir()) is extracted out into common code, in a new file called fs_operation.c. (B) The rotation loops, such as the one in afs_mkdir(), are replaced with a much smaller piece of code that allocates an operation, sets the parameters and then calls out to the common code to do the actual work. (C) The code for handling the success and failure of an operation is moved into operation functions (as (5) above) and these are called from the core code at appropriate times. (D) The pseudo inode getting stuff used by the dynamic root code is moved over into dynroot.c.
(E) struct afs_iget_data is absorbed into the operation struct and afs_iget() expects to be given an op pointer and a vnode record. (F) Point (E) doesn't work for the root dir of a volume, but we know the FID in advance (it's always vnode 1, unique 1), so a separate inode getter, afs_root_iget(), is provided to special-case that. (G) The inode status init/update functions now also take an op and a vnode record. (H) The RPC marshalling functions now, for the most part, just take an afs_operation struct as their only argument. All the data they need is held there. The result delivery functions write their answers there as well. (I) The call is attached to the operation and then the operation core does the waiting. And then the new operation code is, for the moment, made to just initialise the operation, get the appropriate vnode I/O locks and do the same rotation loop as before. This lays the foundation for the following changes in the future: (*) Overhauling the rotation (again). (*) Support for asynchronous I/O, where the fileserver rotation must be done asynchronously also. 
Signed-off-by: David Howells --- fs/afs/Makefile | 1 + fs/afs/afs.h | 1 - fs/afs/callback.c | 7 +- fs/afs/dir.c | 1215 ++++++++++++++++-------------------- fs/afs/dir_silly.c | 194 +++--- fs/afs/dynroot.c | 93 +++ fs/afs/file.c | 62 +- fs/afs/flock.c | 114 ++-- fs/afs/fs_operation.c | 246 ++++++++ fs/afs/fsclient.c | 1213 +++++++++++++++-------------------- fs/afs/inode.c | 448 ++++++------- fs/afs/internal.h | 313 ++++++---- fs/afs/rotate.c | 128 +--- fs/afs/rxrpc.c | 28 +- fs/afs/server.c | 37 +- fs/afs/super.c | 77 +-- fs/afs/volume.c | 12 +- fs/afs/write.c | 156 +++-- fs/afs/xattr.c | 300 ++++----- fs/afs/yfsclient.c | 842 +++++++++++-------------- include/trace/events/afs.h | 19 +- 21 files changed, 2661 insertions(+), 2845 deletions(-) create mode 100644 fs/afs/fs_operation.c diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 10359bea7070..924f02e9d7e7 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -18,6 +18,7 @@ kafs-y := \ file.o \ flock.o \ fsclient.o \ + fs_operation.o \ fs_probe.o \ inode.o \ main.o \ diff --git a/fs/afs/afs.h b/fs/afs/afs.h index b6d49d646ade..f8e34406243e 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -146,7 +146,6 @@ struct afs_file_status { struct afs_status_cb { struct afs_file_status status; struct afs_callback callback; - unsigned int cb_break; /* Pre-op callback break counter */ bool have_status; /* True if status record was retrieved */ bool have_cb; /* True if cb record was retrieved */ bool have_error; /* True if status.abort_code indicates an error */ diff --git a/fs/afs/callback.c b/fs/afs/callback.c index b16781e1683e..282dbac84435 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -46,7 +46,6 @@ static struct afs_cb_interest *afs_create_interest(struct afs_server *server, refcount_set(&new->usage, 1); new->sb = vnode->vfs_inode.i_sb; - new->vid = vnode->volume->vid; new->server = afs_get_server(server, afs_server_trace_get_new_cbi); INIT_HLIST_NODE(&new->cb_vlink); @@ -286,7 +285,6 @@ static void 
afs_break_one_callback(struct afs_server *server, struct afs_vol_interest *vi) { struct afs_cb_interest *cbi; - struct afs_iget_data data; struct afs_vnode *vnode; struct inode *inode; @@ -305,15 +303,12 @@ static void afs_break_one_callback(struct afs_server *server, afs_cb_break_for_volume_callback, false); write_unlock(&volume->cb_v_break_lock); } else { - data.volume = NULL; - data.fid = *fid; - /* See if we can find a matching inode - even an I_NEW * inode needs to be marked as it can have its callback * broken before we finish setting up the local inode. */ inode = find_inode_rcu(cbi->sb, fid->vnode, - afs_iget5_test, &data); + afs_ilookup5_test_by_fid, fid); if (inode) { vnode = AFS_FS_I(inode); afs_break_callback(vnode, afs_cb_break_for_callback); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index ff421db40cf2..0d3cf3af0352 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -99,8 +99,6 @@ struct afs_lookup_cookie { bool found; bool one_only; unsigned short nr_fids; - struct inode **inodes; - struct afs_status_cb *statuses; struct afs_fid fids[50]; }; @@ -618,8 +616,8 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name, } } else if (cookie->name.len == nlen && memcmp(cookie->name.name, name, nlen) == 0) { - cookie->fids[0].vnode = ino; - cookie->fids[0].unique = dtype; + cookie->fids[1].vnode = ino; + cookie->fids[1].unique = dtype; cookie->found = 1; if (cookie->one_only) return -1; @@ -630,6 +628,80 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name, return ret; } +/* + * Deal with the result of a successful lookup operation. Turn all the files + * into inodes and save the first one - which is the one we actually want. 
+ */ +static void afs_do_lookup_success(struct afs_operation *op) +{ + struct afs_vnode_param *vp; + struct afs_vnode *vnode; + struct inode *inode; + u32 abort_code; + int i; + + _enter(""); + + for (i = 0; i < op->nr_files; i++) { + switch (i) { + case 0: + vp = &op->file[0]; + abort_code = vp->scb.status.abort_code; + if (abort_code != 0) { + op->abort_code = abort_code; + op->error = afs_abort_to_error(abort_code); + } + break; + + case 1: + vp = &op->file[1]; + break; + + default: + vp = &op->more_files[i - 2]; + break; + } + + if (!vp->scb.have_status && !vp->scb.have_error) + continue; + + _debug("do [%u]", i); + if (vp->vnode) { + if (!test_bit(AFS_VNODE_UNSET, &vp->vnode->flags)) + afs_vnode_commit_status(op, vp); + } else if (vp->scb.status.abort_code == 0) { + inode = afs_iget(op, vp); + if (!IS_ERR(inode)) { + vnode = AFS_FS_I(inode); + afs_cache_permit(vnode, op->key, + 0 /* Assume vnode->cb_break is 0 */ + + op->cb_v_break, + &vp->scb); + vp->vnode = vnode; + vp->put_vnode = true; + } + } else { + _debug("- abort %d %llx:%llx.%x", + vp->scb.status.abort_code, + vp->fid.vid, vp->fid.vnode, vp->fid.unique); + } + } + + _leave(""); +} + +static const struct afs_operation_ops afs_inline_bulk_status_operation = { + .issue_afs_rpc = afs_fs_inline_bulk_status, + .issue_yfs_rpc = yfs_fs_inline_bulk_status, + .success = afs_do_lookup_success, +}; + +static const struct afs_operation_ops afs_fetch_status_operation = { + .issue_afs_rpc = afs_fs_fetch_status, + .issue_yfs_rpc = yfs_fs_fetch_status, + .success = afs_do_lookup_success, +}; + /* * Do a lookup in a directory. We make use of bulk lookup to query a slew of * files in one go and create inodes for them. 
The inode of the file we were @@ -639,16 +711,15 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, struct key *key) { struct afs_lookup_cookie *cookie; - struct afs_cb_interest *dcbi, *cbi = NULL; - struct afs_super_info *as = dir->i_sb->s_fs_info; - struct afs_status_cb *scb; - struct afs_iget_data iget_data; - struct afs_operation fc; + struct afs_cb_interest *dcbi; + struct afs_vnode_param *vp; + struct afs_operation *op; struct afs_server *server; struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct inode *inode = NULL, *ti; afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version); - int ret, i; + long ret; + int i; _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); @@ -656,6 +727,8 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, if (!cookie) return ERR_PTR(-ENOMEM); + for (i = 0; i < ARRAY_SIZE(cookie->fids); i++) + cookie->fids[i].vid = dvnode->fid.vid; cookie->ctx.actor = afs_lookup_filldir; cookie->name = dentry->d_name; cookie->nr_fids = 2; /* slot 0 is saved for the fid we actually want @@ -672,61 +745,64 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, } read_sequnlock_excl(&dvnode->cb_lock); - for (i = 0; i < 50; i++) - cookie->fids[i].vid = as->volume->vid; - /* search the directory */ ret = afs_dir_iterate(dir, &cookie->ctx, key, &data_version); - if (ret < 0) { - inode = ERR_PTR(ret); + if (ret < 0) goto out; - } dentry->d_fsdata = (void *)(unsigned long)data_version; - inode = ERR_PTR(-ENOENT); + ret = -ENOENT; if (!cookie->found) goto out; /* Check to see if we already have an inode for the primary fid. 
*/ - iget_data.fid = cookie->fids[0]; - iget_data.volume = dvnode->volume; - iget_data.cb_v_break = dvnode->volume->cb_v_break; - iget_data.cb_s_break = 0; - inode = ilookup5(dir->i_sb, cookie->fids[0].vnode, - afs_iget5_test, &iget_data); + inode = ilookup5(dir->i_sb, cookie->fids[1].vnode, + afs_ilookup5_test_by_fid, &cookie->fids[1]); if (inode) + goto out; /* We do */ + + /* Okay, we didn't find it. We need to query the server - and whilst + * we're doing that, we're going to attempt to look up a bunch of other + * vnodes also. + */ + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) { + ret = PTR_ERR(op); goto out; + } + + afs_op_set_vnode(op, 0, dvnode); + afs_op_set_fid(op, 1, &cookie->fids[1]); + + op->nr_files = cookie->nr_fids; + _debug("nr_files %u", op->nr_files); /* Need space for examining all the selected files */ - inode = ERR_PTR(-ENOMEM); - cookie->statuses = kvcalloc(cookie->nr_fids, sizeof(struct afs_status_cb), - GFP_KERNEL); - if (!cookie->statuses) - goto out; + op->error = -ENOMEM; + if (op->nr_files > 2) { + op->more_files = kvcalloc(op->nr_files - 2, + sizeof(struct afs_vnode_param), + GFP_KERNEL); + if (!op->more_files) + goto out_op; - cookie->inodes = kcalloc(cookie->nr_fids, sizeof(struct inode *), - GFP_KERNEL); - if (!cookie->inodes) - goto out_s; + for (i = 2; i < op->nr_files; i++) { + vp = &op->more_files[i - 2]; + vp->fid = cookie->fids[i]; - cookie->fids[1] = dvnode->fid; - cookie->statuses[1].cb_break = afs_calc_vnode_cb_break(dvnode); - cookie->inodes[1] = igrab(&dvnode->vfs_inode); - - for (i = 2; i < cookie->nr_fids; i++) { - scb = &cookie->statuses[i]; - - /* Find any inodes that already exist and get their - * callback counters. 
- */ - iget_data.fid = cookie->fids[i]; - ti = ilookup5_nowait(dir->i_sb, iget_data.fid.vnode, - afs_iget5_test, &iget_data); - if (!IS_ERR_OR_NULL(ti)) { - vnode = AFS_FS_I(ti); - scb->cb_break = afs_calc_vnode_cb_break(vnode); - cookie->inodes[i] = ti; + /* Find any inodes that already exist and get their + * callback counters. + */ + ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode, + afs_ilookup5_test_by_fid, &vp->fid); + if (!IS_ERR_OR_NULL(ti)) { + vnode = AFS_FS_I(ti); + vp->dv_before = vnode->status.data_version; + vp->cb_break_before = afs_calc_vnode_cb_break(vnode); + vp->vnode = vnode; + vp->put_vnode = true; + } } } @@ -734,120 +810,40 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, * lookups contained therein are stored in the reply without aborting * the whole operation. */ - if (cookie->one_only) - goto no_inline_bulk_status; - - inode = ERR_PTR(-ERESTARTSYS); - if (afs_begin_vnode_operation(&fc, dvnode, key, true)) { - while (afs_select_fileserver(&fc)) { - if (test_bit(AFS_SERVER_FL_NO_IBULK, - &fc.cbi->server->flags)) { - fc.ac.abort_code = RX_INVALID_OPERATION; - fc.ac.error = -ECONNABORTED; - break; - } - iget_data.cb_v_break = dvnode->volume->cb_v_break; - iget_data.cb_s_break = fc.cbi->server->cb_s_break; - afs_fs_inline_bulk_status(&fc, - afs_v2net(dvnode), - cookie->fids, - cookie->statuses, - cookie->nr_fids, NULL); - } - - if (fc.ac.error == 0) - cbi = afs_get_cb_interest(fc.cbi); - if (fc.ac.abort_code == RX_INVALID_OPERATION) - set_bit(AFS_SERVER_FL_NO_IBULK, &fc.cbi->server->flags); - inode = ERR_PTR(afs_end_vnode_operation(&fc)); + op->error = -ENOTSUPP; + if (!cookie->one_only) { + op->ops = &afs_inline_bulk_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); } - if (!IS_ERR(inode)) - goto success; - if (fc.ac.abort_code != RX_INVALID_OPERATION) - goto out_c; + if (op->error == -ENOTSUPP) { + /* We could try FS.BulkStatus next, but this aborts the entire + * op if any of the 
lookups fails - so, for the moment, revert + * to FS.FetchStatus for op->file[1]. + */ + op->fetch_status.which = 1; + op->ops = &afs_fetch_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } + inode = ERR_PTR(op->error); -no_inline_bulk_status: - /* We could try FS.BulkStatus next, but this aborts the entire op if - * any of the lookups fails - so, for the moment, revert to - * FS.FetchStatus for just the primary fid. - */ - inode = ERR_PTR(-ERESTARTSYS); - if (afs_begin_vnode_operation(&fc, dvnode, key, true)) { - while (afs_select_fileserver(&fc)) { - iget_data.cb_v_break = dvnode->volume->cb_v_break; - iget_data.cb_s_break = fc.cbi->server->cb_s_break; - scb = &cookie->statuses[0]; - afs_fs_fetch_status(&fc, - afs_v2net(dvnode), - cookie->fids, - scb, - NULL); - } - - if (fc.ac.error == 0) - cbi = afs_get_cb_interest(fc.cbi); - inode = ERR_PTR(afs_end_vnode_operation(&fc)); +out_op: + if (op->error == 0) { + inode = &op->file[1].vnode->vfs_inode; + op->file[1].vnode = NULL; } - if (IS_ERR(inode)) - goto out_c; - -success: - /* Turn all the files into inodes and save the first one - which is the - * one we actually want. 
- */ - scb = &cookie->statuses[0]; - if (scb->status.abort_code != 0) - inode = ERR_PTR(afs_abort_to_error(scb->status.abort_code)); - - for (i = 0; i < cookie->nr_fids; i++) { - struct afs_status_cb *scb = &cookie->statuses[i]; - - if (!scb->have_status && !scb->have_error) - continue; - - if (cookie->inodes[i]) { - struct afs_vnode *iv = AFS_FS_I(cookie->inodes[i]); - - if (test_bit(AFS_VNODE_UNSET, &iv->flags)) - continue; - - afs_vnode_commit_status(&fc, iv, - scb->cb_break, NULL, scb); - continue; - } - - if (scb->status.abort_code != 0) - continue; - - iget_data.fid = cookie->fids[i]; - ti = afs_iget(dir->i_sb, key, &iget_data, scb, cbi, dvnode); - if (!IS_ERR(ti)) - afs_cache_permit(AFS_FS_I(ti), key, - 0 /* Assume vnode->cb_break is 0 */ + - iget_data.cb_v_break, - scb); - if (i == 0) { - inode = ti; - } else { - if (!IS_ERR(ti)) - iput(ti); - } - } - -out_c: - afs_put_cb_interest(afs_v2net(dvnode), cbi); - if (cookie->inodes) { - for (i = 0; i < cookie->nr_fids; i++) - iput(cookie->inodes[i]); - kfree(cookie->inodes); - } -out_s: - kvfree(cookie->statuses); + if (op->file[0].scb.have_status) + dentry->d_fsdata = (void *)(unsigned long)op->file[0].scb.status.data_version; + else + dentry->d_fsdata = (void *)(unsigned long)op->file[0].dv_before; + ret = afs_put_operation(op); out: kfree(cookie); - return inode; + _leave(""); + return inode ?: ERR_PTR(ret); } /* @@ -963,6 +959,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, if (!IS_ERR_OR_NULL(inode)) fid = AFS_FS_I(inode)->fid; + _debug("splice %px", dentry->d_inode); d = d_splice_alias(inode, dentry); if (!IS_ERR_OR_NULL(d)) { d->d_fsdata = dentry->d_fsdata; @@ -970,6 +967,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, } else { trace_afs_lookup(dvnode, &dentry->d_name, &fid); } + _leave(""); return d; } @@ -1220,130 +1218,97 @@ void afs_d_release(struct dentry *dentry) /* * Create a new inode for create/mkdir/symlink */ -static void 
afs_vnode_new_inode(struct afs_operation *fc, - struct dentry *new_dentry, - struct afs_iget_data *new_data, - struct afs_status_cb *new_scb) +static void afs_vnode_new_inode(struct afs_operation *op) { + struct afs_vnode_param *vp = &op->file[1]; struct afs_vnode *vnode; struct inode *inode; - if (fc->ac.error < 0) - return; + _enter(""); - inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key, - new_data, new_scb, fc->cbi, fc->vnode); + ASSERTCMP(op->error, ==, 0); + + inode = afs_iget(op, vp); if (IS_ERR(inode)) { /* ENOMEM or EINTR at a really inconvenient time - just abandon * the new directory on the server. */ - fc->ac.error = PTR_ERR(inode); + op->error = PTR_ERR(inode); return; } vnode = AFS_FS_I(inode); set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - if (fc->ac.error == 0) - afs_cache_permit(vnode, fc->key, vnode->cb_break, new_scb); - d_instantiate(new_dentry, inode); + if (!op->error) + afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb); + d_instantiate(op->dentry, inode); } -static void afs_prep_for_new_inode(struct afs_operation *fc, - struct afs_iget_data *iget_data) +static void afs_create_success(struct afs_operation *op) { - iget_data->volume = fc->vnode->volume; - iget_data->cb_v_break = fc->vnode->volume->cb_v_break; - iget_data->cb_s_break = fc->cbi->server->cb_s_break; + _enter("op=%08x", op->debug_id); + afs_check_for_remote_deletion(op, op->file[0].vnode); + afs_vnode_commit_status(op, &op->file[0]); + afs_update_dentry_version(op, &op->file[0], op->dentry); + afs_vnode_new_inode(op); } -/* - * Note that a dentry got changed. We need to set d_fsdata to the data version - * number derived from the result of the operation. It doesn't matter if - * d_fsdata goes backwards as we'll just revalidate. 
- */ -static void afs_update_dentry_version(struct afs_operation *fc, - struct dentry *dentry, - struct afs_status_cb *scb) +static void afs_create_edit_dir(struct afs_operation *op) { - if (fc->ac.error == 0) - dentry->d_fsdata = - (void *)(unsigned long)scb->status.data_version; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + struct afs_vnode *dvnode = dvp->vnode; + + _enter("op=%08x", op->debug_id); + + down_write(&dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) + afs_edit_dir_add(dvnode, &op->dentry->d_name, &vp->fid, + op->create.reason); + up_write(&dvnode->validate_lock); } +static void afs_create_put(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + + if (op->error) + d_drop(op->dentry); +} + +static const struct afs_operation_ops afs_mkdir_operation = { + .issue_afs_rpc = afs_fs_make_dir, + .issue_yfs_rpc = yfs_fs_make_dir, + .success = afs_create_success, + .edit_dir = afs_create_edit_dir, + .put = afs_create_put, +}; + /* * create a directory on an AFS filesystem */ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct afs_iget_data iget_data; - struct afs_status_cb *scb; - struct afs_operation fc; + struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); - struct key *key; - afs_dataversion_t data_version; - int ret; - - mode |= S_IFDIR; _enter("{%llx:%llu},{%pd},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); - ret = -ENOMEM; - scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - goto error; - - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_scb; + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) { + d_drop(dentry); + return PTR_ERR(op); } - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, dvnode, key, true)) { - data_version = 
dvnode->status.data_version + 1; - - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(dvnode); - afs_prep_for_new_inode(&fc, &iget_data); - afs_fs_create(&fc, dentry->d_name.name, mode, - &scb[0], &iget_data.fid, &scb[1]); - } - - afs_check_for_remote_deletion(&fc, dvnode); - afs_vnode_commit_status(&fc, dvnode, fc.cb_break, - &data_version, &scb[0]); - afs_update_dentry_version(&fc, dentry, &scb[0]); - afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]); - ret = afs_end_vnode_operation(&fc); - if (ret < 0) - goto error_key; - } else { - goto error_key; - } - - if (ret == 0) { - down_write(&dvnode->validate_lock); - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && - dvnode->status.data_version == data_version) - afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid, - afs_edit_dir_for_create); - up_write(&dvnode->validate_lock); - } - - key_put(key); - kfree(scb); - _leave(" = 0"); - return 0; - -error_key: - key_put(key); -error_scb: - kfree(scb); -error: - d_drop(dentry); - _leave(" = %d", ret); - return ret; + afs_op_set_vnode(op, 0, dvnode); + op->file[0].dv_delta = 1; + op->dentry = dentry; + op->create.mode = S_IFDIR | mode; + op->create.reason = afs_edit_dir_for_mkdir; + op->ops = &afs_mkdir_operation; + return afs_do_sync_operation(op); } /* @@ -1361,76 +1326,86 @@ static void afs_dir_remove_subdir(struct dentry *dentry) } } +static void afs_rmdir_success(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + afs_check_for_remote_deletion(op, op->file[0].vnode); + afs_vnode_commit_status(op, &op->file[0]); + afs_update_dentry_version(op, &op->file[0], op->dentry); +} + +static void afs_rmdir_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode *dvnode = dvp->vnode; + + _enter("op=%08x", op->debug_id); + afs_dir_remove_subdir(op->dentry); + + down_write(&dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + dvnode->status.data_version 
== dvp->dv_before + dvp->dv_delta) + afs_edit_dir_remove(dvnode, &op->dentry->d_name, + afs_edit_dir_for_rmdir); + up_write(&dvnode->validate_lock); +} + +static void afs_rmdir_put(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + if (op->file[1].vnode) + up_write(&op->file[1].vnode->rmdir_lock); +} + +static const struct afs_operation_ops afs_rmdir_operation = { + .issue_afs_rpc = afs_fs_remove_dir, + .issue_yfs_rpc = yfs_fs_remove_dir, + .success = afs_rmdir_success, + .edit_dir = afs_rmdir_edit_dir, + .put = afs_rmdir_put, +}; + /* * remove a directory from an AFS filesystem */ static int afs_rmdir(struct inode *dir, struct dentry *dentry) { - struct afs_status_cb *scb; - struct afs_operation fc; + struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL; - struct key *key; - afs_dataversion_t data_version; int ret; _enter("{%llx:%llu},{%pd}", dvnode->fid.vid, dvnode->fid.vnode, dentry); - scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - return -ENOMEM; + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error; - } + afs_op_set_vnode(op, 0, dvnode); + op->file[0].dv_delta = 1; + + op->dentry = dentry; + op->ops = &afs_rmdir_operation; /* Try to make sure we have a callback promise on the victim. 
*/ if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); - ret = afs_validate(vnode, key); + ret = afs_validate(vnode, op->key); if (ret < 0) - goto error_key; + goto error; } if (vnode) { ret = down_write_killable(&vnode->rmdir_lock); if (ret < 0) - goto error_key; + goto error; + op->file[1].vnode = vnode; } - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, dvnode, key, true)) { - data_version = dvnode->status.data_version + 1; + return afs_do_sync_operation(op); - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(dvnode); - afs_fs_remove(&fc, vnode, dentry->d_name.name, true, scb); - } - - afs_vnode_commit_status(&fc, dvnode, fc.cb_break, - &data_version, scb); - afs_update_dentry_version(&fc, dentry, scb); - ret = afs_end_vnode_operation(&fc); - if (ret == 0) { - afs_dir_remove_subdir(dentry); - down_write(&dvnode->validate_lock); - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && - dvnode->status.data_version == data_version) - afs_edit_dir_remove(dvnode, &dentry->d_name, - afs_edit_dir_for_rmdir); - up_write(&dvnode->validate_lock); - } - } - - if (vnode) - up_write(&vnode->rmdir_lock); -error_key: - key_put(key); error: - kfree(scb); - return ret; + return afs_put_operation(op); } /* @@ -1443,52 +1418,90 @@ error: * However, if we didn't have a callback promise outstanding, or it was * outstanding on a different server, then it won't break it either... 
*/ -static int afs_dir_remove_link(struct afs_vnode *dvnode, struct dentry *dentry, - struct key *key) +static void afs_dir_remove_link(struct afs_operation *op) { - int ret = 0; + struct afs_vnode *dvnode = op->file[0].vnode; + struct afs_vnode *vnode = op->file[1].vnode; + struct dentry *dentry = op->dentry; + int ret; - if (d_really_is_positive(dentry)) { - struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + if (op->error != 0 || + (op->file[1].scb.have_status && op->file[1].scb.have_error)) + return; + if (d_really_is_positive(dentry)) + return; - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - /* Already done */ - } else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { - write_seqlock(&vnode->cb_lock); - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { - set_bit(AFS_VNODE_DELETED, &vnode->flags); - __afs_break_callback(vnode, afs_cb_break_for_unlink); - } - write_sequnlock(&vnode->cb_lock); - ret = 0; - } else { - afs_break_callback(vnode, afs_cb_break_for_unlink); - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - kdebug("AFS_VNODE_DELETED"); - - ret = afs_validate(vnode, key); - if (ret == -ESTALE) - ret = 0; + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + /* Already done */ + } else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { + write_seqlock(&vnode->cb_lock); + drop_nlink(&vnode->vfs_inode); + if (vnode->vfs_inode.i_nlink == 0) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + __afs_break_callback(vnode, afs_cb_break_for_unlink); } - _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret); + write_sequnlock(&vnode->cb_lock); + } else { + afs_break_callback(vnode, afs_cb_break_for_unlink); + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + _debug("AFS_VNODE_DELETED"); + + ret = afs_validate(vnode, op->key); + if (ret != -ESTALE) + op->error = ret; } - return ret; + _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, op->error); } +static void afs_unlink_success(struct afs_operation *op) +{ + 
_enter("op=%08x", op->debug_id); + afs_check_for_remote_deletion(op, op->file[0].vnode); + afs_vnode_commit_status(op, &op->file[0]); + afs_vnode_commit_status(op, &op->file[1]); + afs_update_dentry_version(op, &op->file[0], op->dentry); + afs_dir_remove_link(op); +} + +static void afs_unlink_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode *dvnode = dvp->vnode; + + _enter("op=%08x", op->debug_id); + down_write(&dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) + afs_edit_dir_remove(dvnode, &op->dentry->d_name, + afs_edit_dir_for_unlink); + up_write(&dvnode->validate_lock); +} + +static void afs_unlink_put(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + if (op->unlink.need_rehash && op->error < 0 && op->error != -ENOENT) + d_rehash(op->dentry); +} + +static const struct afs_operation_ops afs_unlink_operation = { + .issue_afs_rpc = afs_fs_remove_file, + .issue_yfs_rpc = yfs_fs_remove_file, + .success = afs_unlink_success, + .edit_dir = afs_unlink_edit_dir, + .put = afs_unlink_put, +}; + /* * Remove a file or symlink from an AFS filesystem. 
*/ static int afs_unlink(struct inode *dir, struct dentry *dentry) { - struct afs_operation fc; - struct afs_status_cb *scb; + struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - struct key *key; - bool need_rehash = false; int ret; _enter("{%llx:%llu},{%pd}", @@ -1497,269 +1510,176 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) if (dentry->d_name.len >= AFSNAMEMAX) return -ENAMETOOLONG; - ret = -ENOMEM; - scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - goto error; + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_scb; - } + afs_op_set_vnode(op, 0, dvnode); + op->file[0].dv_delta = 1; /* Try to make sure we have a callback promise on the victim. */ - ret = afs_validate(vnode, key); - if (ret < 0) - goto error_key; + ret = afs_validate(vnode, op->key); + if (ret < 0) { + op->error = ret; + goto error; + } spin_lock(&dentry->d_lock); if (d_count(dentry) > 1) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(d_inode(dentry), 0); - ret = afs_sillyrename(dvnode, vnode, dentry, key); - goto error_key; + op->error = afs_sillyrename(dvnode, vnode, dentry, op->key); + goto error; } if (!d_unhashed(dentry)) { /* Prevent a race with RCU lookup. 
*/ __d_drop(dentry); - need_rehash = true; + op->unlink.need_rehash = true; } spin_unlock(&dentry->d_lock); - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, dvnode, key, true)) { - afs_dataversion_t data_version = dvnode->status.data_version + 1; - afs_dataversion_t data_version_2 = vnode->status.data_version; + op->file[1].vnode = vnode; + op->dentry = dentry; + op->ops = &afs_unlink_operation; + return afs_do_sync_operation(op); - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(dvnode); - fc.cb_break_2 = afs_calc_vnode_cb_break(vnode); - - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc.cbi->server->flags) && - !test_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags)) { - yfs_fs_remove_file2(&fc, vnode, dentry->d_name.name, - &scb[0], &scb[1]); - if (fc.ac.error != -ECONNABORTED || - fc.ac.abort_code != RXGEN_OPCODE) - continue; - set_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags); - } - - afs_fs_remove(&fc, vnode, dentry->d_name.name, false, &scb[0]); - } - - afs_vnode_commit_status(&fc, dvnode, fc.cb_break, - &data_version, &scb[0]); - afs_vnode_commit_status(&fc, vnode, fc.cb_break_2, - &data_version_2, &scb[1]); - afs_update_dentry_version(&fc, dentry, &scb[0]); - ret = afs_end_vnode_operation(&fc); - if (ret == 0 && !(scb[1].have_status || scb[1].have_error)) - ret = afs_dir_remove_link(dvnode, dentry, key); - - if (ret == 0) { - down_write(&dvnode->validate_lock); - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && - dvnode->status.data_version == data_version) - afs_edit_dir_remove(dvnode, &dentry->d_name, - afs_edit_dir_for_unlink); - up_write(&dvnode->validate_lock); - } - } - - if (need_rehash && ret < 0 && ret != -ENOENT) - d_rehash(dentry); - -error_key: - key_put(key); -error_scb: - kfree(scb); error: - _leave(" = %d", ret); - return ret; + return afs_put_operation(op); } +static const struct afs_operation_ops afs_create_operation = { + .issue_afs_rpc = afs_fs_create_file, + .issue_yfs_rpc = yfs_fs_create_file, + 
.success = afs_create_success, + .edit_dir = afs_create_edit_dir, + .put = afs_create_put, +}; + /* * create a regular file on an AFS filesystem */ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct afs_iget_data iget_data; - struct afs_operation fc; - struct afs_status_cb *scb; + struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); - struct key *key; - afs_dataversion_t data_version; - int ret; + int ret = -ENAMETOOLONG; - mode |= S_IFREG; - - _enter("{%llx:%llu},{%pd},%ho,", + _enter("{%llx:%llu},{%pd},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); - ret = -ENAMETOOLONG; if (dentry->d_name.len >= AFSNAMEMAX) goto error; - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) { + ret = PTR_ERR(op); goto error; } - ret = -ENOMEM; - scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - goto error_scb; + afs_op_set_vnode(op, 0, dvnode); + op->file[0].dv_delta = 1; - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, dvnode, key, true)) { - data_version = dvnode->status.data_version + 1; + op->dentry = dentry; + op->create.mode = S_IFREG | mode; + op->create.reason = afs_edit_dir_for_create; + op->ops = &afs_create_operation; + return afs_do_sync_operation(op); - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(dvnode); - afs_prep_for_new_inode(&fc, &iget_data); - afs_fs_create(&fc, dentry->d_name.name, mode, - &scb[0], &iget_data.fid, &scb[1]); - } - - afs_check_for_remote_deletion(&fc, dvnode); - afs_vnode_commit_status(&fc, dvnode, fc.cb_break, - &data_version, &scb[0]); - afs_update_dentry_version(&fc, dentry, &scb[0]); - afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]); - ret = afs_end_vnode_operation(&fc); - if (ret < 0) - goto error_key; - } else { - goto error_key; - } - - down_write(&dvnode->validate_lock); - if 
(test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && - dvnode->status.data_version == data_version) - afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid, - afs_edit_dir_for_create); - up_write(&dvnode->validate_lock); - - kfree(scb); - key_put(key); - _leave(" = 0"); - return 0; - -error_scb: - kfree(scb); -error_key: - key_put(key); error: d_drop(dentry); _leave(" = %d", ret); return ret; } +static void afs_link_success(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + + _enter("op=%08x", op->debug_id); + afs_vnode_commit_status(op, dvp); + afs_vnode_commit_status(op, vp); + afs_update_dentry_version(op, dvp, op->dentry); + if (op->dentry_2->d_parent == op->dentry->d_parent) + afs_update_dentry_version(op, dvp, op->dentry_2); + ihold(&vp->vnode->vfs_inode); + d_instantiate(op->dentry, &vp->vnode->vfs_inode); +} + +static void afs_link_put(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + if (op->error) + d_drop(op->dentry); +} + +static const struct afs_operation_ops afs_link_operation = { + .issue_afs_rpc = afs_fs_link, + .issue_yfs_rpc = yfs_fs_link, + .success = afs_link_success, + .edit_dir = afs_create_edit_dir, + .put = afs_link_put, +}; + /* * create a hard link between files in an AFS filesystem */ static int afs_link(struct dentry *from, struct inode *dir, struct dentry *dentry) { - struct afs_operation fc; - struct afs_status_cb *scb; + struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_vnode *vnode = AFS_FS_I(d_inode(from)); - struct key *key; - afs_dataversion_t data_version; - int ret; + int ret = -ENAMETOOLONG; _enter("{%llx:%llu},{%llx:%llu},{%pd}", vnode->fid.vid, vnode->fid.vnode, dvnode->fid.vid, dvnode->fid.vnode, dentry); - ret = -ENAMETOOLONG; if (dentry->d_name.len >= AFSNAMEMAX) goto error; - ret = -ENOMEM; - scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) + op = afs_alloc_operation(NULL, 
dvnode->volume); + if (IS_ERR(op)) { + ret = PTR_ERR(op); goto error; - - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_scb; } - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, dvnode, key, true)) { - data_version = dvnode->status.data_version + 1; + afs_op_set_vnode(op, 0, dvnode); + afs_op_set_vnode(op, 1, vnode); + op->file[0].dv_delta = 1; - if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) { - afs_end_vnode_operation(&fc); - goto error_key; - } + op->dentry = dentry; + op->dentry_2 = from; + op->ops = &afs_link_operation; + op->create.reason = afs_edit_dir_for_link; + return afs_do_sync_operation(op); - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(dvnode); - fc.cb_break_2 = afs_calc_vnode_cb_break(vnode); - afs_fs_link(&fc, vnode, dentry->d_name.name, - &scb[0], &scb[1]); - } - - afs_vnode_commit_status(&fc, dvnode, fc.cb_break, - &data_version, &scb[0]); - afs_vnode_commit_status(&fc, vnode, fc.cb_break_2, - NULL, &scb[1]); - ihold(&vnode->vfs_inode); - afs_update_dentry_version(&fc, dentry, &scb[0]); - d_instantiate(dentry, &vnode->vfs_inode); - - mutex_unlock(&vnode->io_lock); - ret = afs_end_vnode_operation(&fc); - if (ret < 0) - goto error_key; - } else { - goto error_key; - } - - down_write(&dvnode->validate_lock); - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && - dvnode->status.data_version == data_version) - afs_edit_dir_add(dvnode, &dentry->d_name, &vnode->fid, - afs_edit_dir_for_link); - up_write(&dvnode->validate_lock); - - key_put(key); - kfree(scb); - _leave(" = 0"); - return 0; - -error_key: - key_put(key); -error_scb: - kfree(scb); error: d_drop(dentry); _leave(" = %d", ret); return ret; } +static const struct afs_operation_ops afs_symlink_operation = { + .issue_afs_rpc = afs_fs_symlink, + .issue_yfs_rpc = yfs_fs_symlink, + .success = afs_create_success, + .edit_dir = afs_create_edit_dir, + .put = afs_create_put, +}; + /* * create a 
symlink in an AFS filesystem */ static int afs_symlink(struct inode *dir, struct dentry *dentry, const char *content) { - struct afs_iget_data iget_data; - struct afs_operation fc; - struct afs_status_cb *scb; + struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); - struct key *key; - afs_dataversion_t data_version; int ret; _enter("{%llx:%llu},{%pd},%s", @@ -1774,62 +1694,115 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, if (strlen(content) >= AFSPATHMAX) goto error; - ret = -ENOMEM; - scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) { + ret = PTR_ERR(op); goto error; - - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_scb; } - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, dvnode, key, true)) { - data_version = dvnode->status.data_version + 1; + afs_op_set_vnode(op, 0, dvnode); + op->file[0].dv_delta = 1; - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(dvnode); - afs_prep_for_new_inode(&fc, &iget_data); - afs_fs_symlink(&fc, dentry->d_name.name, content, - &scb[0], &iget_data.fid, &scb[1]); - } + op->dentry = dentry; + op->ops = &afs_symlink_operation; + op->create.reason = afs_edit_dir_for_symlink; + op->create.symlink = content; + return afs_do_sync_operation(op); - afs_check_for_remote_deletion(&fc, dvnode); - afs_vnode_commit_status(&fc, dvnode, fc.cb_break, - &data_version, &scb[0]); - afs_update_dentry_version(&fc, dentry, &scb[0]); - afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]); - ret = afs_end_vnode_operation(&fc); - if (ret < 0) - goto error_key; - } else { - goto error_key; - } - - down_write(&dvnode->validate_lock); - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && - dvnode->status.data_version == data_version) - afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid, - afs_edit_dir_for_symlink); - 
up_write(&dvnode->validate_lock); - - key_put(key); - kfree(scb); - _leave(" = 0"); - return 0; - -error_key: - key_put(key); -error_scb: - kfree(scb); error: d_drop(dentry); _leave(" = %d", ret); return ret; } +static void afs_rename_success(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + + afs_vnode_commit_status(op, &op->file[0]); + if (op->file[1].vnode != op->file[0].vnode) + afs_vnode_commit_status(op, &op->file[1]); +} + +static void afs_rename_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode *orig_dvnode = orig_dvp->vnode; + struct afs_vnode *new_dvnode = new_dvp->vnode; + struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry)); + struct dentry *old_dentry = op->dentry; + struct dentry *new_dentry = op->dentry_2; + struct inode *new_inode; + + _enter("op=%08x", op->debug_id); + + if (op->rename.rehash) { + d_rehash(op->rename.rehash); + op->rename.rehash = NULL; + } + + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) + afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name, + afs_edit_dir_for_rename_0); + + if (new_dvnode != orig_dvnode) { + up_write(&orig_dvnode->validate_lock); + down_write(&new_dvnode->validate_lock); + } + + if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) && + new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta) { + if (!op->rename.new_negative) + afs_edit_dir_remove(new_dvnode, &new_dentry->d_name, + afs_edit_dir_for_rename_1); + + afs_edit_dir_add(new_dvnode, &new_dentry->d_name, + &vnode->fid, afs_edit_dir_for_rename_2); + } + + new_inode = d_inode(new_dentry); + if (new_inode) { + spin_lock(&new_inode->i_lock); + if (new_inode->i_nlink > 0) + drop_nlink(new_inode); + spin_unlock(&new_inode->i_lock); + } + + /* Now we can update d_fsdata on the dentries 
to reflect their + * new parent's data_version. + * + * Note that if we ever implement RENAME_EXCHANGE, we'll have + * to update both dentries with opposing dir versions. + */ + afs_update_dentry_version(op, new_dvp, op->dentry); + afs_update_dentry_version(op, new_dvp, op->dentry_2); + + d_move(old_dentry, new_dentry); + + up_write(&new_dvnode->validate_lock); +} + +static void afs_rename_put(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + if (op->rename.rehash) + d_rehash(op->rename.rehash); + dput(op->rename.tmp); + if (op->error) + d_rehash(op->dentry); +} + +static const struct afs_operation_ops afs_rename_operation = { + .issue_afs_rpc = afs_fs_rename, + .issue_yfs_rpc = yfs_fs_rename, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; + /* * rename a file in an AFS filesystem and/or move it between directories */ @@ -1837,15 +1810,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct afs_operation fc; - struct afs_status_cb *scb; + struct afs_operation *op; struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; - struct dentry *tmp = NULL, *rehash = NULL; - struct inode *new_inode; - struct key *key; - afs_dataversion_t orig_data_version; - afs_dataversion_t new_data_version; - bool new_negative = d_is_negative(new_dentry); int ret; if (flags) @@ -1865,16 +1831,19 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dvnode->fid.vid, new_dvnode->fid.vnode, new_dentry); - ret = -ENOMEM; - scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - goto error; + op = afs_alloc_operation(NULL, orig_dvnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); - key = afs_request_key(orig_dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_scb; - } + afs_op_set_vnode(op, 0, orig_dvnode); + afs_op_set_vnode(op, 1, new_dvnode); /* May be same 
as orig_dvnode */ + op->file[0].dv_delta = 1; + op->file[1].dv_delta = 1; + + op->dentry = old_dentry; + op->dentry_2 = new_dentry; + op->rename.new_negative = d_is_negative(new_dentry); + op->ops = &afs_rename_operation; /* For non-directories, check whether the target is busy and if so, * make a copy of the dentry and then do a silly-rename. If the @@ -1887,26 +1856,26 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, */ if (!d_unhashed(new_dentry)) { d_drop(new_dentry); - rehash = new_dentry; + op->rename.rehash = new_dentry; } if (d_count(new_dentry) > 2) { /* copy the target dentry's name */ ret = -ENOMEM; - tmp = d_alloc(new_dentry->d_parent, - &new_dentry->d_name); - if (!tmp) - goto error_rehash; + op->rename.tmp = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!op->rename.tmp) + goto error; ret = afs_sillyrename(new_dvnode, AFS_FS_I(d_inode(new_dentry)), - new_dentry, key); + new_dentry, op->key); if (ret) - goto error_rehash; + goto error; - new_dentry = tmp; - rehash = NULL; - new_negative = true; + op->dentry_2 = op->rename.tmp; + op->rename.rehash = NULL; + op->rename.new_negative = true; } } @@ -1921,98 +1890,10 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, */ d_drop(old_dentry); - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, orig_dvnode, key, true)) { - orig_data_version = orig_dvnode->status.data_version + 1; + return afs_do_sync_operation(op); - if (orig_dvnode != new_dvnode) { - if (mutex_lock_interruptible_nested(&new_dvnode->io_lock, 1) < 0) { - afs_end_vnode_operation(&fc); - goto error_rehash_old; - } - new_data_version = new_dvnode->status.data_version + 1; - } else { - new_data_version = orig_data_version; - } - - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(orig_dvnode); - fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode); - afs_fs_rename(&fc, old_dentry->d_name.name, - new_dvnode, new_dentry->d_name.name, - &scb[0], &scb[1]); - 
} - - afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break, - &orig_data_version, &scb[0]); - if (new_dvnode != orig_dvnode) { - afs_vnode_commit_status(&fc, new_dvnode, fc.cb_break_2, - &new_data_version, &scb[1]); - mutex_unlock(&new_dvnode->io_lock); - } - ret = afs_end_vnode_operation(&fc); - if (ret < 0) - goto error_rehash_old; - } - - if (ret == 0) { - if (rehash) - d_rehash(rehash); - down_write(&orig_dvnode->validate_lock); - if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && - orig_dvnode->status.data_version == orig_data_version) - afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name, - afs_edit_dir_for_rename_0); - if (orig_dvnode != new_dvnode) { - up_write(&orig_dvnode->validate_lock); - - down_write(&new_dvnode->validate_lock); - } - if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) && - orig_dvnode->status.data_version == new_data_version) { - if (!new_negative) - afs_edit_dir_remove(new_dvnode, &new_dentry->d_name, - afs_edit_dir_for_rename_1); - - afs_edit_dir_add(new_dvnode, &new_dentry->d_name, - &vnode->fid, afs_edit_dir_for_rename_2); - } - - new_inode = d_inode(new_dentry); - if (new_inode) { - spin_lock(&new_inode->i_lock); - if (new_inode->i_nlink > 0) - drop_nlink(new_inode); - spin_unlock(&new_inode->i_lock); - } - - /* Now we can update d_fsdata on the dentries to reflect their - * new parent's data_version. - * - * Note that if we ever implement RENAME_EXCHANGE, we'll have - * to update both dentries with opposing dir versions. 
- */ - afs_update_dentry_version(&fc, old_dentry, &scb[1]); - afs_update_dentry_version(&fc, new_dentry, &scb[1]); - d_move(old_dentry, new_dentry); - up_write(&new_dvnode->validate_lock); - goto error_tmp; - } - -error_rehash_old: - d_rehash(new_dentry); -error_rehash: - if (rehash) - d_rehash(rehash); -error_tmp: - if (tmp) - dput(tmp); - key_put(key); -error_scb: - kfree(scb); error: - _leave(" = %d", ret); - return ret; + return afs_put_operation(op); } /* diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 0a82b134aa0d..b14e3d9a25e2 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -12,6 +12,47 @@ #include #include "internal.h" +static void afs_silly_rename_success(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + + afs_vnode_commit_status(op, &op->file[0]); +} + +static void afs_silly_rename_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode *dvnode = dvp->vnode; + struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry)); + struct dentry *old = op->dentry; + struct dentry *new = op->dentry_2; + + spin_lock(&old->d_lock); + old->d_flags |= DCACHE_NFSFS_RENAMED; + spin_unlock(&old->d_lock); + if (dvnode->silly_key != op->key) { + key_put(dvnode->silly_key); + dvnode->silly_key = key_get(op->key); + } + + down_write(&dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) { + afs_edit_dir_remove(dvnode, &old->d_name, + afs_edit_dir_for_silly_0); + afs_edit_dir_add(dvnode, &new->d_name, + &vnode->fid, afs_edit_dir_for_silly_1); + } + up_write(&dvnode->validate_lock); +} + +static const struct afs_operation_ops afs_silly_rename_operation = { + .issue_afs_rpc = afs_fs_rename, + .issue_yfs_rpc = yfs_fs_rename, + .success = afs_silly_rename_success, + .edit_dir = afs_silly_rename_edit_dir, +}; + /* * Actually perform the silly rename step. 
*/ @@ -19,56 +60,22 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode struct dentry *old, struct dentry *new, struct key *key) { - struct afs_operation fc; - struct afs_status_cb *scb; - afs_dataversion_t dir_data_version; - int ret = -ERESTARTSYS; + struct afs_operation *op; _enter("%pd,%pd", old, new); - scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - return -ENOMEM; + op = afs_alloc_operation(key, dvnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); + + afs_op_set_vnode(op, 0, dvnode); + + op->dentry = old; + op->dentry_2 = new; + op->ops = &afs_silly_rename_operation; trace_afs_silly_rename(vnode, false); - if (afs_begin_vnode_operation(&fc, dvnode, key, true)) { - dir_data_version = dvnode->status.data_version + 1; - - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(dvnode); - afs_fs_rename(&fc, old->d_name.name, - dvnode, new->d_name.name, - scb, scb); - } - - afs_vnode_commit_status(&fc, dvnode, fc.cb_break, - &dir_data_version, scb); - ret = afs_end_vnode_operation(&fc); - } - - if (ret == 0) { - spin_lock(&old->d_lock); - old->d_flags |= DCACHE_NFSFS_RENAMED; - spin_unlock(&old->d_lock); - if (dvnode->silly_key != key) { - key_put(dvnode->silly_key); - dvnode->silly_key = key_get(key); - } - - down_write(&dvnode->validate_lock); - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && - dvnode->status.data_version == dir_data_version) { - afs_edit_dir_remove(dvnode, &old->d_name, - afs_edit_dir_for_silly_0); - afs_edit_dir_add(dvnode, &new->d_name, - &vnode->fid, afs_edit_dir_for_silly_1); - } - up_write(&dvnode->validate_lock); - } - - kfree(scb); - _leave(" = %d", ret); - return ret; + return afs_do_sync_operation(op); } /** @@ -139,65 +146,66 @@ out: return ret; } +static void afs_silly_unlink_success(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[1].vnode; + + _enter("op=%08x", op->debug_id); + afs_check_for_remote_deletion(op, op->file[0].vnode); 
+ afs_vnode_commit_status(op, &op->file[0]); + afs_vnode_commit_status(op, &op->file[1]); + afs_update_dentry_version(op, &op->file[0], op->dentry); + + drop_nlink(&vnode->vfs_inode); + if (vnode->vfs_inode.i_nlink == 0) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + } +} + +static void afs_silly_unlink_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode *dvnode = dvp->vnode; + + _enter("op=%08x", op->debug_id); + down_write(&dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) + afs_edit_dir_remove(dvnode, &op->dentry->d_name, + afs_edit_dir_for_unlink); + up_write(&dvnode->validate_lock); +} + +static const struct afs_operation_ops afs_silly_unlink_operation = { + .issue_afs_rpc = afs_fs_remove_file, + .issue_yfs_rpc = yfs_fs_remove_file, + .success = afs_silly_unlink_success, + .edit_dir = afs_silly_unlink_edit_dir, +}; + /* * Tell the server to remove a sillyrename file. 
*/ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode, struct dentry *dentry, struct key *key) { - struct afs_operation fc; - struct afs_status_cb *scb; - int ret = -ERESTARTSYS; + struct afs_operation *op; _enter(""); - scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - return -ENOMEM; + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); + + afs_op_set_vnode(op, 0, dvnode); + afs_op_set_vnode(op, 1, vnode); + + op->dentry = dentry; + op->ops = &afs_silly_unlink_operation; trace_afs_silly_rename(vnode, true); - if (afs_begin_vnode_operation(&fc, dvnode, key, false)) { - afs_dataversion_t dir_data_version = dvnode->status.data_version + 1; - - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(dvnode); - - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc.cbi->server->flags) && - !test_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags)) { - yfs_fs_remove_file2(&fc, vnode, dentry->d_name.name, - &scb[0], &scb[1]); - if (fc.ac.error != -ECONNABORTED || - fc.ac.abort_code != RXGEN_OPCODE) - continue; - set_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags); - } - - afs_fs_remove(&fc, vnode, dentry->d_name.name, false, &scb[0]); - } - - afs_vnode_commit_status(&fc, dvnode, fc.cb_break, - &dir_data_version, &scb[0]); - ret = afs_end_vnode_operation(&fc); - if (ret == 0) { - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { - set_bit(AFS_VNODE_DELETED, &vnode->flags); - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); - } - } - if (ret == 0) { - down_write(&dvnode->validate_lock); - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && - dvnode->status.data_version == dir_data_version) - afs_edit_dir_remove(dvnode, &dentry->d_name, - afs_edit_dir_for_unlink); - up_write(&dvnode->validate_lock); - } - } - - kfree(scb); - _leave(" = %d", ret); - return ret; + return afs_do_sync_operation(op); } /* diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 
7503899c0a1b..b79879aacc02 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -10,6 +10,99 @@ #include #include "internal.h" +static atomic_t afs_autocell_ino; + +/* + * iget5() comparator for inode created by autocell operations + * + * These pseudo inodes don't match anything. + */ +static int afs_iget5_pseudo_test(struct inode *inode, void *opaque) +{ + return 0; +} + +/* + * iget5() inode initialiser + */ +static int afs_iget5_pseudo_set(struct inode *inode, void *opaque) +{ + struct afs_super_info *as = AFS_FS_S(inode->i_sb); + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_fid *fid = opaque; + + vnode->volume = as->volume; + vnode->fid = *fid; + inode->i_ino = fid->vnode; + inode->i_generation = fid->unique; + return 0; +} + +/* + * Create an inode for a dynamic root directory or an autocell dynamic + * automount dir. + */ +struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) +{ + struct afs_super_info *as = AFS_FS_S(sb); + struct afs_vnode *vnode; + struct inode *inode; + struct afs_fid fid = {}; + + _enter(""); + + if (as->volume) + fid.vid = as->volume->vid; + if (root) { + fid.vnode = 1; + fid.unique = 1; + } else { + fid.vnode = atomic_inc_return(&afs_autocell_ino); + fid.unique = 0; + } + + inode = iget5_locked(sb, fid.vnode, + afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); + if (!inode) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + _debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }", + inode, inode->i_ino, fid.vid, fid.vnode, fid.unique); + + vnode = AFS_FS_I(inode); + + /* there shouldn't be an existing inode */ + BUG_ON(!(inode->i_state & I_NEW)); + + inode->i_size = 0; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + if (root) { + inode->i_op = &afs_dynroot_inode_operations; + inode->i_fop = &simple_dir_operations; + } else { + inode->i_op = &afs_autocell_inode_operations; + } + set_nlink(inode, 2); + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_ctime = 
inode->i_atime = inode->i_mtime = current_time(inode); + inode->i_blocks = 0; + inode->i_generation = 0; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); + if (!root) { + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + inode->i_flags |= S_AUTOMOUNT; + } + + inode->i_flags |= S_NOATIME; + unlock_new_inode(inode); + _leave(" = %p", inode); + return inode; +} + /* * Probe to see if a cell may exist. This prevents positive dentries from * being created unnecessarily. diff --git a/fs/afs/file.c b/fs/afs/file.c index 0c0ccc1412ee..506c47471b42 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -69,7 +69,7 @@ static const struct vm_operations_struct afs_vm_ops = { */ void afs_put_wb_key(struct afs_wb_key *wbk) { - if (refcount_dec_and_test(&wbk->usage)) { + if (wbk && refcount_dec_and_test(&wbk->usage)) { key_put(wbk->key); kfree(wbk); } @@ -220,14 +220,35 @@ static void afs_file_readpage_read_complete(struct page *page, } #endif +static void afs_fetch_data_success(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; + + _enter("op=%08x", op->debug_id); + afs_check_for_remote_deletion(op, vnode); + afs_vnode_commit_status(op, &op->file[0]); + afs_stat_v(vnode, n_fetches); + atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes); +} + +static void afs_fetch_data_put(struct afs_operation *op) +{ + afs_put_read(op->fetch.req); +} + +static const struct afs_operation_ops afs_fetch_data_operation = { + .issue_afs_rpc = afs_fs_fetch_data, + .issue_yfs_rpc = yfs_fs_fetch_data, + .success = afs_fetch_data_success, + .put = afs_fetch_data_put, +}; + /* * Fetch file data from the volume. 
*/ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *req) { - struct afs_operation fc; - struct afs_status_cb *scb; - int ret; + struct afs_operation *op; _enter("%s{%llx:%llu.%u},%x,,,", vnode->volume->name, @@ -236,34 +257,15 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *re vnode->fid.unique, key_serial(key)); - scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - return -ENOMEM; + op = afs_alloc_operation(key, vnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, key, true)) { - afs_dataversion_t data_version = vnode->status.data_version; + afs_op_set_vnode(op, 0, vnode); - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - afs_fs_fetch_data(&fc, scb, req); - } - - afs_check_for_remote_deletion(&fc, vnode); - afs_vnode_commit_status(&fc, vnode, fc.cb_break, - &data_version, scb); - ret = afs_end_vnode_operation(&fc); - } - - if (ret == 0) { - afs_stat_v(vnode, n_fetches); - atomic_long_add(req->actual_len, - &afs_v2net(vnode)->n_fetch_bytes); - } - - kfree(scb); - _leave(" = %d", ret); - return ret; + op->fetch.req = afs_get_read(req); + op->ops = &afs_fetch_data_operation; + return afs_do_sync_operation(op); } /* diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 682fe745f10e..70e518f7bc19 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -70,7 +70,8 @@ static void afs_schedule_lock_extension(struct afs_vnode *vnode) */ void afs_lock_op_done(struct afs_call *call) { - struct afs_vnode *vnode = call->lvnode; + struct afs_operation *op = call->op; + struct afs_vnode *vnode = op->lock.lvnode; if (call->error == 0) { spin_lock(&vnode->lock); @@ -172,15 +173,28 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode) vnode->lock_key = NULL; } +static void afs_lock_success(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; + + _enter("op=%08x", 
op->debug_id); + afs_check_for_remote_deletion(op, vnode); + afs_vnode_commit_status(op, &op->file[0]); +} + +static const struct afs_operation_ops afs_set_lock_operation = { + .issue_afs_rpc = afs_fs_set_lock, + .issue_yfs_rpc = yfs_fs_set_lock, + .success = afs_lock_success, +}; + /* * Get a lock on a file */ static int afs_set_lock(struct afs_vnode *vnode, struct key *key, afs_lock_type_t type) { - struct afs_status_cb *scb; - struct afs_operation fc; - int ret; + struct afs_operation *op; _enter("%s{%llx:%llu.%u},%x,%u", vnode->volume->name, @@ -189,35 +203,29 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key, vnode->fid.unique, key_serial(key), type); - scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - return -ENOMEM; + op = afs_alloc_operation(key, vnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, key, true)) { - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - afs_fs_set_lock(&fc, type, scb); - } + afs_op_set_vnode(op, 0, vnode); - afs_check_for_remote_deletion(&fc, vnode); - afs_vnode_commit_status(&fc, vnode, fc.cb_break, NULL, scb); - ret = afs_end_vnode_operation(&fc); - } - - kfree(scb); - _leave(" = %d", ret); - return ret; + op->lock.type = type; + op->ops = &afs_set_lock_operation; + return afs_do_sync_operation(op); } +static const struct afs_operation_ops afs_extend_lock_operation = { + .issue_afs_rpc = afs_fs_extend_lock, + .issue_yfs_rpc = yfs_fs_extend_lock, + .success = afs_lock_success, +}; + /* * Extend a lock on a file */ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) { - struct afs_status_cb *scb; - struct afs_operation fc; - int ret; + struct afs_operation *op; _enter("%s{%llx:%llu.%u},%x", vnode->volume->name, @@ -226,35 +234,29 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) vnode->fid.unique, key_serial(key)); - scb = kzalloc(sizeof(struct 
afs_status_cb), GFP_KERNEL); - if (!scb) - return -ENOMEM; + op = afs_alloc_operation(key, vnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, key, false)) { - while (afs_select_current_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - afs_fs_extend_lock(&fc, scb); - } + afs_op_set_vnode(op, 0, vnode); - afs_check_for_remote_deletion(&fc, vnode); - afs_vnode_commit_status(&fc, vnode, fc.cb_break, NULL, scb); - ret = afs_end_vnode_operation(&fc); - } - - kfree(scb); - _leave(" = %d", ret); - return ret; + op->flags |= AFS_OPERATION_UNINTR; + op->ops = &afs_extend_lock_operation; + return afs_do_sync_operation(op); } +static const struct afs_operation_ops afs_release_lock_operation = { + .issue_afs_rpc = afs_fs_release_lock, + .issue_yfs_rpc = yfs_fs_release_lock, + .success = afs_lock_success, +}; + /* * Release a lock on a file */ static int afs_release_lock(struct afs_vnode *vnode, struct key *key) { - struct afs_status_cb *scb; - struct afs_operation fc; - int ret; + struct afs_operation *op; _enter("%s{%llx:%llu.%u},%x", vnode->volume->name, @@ -263,25 +265,15 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key) vnode->fid.unique, key_serial(key)); - scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - return -ENOMEM; + op = afs_alloc_operation(key, vnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, key, false)) { - while (afs_select_current_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - afs_fs_release_lock(&fc, scb); - } + afs_op_set_vnode(op, 0, vnode); - afs_check_for_remote_deletion(&fc, vnode); - afs_vnode_commit_status(&fc, vnode, fc.cb_break, NULL, scb); - ret = afs_end_vnode_operation(&fc); - } - - kfree(scb); - _leave(" = %d", ret); - return ret; + op->flags |= AFS_OPERATION_UNINTR; + op->ops = &afs_release_lock_operation; + return 
afs_do_sync_operation(op); } /* diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c new file mode 100644 index 000000000000..f7a768d12141 --- /dev/null +++ b/fs/afs/fs_operation.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Fileserver-directed operation handling. + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include "internal.h" + +static atomic_t afs_operation_debug_counter; + +/* + * Create an operation against a volume. + */ +struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *volume) +{ + struct afs_operation *op; + + _enter(""); + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) + return ERR_PTR(-ENOMEM); + + if (!key) { + key = afs_request_key(volume->cell); + if (IS_ERR(key)) { + kfree(op); + return ERR_CAST(key); + } + } else { + key_get(key); + } + + op->key = key; + op->volume = afs_get_volume(volume); + op->net = volume->cell->net; + op->cb_v_break = volume->cb_v_break; + op->debug_id = atomic_inc_return(&afs_operation_debug_counter); + op->error = -EDESTADDRREQ; + op->ac.error = SHRT_MAX; + + _leave(" = [op=%08x]", op->debug_id); + return op; +} + +/* + * Lock the vnode(s) being operated upon. 
+ */ +static bool afs_get_io_locks(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; + struct afs_vnode *vnode2 = op->file[1].vnode; + + _enter(""); + + if (op->flags & AFS_OPERATION_UNINTR) { + mutex_lock(&vnode->io_lock); + op->flags |= AFS_OPERATION_LOCK_0; + _leave(" = t [1]"); + return true; + } + + if (!vnode2 || !op->file[1].need_io_lock || vnode == vnode2) + vnode2 = NULL; + + if (vnode2 > vnode) + swap(vnode, vnode2); + + if (mutex_lock_interruptible(&vnode->io_lock) < 0) { + op->error = -EINTR; + op->flags |= AFS_OPERATION_STOP; + _leave(" = f [I 0]"); + return false; + } + op->flags |= AFS_OPERATION_LOCK_0; + + if (vnode2) { + if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) { + op->error = -EINTR; + op->flags |= AFS_OPERATION_STOP; + mutex_unlock(&vnode->io_lock); + op->flags &= ~AFS_OPERATION_LOCK_0; + _leave(" = f [I 1]"); + return false; + } + op->flags |= AFS_OPERATION_LOCK_1; + } + + _leave(" = t [2]"); + return true; +} + +static void afs_drop_io_locks(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; + struct afs_vnode *vnode2 = op->file[1].vnode; + + _enter(""); + + if (op->flags & AFS_OPERATION_LOCK_1) + mutex_unlock(&vnode2->io_lock); + if (op->flags & AFS_OPERATION_LOCK_0) + mutex_unlock(&vnode->io_lock); +} + +static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *vp, + unsigned int index) +{ + struct afs_vnode *vnode = vp->vnode; + + if (vnode) { + vp->fid = vnode->fid; + vp->dv_before = vnode->status.data_version; + vp->cb_break_before = afs_calc_vnode_cb_break(vnode); + if (vnode->lock_state != AFS_VNODE_LOCK_NONE) + op->flags |= AFS_OPERATION_CUR_ONLY; + } + + if (vp->fid.vnode) + _debug("PREP[%u] {%llx:%llu.%u}", + index, vp->fid.vid, vp->fid.vnode, vp->fid.unique); +} + +/* + * Begin an operation on the fileserver. + * + * Fileserver operations are serialised on the server by vnode, so we serialise + * them here also using the io_lock. 
+ */ +bool afs_begin_vnode_operation(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; + + ASSERT(vnode); + + _enter(""); + + if (op->file[0].need_io_lock) + if (!afs_get_io_locks(op)) + return false; + + read_seqlock_excl(&vnode->cb_lock); + op->cbi = afs_get_cb_interest( + rcu_dereference_protected(vnode->cb_interest, + lockdep_is_held(&vnode->cb_lock.lock))); + read_sequnlock_excl(&vnode->cb_lock); + + afs_prepare_vnode(op, &op->file[0], 0); + afs_prepare_vnode(op, &op->file[1], 1); + op->cb_v_break = op->volume->cb_v_break; + _leave(" = true"); + return true; +} + +/* + * Tidy up a filesystem cursor and unlock the vnode. + */ +static void afs_end_vnode_operation(struct afs_operation *op) +{ + _enter(""); + + if (op->error == -EDESTADDRREQ || + op->error == -EADDRNOTAVAIL || + op->error == -ENETUNREACH || + op->error == -EHOSTUNREACH) + afs_dump_edestaddrreq(op); + + afs_drop_io_locks(op); + + if (op->error == -ECONNABORTED) + op->error = afs_abort_to_error(op->ac.abort_code); +} + +/* + * Wait for an in-progress operation to complete. + */ +void afs_wait_for_operation(struct afs_operation *op) +{ + _enter(""); + + while (afs_select_fileserver(op)) { + op->cb_s_break = op->cbi->server->cb_s_break; + if (test_bit(AFS_SERVER_FL_IS_YFS, &op->cbi->server->flags) && + op->ops->issue_yfs_rpc) + op->ops->issue_yfs_rpc(op); + else + op->ops->issue_afs_rpc(op); + + op->error = afs_wait_for_call_to_complete(op->call, &op->ac); + } + + if (op->error == 0) { + _debug("success"); + op->ops->success(op); + } + + afs_end_vnode_operation(op); + + if (op->error == 0 && op->ops->edit_dir) { + _debug("edit_dir"); + op->ops->edit_dir(op); + } + _leave(""); +} + +/* + * Dispose of an operation. 
+ */ +int afs_put_operation(struct afs_operation *op) +{ + int i, ret = op->error; + + _enter("op=%08x,%d", op->debug_id, ret); + + if (op->ops && op->ops->put) + op->ops->put(op); + if (op->file[0].put_vnode) + iput(&op->file[0].vnode->vfs_inode); + if (op->file[1].put_vnode) + iput(&op->file[1].vnode->vfs_inode); + + if (op->more_files) { + for (i = 0; i < op->nr_files - 2; i++) + if (op->more_files[i].put_vnode) + iput(&op->more_files[i].vnode->vfs_inode); + kfree(op->more_files); + } + + afs_end_cursor(&op->ac); + afs_put_cb_interest(op->net, op->cbi); + afs_put_serverlist(op->net, op->server_list); + afs_put_volume(op->net, op->volume); + kfree(op); + return ret; +} + +int afs_do_sync_operation(struct afs_operation *op) +{ + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + return afs_put_operation(op); +} diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 3e423e9daa24..b1b45f10583d 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -13,12 +13,6 @@ #include "internal.h" #include "afs_fs.h" #include "xdr_fs.h" -#include "protocol_yfs.h" - -static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi) -{ - call->cbi = afs_get_cb_interest(cbi); -} /* * decode an AFSFid block @@ -240,8 +234,10 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, /* * deliver reply data to an FS.FetchStatus */ -static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call) +static int afs_deliver_fs_fetch_status(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; const __be32 *bp; int ret; @@ -251,9 +247,9 @@ static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_AFSCallBack(&bp, call, call->out_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); + 
xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSCallBack(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -262,54 +258,39 @@ static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call) /* * FS.FetchStatus operation type */ -static const struct afs_call_type afs_RXFSFetchStatus_vnode = { - .name = "FS.FetchStatus(vnode)", +static const struct afs_call_type afs_RXFSFetchStatus = { + .name = "FS.FetchStatus", .op = afs_FS_FetchStatus, - .deliver = afs_deliver_fs_fetch_status_vnode, + .deliver = afs_deliver_fs_fetch_status, .destructor = afs_flat_call_destructor, }; /* * fetch the status information for a file */ -int afs_fs_fetch_file_status(struct afs_operation *fc, struct afs_status_cb *scb, - struct afs_volsync *volsync) +void afs_fs_fetch_status(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_fetch_file_status(fc, scb, volsync); - _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus_vnode, + call = afs_alloc_flat_call(op->net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); - if (!call) { - fc->ac.error = -ENOMEM; - return -ENOMEM; - } - - call->key = fc->key; - call->out_scb = scb; - call->out_volsync = volsync; + if (!call) + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHSTATUS); - bp[1] = htonl(vnode->fid.vid); - bp[2] = htonl(vnode->fid.vnode); - bp[3] = htonl(vnode->fid.unique); + bp[1] = htonl(vp->fid.vid); + bp[2] = htonl(vp->fid.vnode); + bp[3] = htonl(vp->fid.unique); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - - 
afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -317,7 +298,9 @@ int afs_fs_fetch_file_status(struct afs_operation *fc, struct afs_status_cb *scb */ static int afs_deliver_fs_fetch_data(struct afs_call *call) { - struct afs_read *req = call->read_request; + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[0]; + struct afs_read *req = op->fetch.req; const __be32 *bp; unsigned int size; int ret; @@ -414,12 +397,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_AFSCallBack(&bp, call, call->out_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSCallBack(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); - req->data_version = call->out_scb->status.data_version; - req->file_size = call->out_scb->status.size; + req->data_version = vp->scb.status.data_version; + req->file_size = vp->scb.status.size; call->unmarshall++; @@ -442,14 +425,6 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) return 0; } -static void afs_fetch_data_destructor(struct afs_call *call) -{ - struct afs_read *req = call->read_request; - - afs_put_read(req); - afs_flat_call_destructor(call); -} - /* * FS.FetchData operation type */ @@ -457,102 +432,79 @@ static const struct afs_call_type afs_RXFSFetchData = { .name = "FS.FetchData", .op = afs_FS_FetchData, .deliver = afs_deliver_fs_fetch_data, - .destructor = afs_fetch_data_destructor, + .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSFetchData64 = { .name = "FS.FetchData64", .op = afs_FS_FetchData64, .deliver = afs_deliver_fs_fetch_data, - .destructor = afs_fetch_data_destructor, + .destructor = 
afs_flat_call_destructor, }; /* * fetch data from a very large file */ -static int afs_fs_fetch_data64(struct afs_operation *fc, - struct afs_status_cb *scb, - struct afs_read *req) +static void afs_fs_fetch_data64(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; + struct afs_read *req = op->fetch.req; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(""); - call = afs_alloc_flat_call(net, &afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_scb = scb; - call->out_volsync = NULL; - call->read_request = afs_get_read(req); + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA64); - bp[1] = htonl(vnode->fid.vid); - bp[2] = htonl(vnode->fid.vnode); - bp[3] = htonl(vnode->fid.unique); + bp[1] = htonl(vp->fid.vid); + bp[2] = htonl(vp->fid.vnode); + bp[3] = htonl(vp->fid.unique); bp[4] = htonl(upper_32_bits(req->pos)); bp[5] = htonl(lower_32_bits(req->pos)); bp[6] = 0; bp[7] = htonl(lower_32_bits(req->len)); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * fetch data from a file */ -int afs_fs_fetch_data(struct afs_operation *fc, - struct afs_status_cb *scb, - struct afs_read *req) +void afs_fs_fetch_data(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); + struct afs_read *req = op->fetch.req; __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_fetch_data(fc, scb, req); - if 
(upper_32_bits(req->pos) || upper_32_bits(req->len) || upper_32_bits(req->pos + req->len)) - return afs_fs_fetch_data64(fc, scb, req); + return afs_fs_fetch_data64(op); _enter(""); - call = afs_alloc_flat_call(net, &afs_RXFSFetchData, 24, (21 + 3 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSFetchData, 24, (21 + 3 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_scb = scb; - call->out_volsync = NULL; - call->read_request = afs_get_read(req); + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA); - bp[1] = htonl(vnode->fid.vid); - bp[2] = htonl(vnode->fid.vnode); - bp[3] = htonl(vnode->fid.unique); + bp[1] = htonl(vp->fid.vid); + bp[2] = htonl(vp->fid.vnode); + bp[3] = htonl(vp->fid.unique); bp[4] = htonl(lower_32_bits(req->pos)); bp[5] = htonl(lower_32_bits(req->len)); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -560,6 +512,9 @@ int afs_fs_fetch_data(struct afs_operation *fc, */ static int afs_deliver_fs_create_vnode(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; const __be32 *bp; int ret; @@ -569,11 +524,11 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFid(&bp, call->out_fid); - xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); - xdr_decode_AFSCallBack(&bp, call, call->out_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); + xdr_decode_AFSFid(&bp, &op->file[1].fid); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSFetchStatus(&bp, 
call, &dvp->scb); + xdr_decode_AFSCallBack(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -589,6 +544,52 @@ static const struct afs_call_type afs_RXFSCreateFile = { .destructor = afs_flat_call_destructor, }; +/* + * Create a file. + */ +void afs_fs_create_file(struct afs_operation *op) +{ + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = name->len; + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz + (6 * 4); + + call = afs_alloc_flat_call(op->net, &afs_RXFSCreateFile, + reqsz, (3 + 21 + 21 + 3 + 6) * 4); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSCREATEFILE); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name->name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME); + *bp++ = htonl(op->mtime.tv_sec); /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */ + *bp++ = 0; /* segment size */ + + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); +} + static const struct afs_call_type afs_RXFSMakeDir = { .name = "FS.MakeDir", .op = afs_FS_MakeDir, @@ -597,80 +598,58 @@ static const struct afs_call_type afs_RXFSMakeDir = { }; /* - * create a file or make a directory + * Create a new directory */ -int afs_fs_create(struct afs_operation *fc, - const char *name, - umode_t mode, - struct afs_status_cb *dvnode_scb, - struct afs_fid *newfid, - struct afs_status_cb *new_scb) +void afs_fs_make_dir(struct afs_operation *op) { - struct afs_vnode *dvnode = fc->vnode; + const struct qstr *name = 
&op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(dvnode); size_t namesz, reqsz, padsz; __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)){ - if (S_ISDIR(mode)) - return yfs_fs_make_dir(fc, name, mode, dvnode_scb, - newfid, new_scb); - else - return yfs_fs_create_file(fc, name, mode, dvnode_scb, - newfid, new_scb); - } - _enter(""); - namesz = strlen(name); + namesz = name->len; padsz = (4 - (namesz & 3)) & 3; reqsz = (5 * 4) + namesz + padsz + (6 * 4); - call = afs_alloc_flat_call( - net, S_ISDIR(mode) ? &afs_RXFSMakeDir : &afs_RXFSCreateFile, - reqsz, (3 + 21 + 21 + 3 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSMakeDir, + reqsz, (3 + 21 + 21 + 3 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = dvnode_scb; - call->out_fid = newfid; - call->out_scb = new_scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; - *bp++ = htonl(S_ISDIR(mode) ? 
FSMAKEDIR : FSCREATEFILE); - *bp++ = htonl(dvnode->fid.vid); - *bp++ = htonl(dvnode->fid.vnode); - *bp++ = htonl(dvnode->fid.unique); + *bp++ = htonl(FSMAKEDIR); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); *bp++ = htonl(namesz); - memcpy(bp, name, namesz); + memcpy(bp, name->name, namesz); bp = (void *) bp + namesz; if (padsz > 0) { memset(bp, 0, padsz); bp = (void *) bp + padsz; } *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME); - *bp++ = htonl(dvnode->vfs_inode.i_mtime.tv_sec); /* mtime */ + *bp++ = htonl(op->mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ - *bp++ = htonl(mode & S_IALLUGO); /* unix mode */ + *bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */ *bp++ = 0; /* segment size */ - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call1(call, &dvnode->fid, name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* - * Deliver reply data to any operation that returns directory status and volume - * sync. + * Deliver reply data to any operation that returns status and volume sync. 
*/ -static int afs_deliver_fs_dir_status_and_vol(struct afs_call *call) +static int afs_deliver_fs_file_status_and_vol(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[0]; const __be32 *bp; int ret; @@ -680,79 +659,108 @@ static int afs_deliver_fs_dir_status_and_vol(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; } /* - * FS.RemoveDir/FS.RemoveFile operation type + * FS.RemoveFile operation type */ static const struct afs_call_type afs_RXFSRemoveFile = { .name = "FS.RemoveFile", .op = afs_FS_RemoveFile, - .deliver = afs_deliver_fs_dir_status_and_vol, - .destructor = afs_flat_call_destructor, -}; - -static const struct afs_call_type afs_RXFSRemoveDir = { - .name = "FS.RemoveDir", - .op = afs_FS_RemoveDir, - .deliver = afs_deliver_fs_dir_status_and_vol, + .deliver = afs_deliver_fs_file_status_and_vol, .destructor = afs_flat_call_destructor, }; /* - * remove a file or directory + * Remove a file. */ -int afs_fs_remove(struct afs_operation *fc, struct afs_vnode *vnode, - const char *name, bool isdir, struct afs_status_cb *dvnode_scb) +void afs_fs_remove_file(struct afs_operation *op) { - struct afs_vnode *dvnode = fc->vnode; + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(dvnode); size_t namesz, reqsz, padsz; __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_remove(fc, vnode, name, isdir, dvnode_scb); - _enter(""); - namesz = strlen(name); + namesz = name->len; padsz = (4 - (namesz & 3)) & 3; reqsz = (5 * 4) + namesz + padsz; - call = afs_alloc_flat_call( - net, isdir ? 
&afs_RXFSRemoveDir : &afs_RXFSRemoveFile, - reqsz, (21 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSRemoveFile, + reqsz, (21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = dvnode_scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; - *bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE); - *bp++ = htonl(dvnode->fid.vid); - *bp++ = htonl(dvnode->fid.vnode); - *bp++ = htonl(dvnode->fid.unique); + *bp++ = htonl(FSREMOVEFILE); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); *bp++ = htonl(namesz); - memcpy(bp, name, namesz); + memcpy(bp, name->name, namesz); bp = (void *) bp + namesz; if (padsz > 0) { memset(bp, 0, padsz); bp = (void *) bp + padsz; } - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call1(call, &dvnode->fid, name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); +} + +static const struct afs_call_type afs_RXFSRemoveDir = { + .name = "FS.RemoveDir", + .op = afs_FS_RemoveDir, + .deliver = afs_deliver_fs_file_status_and_vol, + .destructor = afs_flat_call_destructor, +}; + +/* + * Remove a directory. 
+ */ +void afs_fs_remove_dir(struct afs_operation *op) +{ + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = name->len; + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz; + + call = afs_alloc_flat_call(op->net, &afs_RXFSRemoveDir, + reqsz, (21 + 6) * 4); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSREMOVEDIR); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name->name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -760,6 +768,9 @@ int afs_fs_remove(struct afs_operation *fc, struct afs_vnode *vnode, */ static int afs_deliver_fs_link(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; const __be32 *bp; int ret; @@ -771,9 +782,9 @@ static int afs_deliver_fs_link(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -792,56 +803,44 @@ static const struct afs_call_type afs_RXFSLink = { /* * make a hard link */ -int afs_fs_link(struct afs_operation *fc, struct afs_vnode *vnode, - const char *name, - struct afs_status_cb *dvnode_scb, - struct afs_status_cb *vnode_scb) +void afs_fs_link(struct afs_operation *op) { 
- struct afs_vnode *dvnode = fc->vnode; + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); size_t namesz, reqsz, padsz; __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_link(fc, vnode, name, dvnode_scb, vnode_scb); - _enter(""); - namesz = strlen(name); + namesz = name->len; padsz = (4 - (namesz & 3)) & 3; reqsz = (5 * 4) + namesz + padsz + (3 * 4); - call = afs_alloc_flat_call(net, &afs_RXFSLink, reqsz, (21 + 21 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSLink, reqsz, (21 + 21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = dvnode_scb; - call->out_scb = vnode_scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSLINK); - *bp++ = htonl(dvnode->fid.vid); - *bp++ = htonl(dvnode->fid.vnode); - *bp++ = htonl(dvnode->fid.unique); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); *bp++ = htonl(namesz); - memcpy(bp, name, namesz); + memcpy(bp, name->name, namesz); bp = (void *) bp + namesz; if (padsz > 0) { memset(bp, 0, padsz); bp = (void *) bp + padsz; } - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call1(call, &vnode->fid, name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call1(call, &vp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -849,6 +848,9 @@ int afs_fs_link(struct afs_operation *fc, struct afs_vnode *vnode, */ static int afs_deliver_fs_symlink(struct afs_call *call) { + struct afs_operation *op = call->op; 
+ struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; const __be32 *bp; int ret; @@ -860,10 +862,10 @@ static int afs_deliver_fs_symlink(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFid(&bp, call->out_fid); - xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); + xdr_decode_AFSFid(&bp, &vp->fid); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -882,75 +884,58 @@ static const struct afs_call_type afs_RXFSSymlink = { /* * create a symbolic link */ -int afs_fs_symlink(struct afs_operation *fc, - const char *name, - const char *contents, - struct afs_status_cb *dvnode_scb, - struct afs_fid *newfid, - struct afs_status_cb *new_scb) +void afs_fs_symlink(struct afs_operation *op) { - struct afs_vnode *dvnode = fc->vnode; + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(dvnode); size_t namesz, reqsz, padsz, c_namesz, c_padsz; __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_symlink(fc, name, contents, dvnode_scb, - newfid, new_scb); - _enter(""); - namesz = strlen(name); + namesz = name->len; padsz = (4 - (namesz & 3)) & 3; - c_namesz = strlen(contents); + c_namesz = strlen(op->create.symlink); c_padsz = (4 - (c_namesz & 3)) & 3; reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4); - call = afs_alloc_flat_call(net, &afs_RXFSSymlink, reqsz, + call = afs_alloc_flat_call(op->net, &afs_RXFSSymlink, reqsz, (3 + 21 + 21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = dvnode_scb; - call->out_fid = newfid; - call->out_scb = new_scb; + return 
afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSYMLINK); - *bp++ = htonl(dvnode->fid.vid); - *bp++ = htonl(dvnode->fid.vnode); - *bp++ = htonl(dvnode->fid.unique); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); *bp++ = htonl(namesz); - memcpy(bp, name, namesz); + memcpy(bp, name->name, namesz); bp = (void *) bp + namesz; if (padsz > 0) { memset(bp, 0, padsz); bp = (void *) bp + padsz; } *bp++ = htonl(c_namesz); - memcpy(bp, contents, c_namesz); + memcpy(bp, op->create.symlink, c_namesz); bp = (void *) bp + c_namesz; if (c_padsz > 0) { memset(bp, 0, c_padsz); bp = (void *) bp + c_padsz; } *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME); - *bp++ = htonl(dvnode->vfs_inode.i_mtime.tv_sec); /* mtime */ + *bp++ = htonl(op->mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = htonl(S_IRWXUGO); /* unix mode */ *bp++ = 0; /* segment size */ - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call1(call, &dvnode->fid, name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -958,6 +943,9 @@ int afs_fs_symlink(struct afs_operation *fc, */ static int afs_deliver_fs_rename(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; const __be32 *bp; int ret; @@ -969,9 +957,9 @@ static int afs_deliver_fs_rename(struct afs_call *call) /* If the two dirs are the same, we have two copies of the same status * report, so we just decode it twice. 
*/ - xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb); - xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); + xdr_decode_AFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_AFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -990,31 +978,22 @@ static const struct afs_call_type afs_RXFSRename = { /* * Rename/move a file or directory. */ -int afs_fs_rename(struct afs_operation *fc, - const char *orig_name, - struct afs_vnode *new_dvnode, - const char *new_name, - struct afs_status_cb *orig_dvnode_scb, - struct afs_status_cb *new_dvnode_scb) +void afs_fs_rename(struct afs_operation *op) { - struct afs_vnode *orig_dvnode = fc->vnode; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; struct afs_call *call; - struct afs_net *net = afs_v2net(orig_dvnode); size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz; __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_rename(fc, orig_name, - new_dvnode, new_name, - orig_dvnode_scb, - new_dvnode_scb); - _enter(""); - o_namesz = strlen(orig_name); + o_namesz = orig_name->len; o_padsz = (4 - (o_namesz & 3)) & 3; - n_namesz = strlen(new_name); + n_namesz = new_name->len; n_padsz = (4 - (n_namesz & 3)) & 3; reqsz = (4 * 4) + @@ -1022,51 +1001,46 @@ int afs_fs_rename(struct afs_operation *fc, (3 * 4) + 4 + n_namesz + n_padsz; - call = afs_alloc_flat_call(net, &afs_RXFSRename, reqsz, (21 + 21 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSRename, reqsz, (21 + 21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = orig_dvnode_scb; - call->out_scb = new_dvnode_scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSRENAME); - 
*bp++ = htonl(orig_dvnode->fid.vid); - *bp++ = htonl(orig_dvnode->fid.vnode); - *bp++ = htonl(orig_dvnode->fid.unique); + *bp++ = htonl(orig_dvp->fid.vid); + *bp++ = htonl(orig_dvp->fid.vnode); + *bp++ = htonl(orig_dvp->fid.unique); *bp++ = htonl(o_namesz); - memcpy(bp, orig_name, o_namesz); + memcpy(bp, orig_name->name, o_namesz); bp = (void *) bp + o_namesz; if (o_padsz > 0) { memset(bp, 0, o_padsz); bp = (void *) bp + o_padsz; } - *bp++ = htonl(new_dvnode->fid.vid); - *bp++ = htonl(new_dvnode->fid.vnode); - *bp++ = htonl(new_dvnode->fid.unique); + *bp++ = htonl(new_dvp->fid.vid); + *bp++ = htonl(new_dvp->fid.vnode); + *bp++ = htonl(new_dvp->fid.unique); *bp++ = htonl(n_namesz); - memcpy(bp, new_name, n_namesz); + memcpy(bp, new_name->name, n_namesz); bp = (void *) bp + n_namesz; if (n_padsz > 0) { memset(bp, 0, n_padsz); bp = (void *) bp + n_padsz; } - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call2(call, &orig_dvnode->fid, orig_name, new_name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); } /* - * deliver reply data to an FS.StoreData + * Deliver reply data to FS.StoreData or FS.StoreStatus */ static int afs_deliver_fs_store_data(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[0]; const __be32 *bp; int ret; @@ -1078,8 +1052,8 @@ static int afs_deliver_fs_store_data(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -1105,90 +1079,69 @@ static const struct afs_call_type afs_RXFSStoreData64 = { /* * store a set of pages to a very 
large file */ -static int afs_fs_store_data64(struct afs_operation *fc, - struct address_space *mapping, - pgoff_t first, pgoff_t last, - unsigned offset, unsigned to, - loff_t size, loff_t pos, loff_t i_size, - struct afs_status_cb *scb) +static void afs_fs_store_data64(struct afs_operation *op, + loff_t pos, loff_t size, loff_t i_size) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(net, &afs_RXFSStoreData64, + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData64, (4 + 6 + 3 * 2) * 4, (21 + 6) * 4); if (!call) - return -ENOMEM; + return afs_op_nomem(op); - call->key = fc->key; - call->mapping = mapping; - call->first = first; - call->last = last; - call->first_offset = offset; - call->last_to = to; call->send_pages = true; - call->out_scb = scb; /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSTOREDATA64); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); *bp++ = htonl(AFS_SET_MTIME); /* mask */ - *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */ + *bp++ = htonl(op->mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = 0; /* unix mode */ *bp++ = 0; /* segment size */ - *bp++ = htonl(pos >> 32); - *bp++ = htonl((u32) pos); - *bp++ = htonl(size >> 32); - *bp++ = htonl((u32) size); - *bp++ = htonl(i_size >> 32); - *bp++ = htonl((u32) i_size); + *bp++ = htonl(upper_32_bits(pos)); + *bp++ = htonl(lower_32_bits(pos)); + *bp++ = htonl(upper_32_bits(size)); + *bp++ = htonl(lower_32_bits(size)); + *bp++ = htonl(upper_32_bits(i_size)); + *bp++ = htonl(lower_32_bits(i_size)); - 
trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * store a set of pages */ -int afs_fs_store_data(struct afs_operation *fc, struct address_space *mapping, - pgoff_t first, pgoff_t last, - unsigned offset, unsigned to, - struct afs_status_cb *scb) +void afs_fs_store_data(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); loff_t size, pos, i_size; __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_store_data(fc, mapping, first, last, offset, to, scb); - _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - size = (loff_t)to - (loff_t)offset; - if (first != last) - size += (loff_t)(last - first) << PAGE_SHIFT; - pos = (loff_t)first << PAGE_SHIFT; - pos += offset; + size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset; + if (op->store.first != op->store.last) + size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT; + pos = (loff_t)op->store.first << PAGE_SHIFT; + pos += op->store.first_offset; - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vp->vnode->vfs_inode); if (pos + size > i_size) i_size = size + pos; @@ -1196,71 +1149,38 @@ int afs_fs_store_data(struct afs_operation *fc, struct address_space *mapping, (unsigned long long) size, (unsigned long long) pos, (unsigned long long) i_size); - if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32) - return afs_fs_store_data64(fc, mapping, first, last, offset, to, - size, pos, i_size, scb); + if (upper_32_bits(pos) || upper_32_bits(i_size) || upper_32_bits(size) || + upper_32_bits(pos + size)) + return 
afs_fs_store_data64(op, pos, size, i_size); - call = afs_alloc_flat_call(net, &afs_RXFSStoreData, + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData, (4 + 6 + 3) * 4, (21 + 6) * 4); if (!call) - return -ENOMEM; + return afs_op_nomem(op); - call->key = fc->key; - call->mapping = mapping; - call->first = first; - call->last = last; - call->first_offset = offset; - call->last_to = to; call->send_pages = true; - call->out_scb = scb; /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSTOREDATA); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); *bp++ = htonl(AFS_SET_MTIME); /* mask */ - *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */ + *bp++ = htonl(op->mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = 0; /* unix mode */ *bp++ = 0; /* segment size */ - *bp++ = htonl(pos); - *bp++ = htonl(size); - *bp++ = htonl(i_size); + *bp++ = htonl(lower_32_bits(pos)); + *bp++ = htonl(lower_32_bits(size)); + *bp++ = htonl(lower_32_bits(i_size)); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); -} - -/* - * deliver reply data to an FS.StoreStatus - */ -static int afs_deliver_fs_store_status(struct afs_call *call) -{ - const __be32 *bp; - int ret; - - _enter(""); - - ret = afs_transfer_reply(call); - if (ret < 0) - return ret; - - /* unmarshall the reply once we've received all of it */ - bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); - - _leave(" = 0 [done]"); - return 0; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1269,21 +1189,21 @@ static int afs_deliver_fs_store_status(struct afs_call *call) 
static const struct afs_call_type afs_RXFSStoreStatus = { .name = "FS.StoreStatus", .op = afs_FS_StoreStatus, - .deliver = afs_deliver_fs_store_status, + .deliver = afs_deliver_fs_store_data, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSStoreData_as_Status = { .name = "FS.StoreData", .op = afs_FS_StoreData, - .deliver = afs_deliver_fs_store_status, + .deliver = afs_deliver_fs_store_data, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSStoreData64_as_Status = { .name = "FS.StoreData64", .op = afs_FS_StoreData64, - .deliver = afs_deliver_fs_store_status, + .deliver = afs_deliver_fs_store_data, .destructor = afs_flat_call_destructor, }; @@ -1291,85 +1211,74 @@ static const struct afs_call_type afs_RXFSStoreData64_as_Status = { * set the attributes on a very large file, using FS.StoreData rather than * FS.StoreStatus so as to alter the file size also */ -static int afs_fs_setattr_size64(struct afs_operation *fc, struct iattr *attr, - struct afs_status_cb *scb) +static void afs_fs_setattr_size64(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); + struct iattr *attr = op->setattr.attr; __be32 *bp; _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); ASSERT(attr->ia_valid & ATTR_SIZE); - call = afs_alloc_flat_call(net, &afs_RXFSStoreData64_as_Status, + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData64_as_Status, (4 + 6 + 3 * 2) * 4, (21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_scb = scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSTOREDATA64); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = 
htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); xdr_encode_AFS_StoreStatus(&bp, attr); - *bp++ = htonl(attr->ia_size >> 32); /* position of start of write */ - *bp++ = htonl((u32) attr->ia_size); - *bp++ = 0; /* size of write */ + *bp++ = htonl(upper_32_bits(attr->ia_size)); /* position of start of write */ + *bp++ = htonl(lower_32_bits(attr->ia_size)); + *bp++ = 0; /* size of write */ *bp++ = 0; - *bp++ = htonl(attr->ia_size >> 32); /* new file length */ - *bp++ = htonl((u32) attr->ia_size); + *bp++ = htonl(upper_32_bits(attr->ia_size)); /* new file length */ + *bp++ = htonl(lower_32_bits(attr->ia_size)); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus * so as to alter the file size also */ -static int afs_fs_setattr_size(struct afs_operation *fc, struct iattr *attr, - struct afs_status_cb *scb) +static void afs_fs_setattr_size(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); + struct iattr *attr = op->setattr.attr; __be32 *bp; _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); ASSERT(attr->ia_valid & ATTR_SIZE); - if (attr->ia_size >> 32) - return afs_fs_setattr_size64(fc, attr, scb); + if (upper_32_bits(attr->ia_size)) + return afs_fs_setattr_size64(op); - call = afs_alloc_flat_call(net, &afs_RXFSStoreData_as_Status, + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData_as_Status, (4 + 6 + 3) * 4, (21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_scb = scb; + return afs_op_nomem(op); /* 
marshall the parameters */ bp = call->request; *bp++ = htonl(FSSTOREDATA); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); xdr_encode_AFS_StoreStatus(&bp, attr); @@ -1377,57 +1286,44 @@ static int afs_fs_setattr_size(struct afs_operation *fc, struct iattr *attr, *bp++ = 0; /* size of write */ *bp++ = htonl(attr->ia_size); /* new file length */ - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * set the attributes on a file, using FS.StoreData if there's a change in file * size, and FS.StoreStatus otherwise */ -int afs_fs_setattr(struct afs_operation *fc, struct iattr *attr, - struct afs_status_cb *scb) +void afs_fs_setattr(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); + struct iattr *attr = op->setattr.attr; __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_setattr(fc, attr, scb); - if (attr->ia_valid & ATTR_SIZE) - return afs_fs_setattr_size(fc, attr, scb); + return afs_fs_setattr_size(op); _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(net, &afs_RXFSStoreStatus, + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreStatus, (4 + 6) * 4, (21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_scb = scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSTORESTATUS); - *bp++ = htonl(vnode->fid.vid); - *bp++ = 
htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); - xdr_encode_AFS_StoreStatus(&bp, attr); + xdr_encode_AFS_StoreStatus(&bp, op->setattr.attr); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1435,6 +1331,7 @@ int afs_fs_setattr(struct afs_operation *fc, struct iattr *attr, */ static int afs_deliver_fs_get_volume_status(struct afs_call *call) { + struct afs_operation *op = call->op; const __be32 *bp; char *p; u32 size; @@ -1456,7 +1353,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSFetchVolumeStatus(&bp, call->out_volstatus); + xdr_decode_AFSFetchVolumeStatus(&bp, &op->volstatus.vs); call->unmarshall++; afs_extract_to_tmp(call); /* Fall through */ @@ -1569,37 +1466,26 @@ static const struct afs_call_type afs_RXFSGetVolumeStatus = { /* * fetch the status of a volume */ -int afs_fs_get_volume_status(struct afs_operation *fc, - struct afs_volume_status *vs) +void afs_fs_get_volume_status(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_get_volume_status(fc, vs); - _enter(""); - call = afs_alloc_flat_call(net, &afs_RXFSGetVolumeStatus, 2 * 4, + call = afs_alloc_flat_call(op->net, &afs_RXFSGetVolumeStatus, 2 * 4, max(12 * 4, AFSOPAQUEMAX + 1)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_volstatus = vs; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSGETVOLUMESTATUS); - 
bp[1] = htonl(vnode->fid.vid); + bp[1] = htonl(vp->fid.vid); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1607,6 +1493,7 @@ int afs_fs_get_volume_status(struct afs_operation *fc, */ static int afs_deliver_fs_xxxx_lock(struct afs_call *call) { + struct afs_operation *op = call->op; const __be32 *bp; int ret; @@ -1618,7 +1505,7 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSVolSync(&bp, call->out_volsync); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -1659,114 +1546,80 @@ static const struct afs_call_type afs_RXFSReleaseLock = { /* * Set a lock on a file */ -int afs_fs_set_lock(struct afs_operation *fc, afs_lock_type_t type, - struct afs_status_cb *scb) +void afs_fs_set_lock(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_set_lock(fc, type, scb); - _enter(""); - call = afs_alloc_flat_call(net, &afs_RXFSSetLock, 5 * 4, 6 * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSSetLock, 5 * 4, 6 * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->lvnode = vnode; - call->out_scb = scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSETLOCK); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); - *bp++ = htonl(type); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); + *bp++ = htonl(op->lock.type); - 
afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_calli(call, &vnode->fid, type); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_calli(call, &vp->fid, op->lock.type); + afs_make_op_call(op, call, GFP_NOFS); } /* * extend a lock on a file */ -int afs_fs_extend_lock(struct afs_operation *fc, struct afs_status_cb *scb) +void afs_fs_extend_lock(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_extend_lock(fc, scb); - _enter(""); - call = afs_alloc_flat_call(net, &afs_RXFSExtendLock, 4 * 4, 6 * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSExtendLock, 4 * 4, 6 * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->lvnode = vnode; - call->out_scb = scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSEXTENDLOCK); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * release a lock on a file */ -int afs_fs_release_lock(struct afs_operation *fc, struct afs_status_cb *scb) +void afs_fs_release_lock(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return 
yfs_fs_release_lock(fc, scb); - _enter(""); - call = afs_alloc_flat_call(net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->lvnode = vnode; - call->out_scb = scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSRELEASELOCK); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1906,86 +1759,12 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, return true; } -/* - * Deliver reply data to an FS.FetchStatus with no vnode. - */ -static int afs_deliver_fs_fetch_status(struct afs_call *call) -{ - const __be32 *bp; - int ret; - - ret = afs_transfer_reply(call); - if (ret < 0) - return ret; - - /* unmarshall the reply once we've received all of it */ - bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_AFSCallBack(&bp, call, call->out_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); - - _leave(" = 0 [done]"); - return 0; -} - -/* - * FS.FetchStatus operation type - */ -static const struct afs_call_type afs_RXFSFetchStatus = { - .name = "FS.FetchStatus", - .op = afs_FS_FetchStatus, - .deliver = afs_deliver_fs_fetch_status, - .destructor = afs_flat_call_destructor, -}; - -/* - * Fetch the status information for a fid without needing a vnode handle. 
- */ -int afs_fs_fetch_status(struct afs_operation *fc, - struct afs_net *net, - struct afs_fid *fid, - struct afs_status_cb *scb, - struct afs_volsync *volsync) -{ - struct afs_call *call; - __be32 *bp; - - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_fetch_status(fc, net, fid, scb, volsync); - - _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), fid->vid, fid->vnode); - - call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); - if (!call) { - fc->ac.error = -ENOMEM; - return -ENOMEM; - } - - call->key = fc->key; - call->out_fid = fid; - call->out_scb = scb; - call->out_volsync = volsync; - - /* marshall the parameters */ - bp = call->request; - bp[0] = htonl(FSFETCHSTATUS); - bp[1] = htonl(fid->vid); - bp[2] = htonl(fid->vnode); - bp[3] = htonl(fid->unique); - - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); -} - /* * Deliver reply data to an FS.InlineBulkStatus call */ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) { + struct afs_operation *op = call->op; struct afs_status_cb *scb; const __be32 *bp; u32 tmp; @@ -2007,8 +1786,8 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) return ret; tmp = ntohl(call->tmp); - _debug("status count: %u/%u", tmp, call->count2); - if (tmp != call->count2) + _debug("status count: %u/%u", tmp, op->nr_files); + if (tmp != op->nr_files) return afs_protocol_error(call, afs_eproto_ibulkst_count); call->count = 0; @@ -2023,11 +1802,23 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) if (ret < 0) return ret; + switch (call->count) { + case 0: + scb = &op->file[0].scb; + break; + case 1: + scb = &op->file[1].scb; + break; + default: + scb = &op->more_files[call->count - 2].scb; + break; + } + bp = call->buffer; - scb = &call->out_scb[call->count]; 
xdr_decode_AFSFetchStatus(&bp, call, scb); + call->count++; - if (call->count < call->count2) + if (call->count < op->nr_files) goto more_counts; call->count = 0; @@ -2044,7 +1835,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) tmp = ntohl(call->tmp); _debug("CB count: %u", tmp); - if (tmp != call->count2) + if (tmp != op->nr_files) return afs_protocol_error(call, afs_eproto_ibulkst_cb_count); call->count = 0; call->unmarshall++; @@ -2059,11 +1850,22 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) return ret; _debug("unmarshall CB array"); + switch (call->count) { + case 0: + scb = &op->file[0].scb; + break; + case 1: + scb = &op->file[1].scb; + break; + default: + scb = &op->more_files[call->count - 2].scb; + break; + } + bp = call->buffer; - scb = &call->out_scb[call->count]; xdr_decode_AFSCallBack(&bp, call, scb); call->count++; - if (call->count < call->count2) + if (call->count < op->nr_files) goto more_cbs; afs_extract_to_buf(call, 6 * sizeof(__be32)); @@ -2076,7 +1878,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSVolSync(&bp, call->out_volsync); + xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; @@ -2088,6 +1890,13 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) return 0; } +static void afs_done_fs_inline_bulk_status(struct afs_call *call) +{ + if (call->error == -ECONNABORTED && + call->abort_code == RX_INVALID_OPERATION) + set_bit(AFS_SERVER_FL_NO_IBULK, &call->server->flags); +} + /* * FS.InlineBulkStatus operation type */ @@ -2095,58 +1904,53 @@ static const struct afs_call_type afs_RXFSInlineBulkStatus = { .name = "FS.InlineBulkStatus", .op = afs_FS_InlineBulkStatus, .deliver = afs_deliver_fs_inline_bulk_status, + .done = afs_done_fs_inline_bulk_status, .destructor = afs_flat_call_destructor, }; /* * Fetch the status information for up to 50 files */ -int afs_fs_inline_bulk_status(struct 
afs_operation *fc, - struct afs_net *net, - struct afs_fid *fids, - struct afs_status_cb *statuses, - unsigned int nr_fids, - struct afs_volsync *volsync) +void afs_fs_inline_bulk_status(struct afs_operation *op) { + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; struct afs_call *call; __be32 *bp; int i; - if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) - return yfs_fs_inline_bulk_status(fc, net, fids, statuses, - nr_fids, volsync); - - _enter(",%x,{%llx:%llu},%u", - key_serial(fc->key), fids[0].vid, fids[1].vnode, nr_fids); - - call = afs_alloc_flat_call(net, &afs_RXFSInlineBulkStatus, - (2 + nr_fids * 3) * 4, - 21 * 4); - if (!call) { - fc->ac.error = -ENOMEM; - return -ENOMEM; + if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->cbi->server->flags)) { + op->error = -ENOTSUPP; + return; } - call->key = fc->key; - call->out_scb = statuses; - call->out_volsync = volsync; - call->count2 = nr_fids; + _enter(",%x,{%llx:%llu},%u", + key_serial(op->key), vp->fid.vid, vp->fid.vnode, op->nr_files); + + call = afs_alloc_flat_call(op->net, &afs_RXFSInlineBulkStatus, + (2 + op->nr_files * 3) * 4, + 21 * 4); + if (!call) + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSINLINEBULKSTATUS); - *bp++ = htonl(nr_fids); - for (i = 0; i < nr_fids; i++) { - *bp++ = htonl(fids[i].vid); - *bp++ = htonl(fids[i].vnode); - *bp++ = htonl(fids[i].unique); + *bp++ = htonl(op->nr_files); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); + for (i = 0; i < op->nr_files - 2; i++) { + *bp++ = htonl(op->more_files[i].fid.vid); + *bp++ = htonl(op->more_files[i].fid.vnode); + *bp++ = htonl(op->more_files[i].fid.unique); } - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &fids[0]); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - 
return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -2154,6 +1958,8 @@ int afs_fs_inline_bulk_status(struct afs_operation *fc, */ static int afs_deliver_fs_fetch_acl(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[0]; struct afs_acl *acl; const __be32 *bp; unsigned int size; @@ -2179,7 +1985,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL); if (!acl) return -ENOMEM; - call->ret_acl = acl; + op->acl = acl; acl->size = call->count2; afs_extract_begin(call, acl->data, size); call->unmarshall++; @@ -2202,8 +2008,8 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; @@ -2215,12 +2021,6 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) return 0; } -static void afs_destroy_fs_fetch_acl(struct afs_call *call) -{ - kfree(call->ret_acl); - afs_flat_call_destructor(call); -} - /* * FS.FetchACL operation type */ @@ -2228,66 +2028,33 @@ static const struct afs_call_type afs_RXFSFetchACL = { .name = "FS.FetchACL", .op = afs_FS_FetchACL, .deliver = afs_deliver_fs_fetch_acl, - .destructor = afs_destroy_fs_fetch_acl, }; /* * Fetch the ACL for a file. 
*/ -struct afs_acl *afs_fs_fetch_acl(struct afs_operation *fc, - struct afs_status_cb *scb) +void afs_fs_fetch_acl(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(net, &afs_RXFSFetchACL, 16, (21 + 6) * 4); - if (!call) { - fc->ac.error = -ENOMEM; - return ERR_PTR(-ENOMEM); - } - - call->key = fc->key; - call->ret_acl = NULL; - call->out_scb = scb; - call->out_volsync = NULL; + call = afs_alloc_flat_call(op->net, &afs_RXFSFetchACL, 16, (21 + 6) * 4); + if (!call) + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHACL); - bp[1] = htonl(vnode->fid.vid); - bp[2] = htonl(vnode->fid.vnode); - bp[3] = htonl(vnode->fid.unique); + bp[1] = htonl(vp->fid.vid); + bp[2] = htonl(vp->fid.vnode); + bp[3] = htonl(vp->fid.unique); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_make_call(&fc->ac, call, GFP_KERNEL); - return (struct afs_acl *)afs_wait_for_call_to_complete(call, &fc->ac); -} - -/* - * Deliver reply data to any operation that returns file status and volume - * sync. - */ -static int afs_deliver_fs_file_status_and_vol(struct afs_call *call) -{ - const __be32 *bp; - int ret; - - ret = afs_transfer_reply(call); - if (ret < 0) - return ret; - - bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_AFSVolSync(&bp, call->out_volsync); - - _leave(" = 0 [done]"); - return 0; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_KERNEL); } /* @@ -2303,42 +2070,34 @@ static const struct afs_call_type afs_RXFSStoreACL = { /* * Fetch the ACL for a file. 
*/ -int afs_fs_store_acl(struct afs_operation *fc, const struct afs_acl *acl, - struct afs_status_cb *scb) +void afs_fs_store_acl(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); + const struct afs_acl *acl = op->acl; size_t size; __be32 *bp; _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); size = round_up(acl->size, 4); - call = afs_alloc_flat_call(net, &afs_RXFSStoreACL, + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreACL, 5 * 4 + size, (21 + 6) * 4); - if (!call) { - fc->ac.error = -ENOMEM; - return -ENOMEM; - } - - call->key = fc->key; - call->out_scb = scb; - call->out_volsync = NULL; + if (!call) + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSSTOREACL); - bp[1] = htonl(vnode->fid.vid); - bp[2] = htonl(vnode->fid.vnode); - bp[3] = htonl(vnode->fid.unique); + bp[1] = htonl(vp->fid.vid); + bp[2] = htonl(vp->fid.vnode); + bp[3] = htonl(vp->fid.unique); bp[4] = htonl(acl->size); memcpy(&bp[5], acl->data, acl->size); if (acl->size != size) memset((void *)&bp[5] + acl->size, 0, size - acl->size); - trace_afs_make_fs_call(call, &vnode->fid); - afs_make_call(&fc->ac, call, GFP_KERNEL); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_KERNEL); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index d2dbb3aef611..94675acb6a3a 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -67,16 +67,19 @@ static void afs_set_i_size(struct afs_vnode *vnode, u64 size) /* * Initialise an inode from the vnode status. 
*/ -static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key, - struct afs_cb_interest *cbi, - struct afs_vnode *parent_vnode, - struct afs_status_cb *scb) +static int afs_inode_init_from_status(struct afs_operation *op, + struct afs_vnode_param *vp, + struct afs_vnode *vnode) { struct afs_cb_interest *old_cbi = NULL; - struct afs_file_status *status = &scb->status; + struct afs_file_status *status = &vp->scb.status; struct inode *inode = AFS_VNODE_TO_I(vnode); struct timespec64 t; + _enter("{%llx:%llu.%u} %s", + vp->fid.vid, vp->fid.vnode, vp->fid.unique, + op->type ? op->type->name : "???"); + _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu", status->type, status->nlink, @@ -86,12 +89,15 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key, write_seqlock(&vnode->cb_lock); + vnode->cb_v_break = op->cb_v_break; + vnode->cb_s_break = op->cb_s_break; vnode->status = *status; t = status->mtime_client; inode->i_ctime = t; inode->i_mtime = t; inode->i_atime = t; + inode->i_flags |= S_NOATIME; inode->i_uid = make_kuid(&init_user_ns, status->owner); inode->i_gid = make_kgid(&init_user_ns, status->group); set_nlink(&vnode->vfs_inode, status->nlink); @@ -128,7 +134,7 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key, inode_nohighmem(inode); break; default: - dump_vnode(vnode, parent_vnode); + dump_vnode(vnode, op->file[0].vnode != vnode ? 
op->file[0].vnode : NULL); write_sequnlock(&vnode->cb_lock); return afs_protocol_error(NULL, afs_eproto_file_type); } @@ -138,16 +144,17 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key, vnode->invalid_before = status->data_version; inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); - if (!scb->have_cb) { + if (!vp->scb.have_cb) { /* it's a symlink we just created (the fileserver * didn't give us a callback) */ vnode->cb_expires_at = ktime_get_real_seconds(); } else { - vnode->cb_expires_at = scb->callback.expires_at; + vnode->cb_expires_at = vp->scb.callback.expires_at; old_cbi = rcu_dereference_protected(vnode->cb_interest, lockdep_is_held(&vnode->cb_lock.lock)); - if (cbi != old_cbi) - rcu_assign_pointer(vnode->cb_interest, afs_get_cb_interest(cbi)); + if (op->cbi != old_cbi) + rcu_assign_pointer(vnode->cb_interest, + afs_get_cb_interest(op->cbi)); else old_cbi = NULL; set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); @@ -161,16 +168,19 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key, /* * Update the core inode struct from a returned status record. */ -static void afs_apply_status(struct afs_operation *fc, - struct afs_vnode *vnode, - struct afs_status_cb *scb, - const afs_dataversion_t *expected_version) +static void afs_apply_status(struct afs_operation *op, + struct afs_vnode_param *vp) { - struct afs_file_status *status = &scb->status; + struct afs_file_status *status = &vp->scb.status; + struct afs_vnode *vnode = vp->vnode; struct timespec64 t; umode_t mode; bool data_changed = false; + _enter("{%llx:%llu.%u} %s", + vp->fid.vid, vp->fid.vnode, vp->fid.unique, + op->type ? 
op->type->name : "???"); + BUG_ON(test_bit(AFS_VNODE_UNSET, &vnode->flags)); if (status->type != vnode->status.type) { @@ -209,14 +219,13 @@ static void afs_apply_status(struct afs_operation *fc, vnode->status = *status; - if (expected_version && - *expected_version != status->data_version) { + if (vp->dv_before + vp->dv_delta != status->data_version) { if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s\n", vnode->fid.vid, vnode->fid.vnode, - (unsigned long long)*expected_version, + (unsigned long long)vp->dv_before + vp->dv_delta, (unsigned long long)status->data_version, - fc->type ? fc->type->name : "???"); + op->type ? op->type->name : "???"); vnode->invalid_before = status->data_version; if (vnode->status.type == AFS_FTYPE_DIR) { @@ -243,20 +252,19 @@ static void afs_apply_status(struct afs_operation *fc, /* * Apply a callback to a vnode. */ -static void afs_apply_callback(struct afs_operation *fc, - struct afs_vnode *vnode, - struct afs_status_cb *scb, - unsigned int cb_break) +static void afs_apply_callback(struct afs_operation *op, + struct afs_vnode_param *vp) { struct afs_cb_interest *old; - struct afs_callback *cb = &scb->callback; + struct afs_callback *cb = &vp->scb.callback; + struct afs_vnode *vnode = vp->vnode; - if (!afs_cb_is_broken(cb_break, vnode, fc->cbi)) { + if (!afs_cb_is_broken(vp->cb_break_before, vnode, op->cbi)) { vnode->cb_expires_at = cb->expires_at; old = rcu_dereference_protected(vnode->cb_interest, lockdep_is_held(&vnode->cb_lock.lock)); - if (old != fc->cbi) { - rcu_assign_pointer(vnode->cb_interest, afs_get_cb_interest(fc->cbi)); + if (old != op->cbi) { + rcu_assign_pointer(vnode->cb_interest, afs_get_cb_interest(op->cbi)); afs_put_cb_interest(afs_v2net(vnode), old); } set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); @@ -267,106 +275,108 @@ static void afs_apply_callback(struct afs_operation *fc, * Apply the received status and callback to an inode all in the same critical * 
section to avoid races with afs_validate(). */ -void afs_vnode_commit_status(struct afs_operation *fc, - struct afs_vnode *vnode, - unsigned int cb_break, - const afs_dataversion_t *expected_version, - struct afs_status_cb *scb) +void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *vp) { - if (fc->ac.error != 0) - return; + struct afs_vnode *vnode = vp->vnode; + + _enter(""); + + ASSERTCMP(op->error, ==, 0); write_seqlock(&vnode->cb_lock); - if (scb->have_error) { - if (scb->status.abort_code == VNOVNODE) { + if (vp->scb.have_error) { + if (vp->scb.status.abort_code == VNOVNODE) { set_bit(AFS_VNODE_DELETED, &vnode->flags); clear_nlink(&vnode->vfs_inode); __afs_break_callback(vnode, afs_cb_break_for_deleted); } } else { - if (scb->have_status) - afs_apply_status(fc, vnode, scb, expected_version); - if (scb->have_cb) - afs_apply_callback(fc, vnode, scb, cb_break); + if (vp->scb.have_status) + afs_apply_status(op, vp); + if (vp->scb.have_cb) + afs_apply_callback(op, vp); } write_sequnlock(&vnode->cb_lock); - if (fc->ac.error == 0 && scb->have_status) - afs_cache_permit(vnode, fc->key, cb_break, scb); + if (op->error == 0 && vp->scb.have_status) + afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb); } +static void afs_fetch_status_success(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_vnode *vnode = vp->vnode; + int ret; + + if (vnode->vfs_inode.i_state & I_NEW) { + ret = afs_inode_init_from_status(op, vp, vnode); + op->error = ret; + if (ret == 0) + afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb); + } else { + afs_vnode_commit_status(op, vp); + } +} + +static const struct afs_operation_ops afs_fetch_status_operation = { + .issue_afs_rpc = afs_fs_fetch_status, + .issue_yfs_rpc = yfs_fs_fetch_status, + .success = afs_fetch_status_success, +}; + /* * Fetch file status from the volume. 
*/ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool is_new, afs_access_t *_caller_access) { - struct afs_status_cb *scb; - struct afs_operation fc; - int ret; + struct afs_operation *op; _enter("%s,{%llx:%llu.%u,S=%lx}", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, vnode->flags); - scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - return -ENOMEM; + op = afs_alloc_operation(key, vnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, key, true)) { - afs_dataversion_t data_version = vnode->status.data_version; + afs_op_set_vnode(op, 0, vnode); - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - afs_fs_fetch_file_status(&fc, scb, NULL); - } + op->nr_files = 1; + op->ops = &afs_fetch_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); - if (fc.error) { - /* Do nothing. */ - } else if (is_new) { - ret = afs_inode_init_from_status(vnode, key, fc.cbi, - NULL, scb); - fc.error = ret; - if (ret == 0) - afs_cache_permit(vnode, key, fc.cb_break, scb); - } else { - afs_vnode_commit_status(&fc, vnode, fc.cb_break, - &data_version, scb); - } - afs_check_for_remote_deletion(&fc, vnode); - ret = afs_end_vnode_operation(&fc); - } + if (_caller_access) + *_caller_access = op->file[0].scb.status.caller_access; + return afs_put_operation(op); +} - if (ret == 0 && _caller_access) - *_caller_access = scb->status.caller_access; - kfree(scb); - _leave(" = %d", ret); - return ret; +/* + * ilookup() comparator + */ +int afs_ilookup5_test_by_fid(struct inode *inode, void *opaque) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_fid *fid = opaque; + + return (fid->vnode == vnode->fid.vnode && + fid->vnode_hi == vnode->fid.vnode_hi && + fid->unique == vnode->fid.unique); } /* * iget5() comparator */ -int afs_iget5_test(struct inode *inode, void *opaque) +static int 
afs_iget5_test(struct inode *inode, void *opaque) { - struct afs_iget_data *iget_data = opaque; - struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_vnode_param *vp = opaque; + //struct afs_vnode *vnode = AFS_FS_I(inode); - return memcmp(&vnode->fid, &iget_data->fid, sizeof(iget_data->fid)) == 0; -} - -/* - * iget5() comparator for inode created by autocell operations - * - * These pseudo inodes don't match anything. - */ -static int afs_iget5_pseudo_dir_test(struct inode *inode, void *opaque) -{ - return 0; + return afs_ilookup5_test_by_fid(inode, &vp->fid); } /* @@ -374,98 +384,21 @@ static int afs_iget5_pseudo_dir_test(struct inode *inode, void *opaque) */ static int afs_iget5_set(struct inode *inode, void *opaque) { - struct afs_iget_data *iget_data = opaque; + struct afs_vnode_param *vp = opaque; + struct afs_super_info *as = AFS_FS_S(inode->i_sb); struct afs_vnode *vnode = AFS_FS_I(inode); - vnode->fid = iget_data->fid; - vnode->volume = iget_data->volume; - vnode->cb_v_break = iget_data->cb_v_break; - vnode->cb_s_break = iget_data->cb_s_break; + vnode->volume = as->volume; + vnode->fid = vp->fid; /* YFS supports 96-bit vnode IDs, but Linux only supports * 64-bit inode numbers. */ - inode->i_ino = iget_data->fid.vnode; - inode->i_generation = iget_data->fid.unique; + inode->i_ino = vnode->fid.vnode; + inode->i_generation = vnode->fid.unique; return 0; } -/* - * Create an inode for a dynamic root directory or an autocell dynamic - * automount dir. 
- */ -struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) -{ - struct afs_super_info *as; - struct afs_vnode *vnode; - struct inode *inode; - static atomic_t afs_autocell_ino; - - struct afs_iget_data iget_data = { - .cb_v_break = 0, - .cb_s_break = 0, - }; - - _enter(""); - - as = sb->s_fs_info; - if (as->volume) { - iget_data.volume = as->volume; - iget_data.fid.vid = as->volume->vid; - } - if (root) { - iget_data.fid.vnode = 1; - iget_data.fid.unique = 1; - } else { - iget_data.fid.vnode = atomic_inc_return(&afs_autocell_ino); - iget_data.fid.unique = 0; - } - - inode = iget5_locked(sb, iget_data.fid.vnode, - afs_iget5_pseudo_dir_test, afs_iget5_set, - &iget_data); - if (!inode) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } - - _debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }", - inode, inode->i_ino, iget_data.fid.vid, iget_data.fid.vnode, - iget_data.fid.unique); - - vnode = AFS_FS_I(inode); - - /* there shouldn't be an existing inode */ - BUG_ON(!(inode->i_state & I_NEW)); - - inode->i_size = 0; - inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - if (root) { - inode->i_op = &afs_dynroot_inode_operations; - inode->i_fop = &simple_dir_operations; - } else { - inode->i_op = &afs_autocell_inode_operations; - } - set_nlink(inode, 2); - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode); - inode->i_blocks = 0; - inode_set_iversion_raw(inode, 0); - inode->i_generation = 0; - - set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); - if (!root) { - set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); - inode->i_flags |= S_AUTOMOUNT; - } - - inode->i_flags |= S_NOATIME; - unlock_new_inode(inode); - _leave(" = %p", inode); - return inode; -} - /* * Get a cache cookie for an inode. 
*/ @@ -501,58 +434,41 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) /* * inode retrieval */ -struct inode *afs_iget(struct super_block *sb, struct key *key, - struct afs_iget_data *iget_data, - struct afs_status_cb *scb, - struct afs_cb_interest *cbi, - struct afs_vnode *parent_vnode) +struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp) { - struct afs_super_info *as; + struct afs_vnode_param *dvp = &op->file[0]; + struct super_block *sb = dvp->vnode->vfs_inode.i_sb; struct afs_vnode *vnode; - struct afs_fid *fid = &iget_data->fid; struct inode *inode; int ret; - _enter(",{%llx:%llu.%u},,", fid->vid, fid->vnode, fid->unique); + _enter(",{%llx:%llu.%u},,", vp->fid.vid, vp->fid.vnode, vp->fid.unique); - as = sb->s_fs_info; - iget_data->volume = as->volume; - - inode = iget5_locked(sb, fid->vnode, afs_iget5_test, afs_iget5_set, - iget_data); + inode = iget5_locked(sb, vp->fid.vnode, afs_iget5_test, afs_iget5_set, vp); if (!inode) { _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } - _debug("GOT INODE %p { vl=%llx vn=%llx, u=%x }", - inode, fid->vid, fid->vnode, fid->unique); - vnode = AFS_FS_I(inode); + _debug("GOT INODE %p { vl=%llx vn=%llx, u=%x }", + inode, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + /* deal with an existing inode */ if (!(inode->i_state & I_NEW)) { _leave(" = %p", inode); return inode; } - if (!scb) { - /* it's a remotely extant inode */ - ret = afs_fetch_status(vnode, key, true, NULL); - if (ret < 0) - goto bad_inode; - } else { - ret = afs_inode_init_from_status(vnode, key, cbi, parent_vnode, - scb); - if (ret < 0) - goto bad_inode; - } + ret = afs_inode_init_from_status(op, vp, vnode); + if (ret < 0) + goto bad_inode; afs_get_inode_cache(vnode); /* success */ clear_bit(AFS_VNODE_UNSET, &vnode->flags); - inode->i_flags |= S_NOATIME; unlock_new_inode(inode); _leave(" = %p", inode); return inode; @@ -564,6 +480,74 @@ bad_inode: return ERR_PTR(ret); } +static int afs_iget5_set_root(struct inode 
*inode, void *opaque) +{ + struct afs_super_info *as = AFS_FS_S(inode->i_sb); + struct afs_vnode *vnode = AFS_FS_I(inode); + + vnode->volume = as->volume; + vnode->fid.vid = as->volume->vid, + vnode->fid.vnode = 1; + vnode->fid.unique = 1; + inode->i_ino = 1; + inode->i_generation = 1; + return 0; +} + +/* + * Set up the root inode for a volume. This is always vnode 1, unique 1 within + * the volume. + */ +struct inode *afs_root_iget(struct super_block *sb, struct key *key) +{ + struct afs_super_info *as = AFS_FS_S(sb); + struct afs_operation *op; + struct afs_vnode *vnode; + struct inode *inode; + int ret; + + _enter(",{%llx},,", as->volume->vid); + + inode = iget5_locked(sb, 1, NULL, afs_iget5_set_root, NULL); + if (!inode) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + _debug("GOT ROOT INODE %p { vl=%llx }", inode, as->volume->vid); + + BUG_ON(!(inode->i_state & I_NEW)); + + vnode = AFS_FS_I(inode); + vnode->cb_v_break = as->volume->cb_v_break, + + op = afs_alloc_operation(key, as->volume); + if (IS_ERR(op)) { + ret = PTR_ERR(op); + goto error; + } + + afs_op_set_vnode(op, 0, vnode); + + op->nr_files = 1; + op->ops = &afs_fetch_status_operation; + ret = afs_do_sync_operation(op); + if (ret < 0) + goto error; + + afs_get_inode_cache(vnode); + + clear_bit(AFS_VNODE_UNSET, &vnode->flags); + unlock_new_inode(inode); + _leave(" = %p", inode); + return inode; + +error: + iget_failed(inode); + _leave(" = %d [bad]", ret); + return ERR_PTR(ret); +} + /* * mark the data attached to an inode as obsolete due to a write on the server * - might also want to ditch all the outstanding writes and dirty pages @@ -808,16 +792,24 @@ void afs_evict_inode(struct inode *inode) _leave(""); } +static void afs_setattr_success(struct afs_operation *op) +{ + afs_vnode_commit_status(op, &op->file[0]); +} + +static const struct afs_operation_ops afs_setattr_operation = { + .issue_afs_rpc = afs_fs_setattr, + .issue_yfs_rpc = yfs_fs_setattr, + .success = afs_setattr_success, +}; 
+ /* * set the attributes of an inode */ int afs_setattr(struct dentry *dentry, struct iattr *attr) { - struct afs_operation fc; - struct afs_status_cb *scb; + struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - struct key *key; - int ret = -ENOMEM; _enter("{%llx:%llu},{n=%pd},%x", vnode->fid.vid, vnode->fid.vnode, dentry, @@ -829,48 +821,22 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) return 0; } - scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL); - if (!scb) - goto error; - /* flush any dirty data outstanding on a regular file */ if (S_ISREG(vnode->vfs_inode.i_mode)) filemap_write_and_wait(vnode->vfs_inode.i_mapping); - if (attr->ia_valid & ATTR_FILE) { - key = afs_file_key(attr->ia_file); - } else { - key = afs_request_key(vnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_scb; - } - } + op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ? + afs_file_key(attr->ia_file) : NULL), + vnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, key, false)) { - afs_dataversion_t data_version = vnode->status.data_version; + afs_op_set_vnode(op, 0, vnode); + op->setattr.attr = attr; - if (attr->ia_valid & ATTR_SIZE) - data_version++; + if (attr->ia_valid & ATTR_SIZE) + op->file[0].dv_delta = 1; - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - afs_fs_setattr(&fc, attr, scb); - } - - afs_check_for_remote_deletion(&fc, vnode); - afs_vnode_commit_status(&fc, vnode, fc.cb_break, - &data_version, scb); - ret = afs_end_vnode_operation(&fc); - } - - if (!(attr->ia_valid & ATTR_FILE)) - key_put(key); - -error_scb: - kfree(scb); -error: - _leave(" = %d", ret); - return ret; + op->ops = &afs_setattr_operation; + return afs_do_sync_operation(op); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 0551dedb0371..4b8ac049fc17 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -59,13 
+59,6 @@ struct afs_fs_context { struct key *key; /* key to use for secure mounting */ }; -struct afs_iget_data { - struct afs_fid fid; - struct afs_volume *volume; /* volume on which resides */ - unsigned int cb_v_break; /* Pre-fetch volume break count */ - unsigned int cb_s_break; /* Pre-fetch server break count */ -}; - enum afs_call_state { AFS_CALL_CL_REQUESTING, /* Client: Request is being sent */ AFS_CALL_CL_AWAIT_REPLY, /* Client: Awaiting reply */ @@ -111,9 +104,7 @@ struct afs_call { struct afs_server *server; /* The fileserver record if fs op (pins ref) */ struct afs_vlserver *vlserver; /* The vlserver record if vl op */ struct afs_cb_interest *cbi; /* Callback interest for server used */ - struct afs_vnode *lvnode; /* vnode being locked */ void *request; /* request data (first part) */ - struct address_space *mapping; /* Pages being written from */ struct iov_iter def_iter; /* Default buffer/data iterator */ struct iov_iter *iter; /* Iterator currently in use */ union { /* Convenience for ->def_iter */ @@ -125,18 +116,9 @@ struct afs_call { long ret0; /* Value to reply with instead of 0 */ struct afs_addr_list *ret_alist; struct afs_vldb_entry *ret_vldb; - struct afs_acl *ret_acl; }; - struct afs_fid *out_fid; - struct afs_status_cb *out_dir_scb; - struct afs_status_cb *out_scb; - struct yfs_acl *out_yacl; - struct afs_volsync *out_volsync; - struct afs_volume_status *out_volstatus; - struct afs_read *read_request; + struct afs_operation *op; unsigned int server_index; - pgoff_t first; /* first page in mapping to deal with */ - pgoff_t last; /* last page in mapping to deal with */ atomic_t usage; enum afs_call_state state; spinlock_t state_lock; @@ -146,11 +128,7 @@ struct afs_call { unsigned int max_lifespan; /* Maximum lifespan to set if not 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ - unsigned first_offset; /* offset into mapping[first] */ - union { - unsigned last_to; /* amount of 
mapping[last] */ - unsigned count2; /* count used in unmarshalling */ - }; + unsigned count2; /* count used in unmarshalling */ unsigned char unmarshall; /* unmarshalling phase */ unsigned char addr_ix; /* Address in ->alist */ bool drop_ref; /* T if need to drop ref for incoming call */ @@ -570,10 +548,7 @@ struct afs_cb_interest { struct afs_vol_interest *vol_interest; struct afs_server *server; /* Server on which this interest resides */ struct super_block *sb; /* Superblock on which inodes reside */ - union { - struct rcu_head rcu; - afs_volid_t vid; /* Volume ID to match */ - }; + struct rcu_head rcu; refcount_t usage; }; @@ -764,29 +739,116 @@ struct afs_vl_cursor { }; /* - * Cursor for iterating over a set of fileservers. + * Fileserver operation methods. + */ +struct afs_operation_ops { + void (*issue_afs_rpc)(struct afs_operation *op); + void (*issue_yfs_rpc)(struct afs_operation *op); + void (*success)(struct afs_operation *op); + void (*aborted)(struct afs_operation *op); + void (*edit_dir)(struct afs_operation *op); + void (*put)(struct afs_operation *op); +}; + +struct afs_vnode_param { + struct afs_vnode *vnode; + struct afs_fid fid; /* Fid to access */ + struct afs_status_cb scb; /* Returned status and callback promise */ + afs_dataversion_t dv_before; /* Data version before the call */ + unsigned int cb_break_before; /* cb_break + cb_s_break before the call */ + u8 dv_delta; /* Expected change in data version */ + bool put_vnode; /* T if we have a ref on the vnode */ + bool need_io_lock; /* T if we need the I/O lock on this */ +}; + +/* + * Fileserver operation wrapper, handling server and address rotation + * asynchronously. May make simultaneous calls to multiple servers. 
*/ struct afs_operation { + struct afs_net *net; /* Network namespace */ + struct key *key; /* Key for the cell */ const struct afs_call_type *type; /* Type of call done */ + const struct afs_operation_ops *ops; + + /* Parameters/results for the operation */ + struct afs_volume *volume; /* Volume being accessed */ + struct afs_vnode_param file[2]; + struct afs_vnode_param *more_files; + struct afs_volsync volsync; + struct dentry *dentry; /* Dentry to be altered */ + struct dentry *dentry_2; /* Second dentry to be altered */ + struct timespec64 mtime; /* Modification time to record */ + short nr_files; /* Number of entries in file[], more_files */ + short error; + unsigned int abort_code; + unsigned int debug_id; + + unsigned int cb_v_break; /* Volume break counter before op */ + unsigned int cb_s_break; /* Server break counter before op */ + + union { + struct { + int which; /* Which ->file[] to fetch for */ + } fetch_status; + struct { + int reason; /* enum afs_edit_dir_reason */ + mode_t mode; + const char *symlink; + } create; + struct { + bool need_rehash; + } unlink; + struct { + struct dentry *rehash; + struct dentry *tmp; + bool new_negative; + } rename; + struct { + struct afs_read *req; + } fetch; + struct { + struct afs_vnode *lvnode; /* vnode being locked */ + afs_lock_type_t type; + } lock; + struct { + struct address_space *mapping; /* Pages being written from */ + pgoff_t first; /* first page in mapping to deal with */ + pgoff_t last; /* last page in mapping to deal with */ + unsigned first_offset; /* offset into mapping[first] */ + unsigned last_to; /* amount of mapping[last] */ + } store; + struct { + struct iattr *attr; + } setattr; + struct afs_acl *acl; + struct yfs_acl *yacl; + struct { + struct afs_volume_status vs; + struct kstatfs *buf; + } volstatus; + }; + + /* Fileserver iteration state */ struct afs_addr_cursor ac; - struct afs_vnode *vnode; struct afs_server_list *server_list; /* Current server list (pins ref) */ struct afs_cb_interest 
*cbi; /* Server on which this resides (pins ref) */ - struct key *key; /* Key for the server */ + struct afs_call *call; unsigned long untried; /* Bitmask of untried servers */ - unsigned int cb_break; /* cb_break + cb_s_break before the call */ - unsigned int cb_break_2; /* cb_break + cb_s_break (2nd vnode) */ short index; /* Current server */ - short error; - unsigned short flags; + unsigned short nr_iterations; /* Number of server iterations */ + + unsigned int flags; #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ #define AFS_OPERATION_VBUSY 0x0002 /* Set if seen VBUSY */ #define AFS_OPERATION_VMOVED 0x0004 /* Set if seen VMOVED */ #define AFS_OPERATION_VNOVOL 0x0008 /* Set if seen VNOVOL */ #define AFS_OPERATION_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ #define AFS_OPERATION_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ -#define AFS_OPERATION_INTR 0x0040 /* Set if op is interruptible */ - unsigned short nr_iterations; /* Number of server iterations */ +#define AFS_OPERATION_UNINTR 0x0040 /* Set if op is uninterruptible */ +#define AFS_OPERATION_DOWNGRADE 0x0080 /* Set to retry with downgraded opcode */ +#define AFS_OPERATION_LOCK_0 0x0100 /* Set if have io_lock on file[0] */ +#define AFS_OPERATION_LOCK_1 0x0200 /* Set if have io_lock on file[1] */ }; /* @@ -958,46 +1020,61 @@ extern int afs_flock(struct file *, int, struct file_lock *); /* * fsclient.c */ -extern int afs_fs_fetch_file_status(struct afs_operation *, struct afs_status_cb *, - struct afs_volsync *); -extern int afs_fs_fetch_data(struct afs_operation *, struct afs_status_cb *, struct afs_read *); -extern int afs_fs_create(struct afs_operation *, const char *, umode_t, - struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *); -extern int afs_fs_remove(struct afs_operation *, struct afs_vnode *, const char *, bool, - struct afs_status_cb *); -extern int afs_fs_link(struct afs_operation *, struct afs_vnode *, const char *, - 
struct afs_status_cb *, struct afs_status_cb *); -extern int afs_fs_symlink(struct afs_operation *, const char *, const char *, - struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *); -extern int afs_fs_rename(struct afs_operation *, const char *, - struct afs_vnode *, const char *, - struct afs_status_cb *, struct afs_status_cb *); -extern int afs_fs_store_data(struct afs_operation *, struct address_space *, - pgoff_t, pgoff_t, unsigned, unsigned, struct afs_status_cb *); -extern int afs_fs_setattr(struct afs_operation *, struct iattr *, struct afs_status_cb *); -extern int afs_fs_get_volume_status(struct afs_operation *, struct afs_volume_status *); -extern int afs_fs_set_lock(struct afs_operation *, afs_lock_type_t, struct afs_status_cb *); -extern int afs_fs_extend_lock(struct afs_operation *, struct afs_status_cb *); -extern int afs_fs_release_lock(struct afs_operation *, struct afs_status_cb *); +extern void afs_fs_fetch_status(struct afs_operation *); +extern void afs_fs_fetch_data(struct afs_operation *); +extern void afs_fs_create_file(struct afs_operation *); +extern void afs_fs_make_dir(struct afs_operation *); +extern void afs_fs_remove_file(struct afs_operation *); +extern void afs_fs_remove_dir(struct afs_operation *); +extern void afs_fs_link(struct afs_operation *); +extern void afs_fs_symlink(struct afs_operation *); +extern void afs_fs_rename(struct afs_operation *); +extern void afs_fs_store_data(struct afs_operation *); +extern void afs_fs_setattr(struct afs_operation *); +extern void afs_fs_get_volume_status(struct afs_operation *); +extern void afs_fs_set_lock(struct afs_operation *); +extern void afs_fs_extend_lock(struct afs_operation *); +extern void afs_fs_release_lock(struct afs_operation *); extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); extern bool afs_fs_get_capabilities(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); 
-extern int afs_fs_inline_bulk_status(struct afs_operation *, struct afs_net *, - struct afs_fid *, struct afs_status_cb *, - unsigned int, struct afs_volsync *); -extern int afs_fs_fetch_status(struct afs_operation *, struct afs_net *, - struct afs_fid *, struct afs_status_cb *, - struct afs_volsync *); +extern void afs_fs_inline_bulk_status(struct afs_operation *); struct afs_acl { u32 size; u8 data[]; }; -extern struct afs_acl *afs_fs_fetch_acl(struct afs_operation *, struct afs_status_cb *); -extern int afs_fs_store_acl(struct afs_operation *, const struct afs_acl *, - struct afs_status_cb *); +extern void afs_fs_fetch_acl(struct afs_operation *); +extern void afs_fs_store_acl(struct afs_operation *); + +/* + * fs_operation.c + */ +extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *); +extern int afs_put_operation(struct afs_operation *); +extern bool afs_begin_vnode_operation(struct afs_operation *); +extern void afs_wait_for_operation(struct afs_operation *); +extern int afs_do_sync_operation(struct afs_operation *); + +static inline void afs_op_nomem(struct afs_operation *op) +{ + op->error = -ENOMEM; +} + +static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n, + struct afs_vnode *vnode) +{ + op->file[n].vnode = vnode; + op->file[n].need_io_lock = true; +} + +static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n, + const struct afs_fid *fid) +{ + op->file[n].fid = *fid; +} /* * fs_probe.c @@ -1010,18 +1087,12 @@ extern void afs_fs_probe_dispatcher(struct work_struct *); /* * inode.c */ -extern void afs_vnode_commit_status(struct afs_operation *, - struct afs_vnode *, - unsigned int, - const afs_dataversion_t *, - struct afs_status_cb *); +extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); -extern int afs_iget5_test(struct inode *, void *); +extern int 
afs_ilookup5_test_by_fid(struct inode *, void *); extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); -extern struct inode *afs_iget(struct super_block *, struct key *, - struct afs_iget_data *, struct afs_status_cb *, - struct afs_cb_interest *, - struct afs_vnode *); +extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); +extern struct inode *afs_root_iget(struct super_block *, struct key *); extern void afs_zap_data(struct afs_vnode *); extern bool afs_check_validity(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); @@ -1109,11 +1180,9 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} /* * rotate.c */ -extern bool afs_begin_vnode_operation(struct afs_operation *, struct afs_vnode *, - struct key *, bool); extern bool afs_select_fileserver(struct afs_operation *); extern bool afs_select_current_fileserver(struct afs_operation *); -extern int afs_end_vnode_operation(struct afs_operation *); +extern void afs_dump_edestaddrreq(const struct afs_operation *); /* * rxrpc.c @@ -1135,10 +1204,16 @@ extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, bool); extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); -static inline void afs_set_fc_call(struct afs_call *call, struct afs_operation *op) +static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, + gfp_t gfp) { - call->intr = op->flags & AFS_OPERATION_INTR; + op->call = call; op->type = call->type; + call->op = op; + call->key = op->key; + call->cbi = afs_get_cb_interest(op->cbi); + call->intr = !(op->flags & AFS_OPERATION_UNINTR); + afs_make_call(&op->ac, call, gfp); } static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) @@ -1347,7 +1422,7 @@ extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, /* * volume.c */ -static inline struct afs_volume 
*__afs_get_volume(struct afs_volume *volume) +static inline struct afs_volume *afs_get_volume(struct afs_volume *volume) { if (volume) atomic_inc(&volume->usage); @@ -1357,7 +1432,7 @@ static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume) extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern void afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); -extern void afs_put_volume(struct afs_cell *, struct afs_volume *); +extern void afs_put_volume(struct afs_net *, struct afs_volume *); extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* @@ -1387,36 +1462,24 @@ extern ssize_t afs_listxattr(struct dentry *, char *, size_t); /* * yfsclient.c */ -extern int yfs_fs_fetch_file_status(struct afs_operation *, struct afs_status_cb *, - struct afs_volsync *); -extern int yfs_fs_fetch_data(struct afs_operation *, struct afs_status_cb *, struct afs_read *); -extern int yfs_fs_create_file(struct afs_operation *, const char *, umode_t, struct afs_status_cb *, - struct afs_fid *, struct afs_status_cb *); -extern int yfs_fs_make_dir(struct afs_operation *, const char *, umode_t, struct afs_status_cb *, - struct afs_fid *, struct afs_status_cb *); -extern int yfs_fs_remove_file2(struct afs_operation *, struct afs_vnode *, const char *, - struct afs_status_cb *, struct afs_status_cb *); -extern int yfs_fs_remove(struct afs_operation *, struct afs_vnode *, const char *, bool, - struct afs_status_cb *); -extern int yfs_fs_link(struct afs_operation *, struct afs_vnode *, const char *, - struct afs_status_cb *, struct afs_status_cb *); -extern int yfs_fs_symlink(struct afs_operation *, const char *, const char *, - struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *); -extern int yfs_fs_rename(struct afs_operation *, const char *, struct afs_vnode *, const char *, - struct afs_status_cb *, struct afs_status_cb *); -extern int yfs_fs_store_data(struct 
afs_operation *, struct address_space *, - pgoff_t, pgoff_t, unsigned, unsigned, struct afs_status_cb *); -extern int yfs_fs_setattr(struct afs_operation *, struct iattr *, struct afs_status_cb *); -extern int yfs_fs_get_volume_status(struct afs_operation *, struct afs_volume_status *); -extern int yfs_fs_set_lock(struct afs_operation *, afs_lock_type_t, struct afs_status_cb *); -extern int yfs_fs_extend_lock(struct afs_operation *, struct afs_status_cb *); -extern int yfs_fs_release_lock(struct afs_operation *, struct afs_status_cb *); -extern int yfs_fs_fetch_status(struct afs_operation *, struct afs_net *, - struct afs_fid *, struct afs_status_cb *, - struct afs_volsync *); -extern int yfs_fs_inline_bulk_status(struct afs_operation *, struct afs_net *, - struct afs_fid *, struct afs_status_cb *, - unsigned int, struct afs_volsync *); +extern void yfs_fs_fetch_file_status(struct afs_operation *); +extern void yfs_fs_fetch_data(struct afs_operation *); +extern void yfs_fs_create_file(struct afs_operation *); +extern void yfs_fs_make_dir(struct afs_operation *); +extern void yfs_fs_remove_file2(struct afs_operation *); +extern void yfs_fs_remove_file(struct afs_operation *); +extern void yfs_fs_remove_dir(struct afs_operation *); +extern void yfs_fs_link(struct afs_operation *); +extern void yfs_fs_symlink(struct afs_operation *); +extern void yfs_fs_rename(struct afs_operation *); +extern void yfs_fs_store_data(struct afs_operation *); +extern void yfs_fs_setattr(struct afs_operation *); +extern void yfs_fs_get_volume_status(struct afs_operation *); +extern void yfs_fs_set_lock(struct afs_operation *); +extern void yfs_fs_extend_lock(struct afs_operation *); +extern void yfs_fs_release_lock(struct afs_operation *); +extern void yfs_fs_fetch_status(struct afs_operation *); +extern void yfs_fs_inline_bulk_status(struct afs_operation *); struct yfs_acl { struct afs_acl *acl; /* Dir/file/symlink ACL */ @@ -1429,10 +1492,8 @@ struct yfs_acl { }; extern void 
yfs_free_opaque_acl(struct yfs_acl *); -extern struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_operation *, struct yfs_acl *, - struct afs_status_cb *); -extern int yfs_fs_store_opaque_acl2(struct afs_operation *, const struct afs_acl *, - struct afs_status_cb *); +extern void yfs_fs_fetch_opaque_acl(struct afs_operation *); +extern void yfs_fs_store_opaque_acl2(struct afs_operation *); /* * Miscellaneous inline functions. @@ -1450,12 +1511,26 @@ static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) static inline void afs_check_for_remote_deletion(struct afs_operation *op, struct afs_vnode *vnode) { - if (op->ac.error == -ENOENT) { + if (op->error == -ENOENT) { set_bit(AFS_VNODE_DELETED, &vnode->flags); afs_break_callback(vnode, afs_cb_break_for_deleted); } } +/* + * Note that a dentry got changed. We need to set d_fsdata to the data version + * number derived from the result of the operation. It doesn't matter if + * d_fsdata goes backwards as we'll just revalidate. + */ +static inline void afs_update_dentry_version(struct afs_operation *op, + struct afs_vnode_param *dir_vp, + struct dentry *dentry) +{ + if (!op->error) + dentry->d_fsdata = + (void *)(unsigned long)dir_vp->scb.status.data_version; +} + static inline int afs_io_error(struct afs_call *call, enum afs_io_error where) { trace_afs_io_error(call->debug_id, -EIO, where); diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index c930033473f6..8c8dc2397c5d 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -14,37 +14,6 @@ #include "internal.h" #include "afs_fs.h" -/* - * Begin an operation on the fileserver. - * - * Fileserver operations are serialised on the server by vnode, so we serialise - * them here also using the io_lock. 
- */ -bool afs_begin_vnode_operation(struct afs_operation *op, struct afs_vnode *vnode, - struct key *key, bool intr) -{ - memset(op, 0, sizeof(*op)); - op->vnode = vnode; - op->key = key; - op->ac.error = SHRT_MAX; - op->error = -EDESTADDRREQ; - - if (intr) { - op->flags |= AFS_OPERATION_INTR; - if (mutex_lock_interruptible(&vnode->io_lock) < 0) { - op->error = -EINTR; - op->flags |= AFS_OPERATION_STOP; - return false; - } - } else { - mutex_lock(&vnode->io_lock); - } - - if (vnode->lock_state != AFS_VNODE_LOCK_NONE) - op->flags |= AFS_OPERATION_CUR_ONLY; - return true; -} - /* * Begin iteration through a server list, starting with the vnode's last used * server if possible, or the last recorded good server if not. @@ -55,9 +24,9 @@ static bool afs_start_fs_iteration(struct afs_operation *op, struct afs_cb_interest *cbi; int i; - read_lock(&vnode->volume->servers_lock); - op->server_list = afs_get_serverlist(vnode->volume->servers); - read_unlock(&vnode->volume->servers_lock); + read_lock(&op->volume->servers_lock); + op->server_list = afs_get_serverlist(op->volume->servers); + read_unlock(&op->volume->servers_lock); op->untried = (1UL << op->server_list->nr_servers) - 1; op->index = READ_ONCE(op->server_list->preferred); @@ -90,7 +59,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op, vnode->cb_break++; write_sequnlock(&vnode->cb_lock); - afs_put_cb_interest(afs_v2net(vnode), cbi); + afs_put_cb_interest(op->net, cbi); cbi = NULL; } @@ -120,7 +89,7 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) */ static bool afs_sleep_and_retry(struct afs_operation *op) { - if (op->flags & AFS_OPERATION_INTR) { + if (!(op->flags & AFS_OPERATION_UNINTR)) { msleep_interruptible(1000); if (signal_pending(current)) { op->error = -ERESTARTSYS; @@ -141,7 +110,7 @@ bool afs_select_fileserver(struct afs_operation *op) { struct afs_addr_list *alist; struct afs_server *server; - struct afs_vnode *vnode = op->vnode; + struct afs_vnode *vnode = 
op->file[0].vnode; struct afs_error e; u32 rtt; int error = op->ac.error, i; @@ -187,16 +156,16 @@ bool afs_select_fileserver(struct afs_operation *op) goto next_server; } - write_lock(&vnode->volume->servers_lock); + write_lock(&op->volume->servers_lock); op->server_list->vnovol_mask |= 1 << op->index; - write_unlock(&vnode->volume->servers_lock); + write_unlock(&op->volume->servers_lock); - set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); - error = afs_check_volume_status(vnode->volume, op); + set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); + error = afs_check_volume_status(op->volume, op); if (error < 0) goto failed_set_error; - if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) { + if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) { op->error = -ENOMEDIUM; goto failed; } @@ -204,7 +173,7 @@ bool afs_select_fileserver(struct afs_operation *op) /* If the server list didn't change, then assume that * it's the fileserver having trouble. */ - if (vnode->volume->servers == op->server_list) { + if (op->volume->servers == op->server_list) { op->error = -EREMOTEIO; goto next_server; } @@ -224,9 +193,9 @@ bool afs_select_fileserver(struct afs_operation *op) goto next_server; case VOFFLINE: - if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) { - afs_busy(vnode->volume, op->ac.abort_code); - clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); + if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) { + afs_busy(op->volume, op->ac.abort_code); + clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); } if (op->flags & AFS_OPERATION_NO_VSLEEP) { op->error = -EADV; @@ -248,9 +217,9 @@ bool afs_select_fileserver(struct afs_operation *op) op->error = -EBUSY; goto failed; } - if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) { - afs_busy(vnode->volume, op->ac.abort_code); - clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); + if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) { + afs_busy(op->volume, 
op->ac.abort_code); + clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags); } busy: if (op->flags & AFS_OPERATION_CUR_ONLY) { @@ -279,9 +248,9 @@ bool afs_select_fileserver(struct afs_operation *op) } op->flags |= AFS_OPERATION_VMOVED; - set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags); - set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); - error = afs_check_volume_status(vnode->volume, op); + set_bit(AFS_VOLUME_WAIT, &op->volume->flags); + set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); + error = afs_check_volume_status(op->volume, op); if (error < 0) goto failed_set_error; @@ -294,7 +263,7 @@ bool afs_select_fileserver(struct afs_operation *op) * * TODO: Retry a few times with sleeps. */ - if (vnode->volume->servers == op->server_list) { + if (op->volume->servers == op->server_list) { op->error = -ENOMEDIUM; goto failed; } @@ -302,8 +271,8 @@ bool afs_select_fileserver(struct afs_operation *op) goto restart_from_beginning; default: - clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); - clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); + clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags); + clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); op->error = afs_abort_to_error(op->ac.abort_code); goto failed; } @@ -332,23 +301,23 @@ bool afs_select_fileserver(struct afs_operation *op) restart_from_beginning: _debug("restart"); afs_end_cursor(&op->ac); - afs_put_cb_interest(afs_v2net(vnode), op->cbi); + afs_put_cb_interest(op->net, op->cbi); op->cbi = NULL; - afs_put_serverlist(afs_v2net(vnode), op->server_list); + afs_put_serverlist(op->net, op->server_list); op->server_list = NULL; start: _debug("start"); /* See if we need to do an update of the volume record. Note that the * volume may have moved or even have been deleted. 
*/ - error = afs_check_volume_status(vnode->volume, op); + error = afs_check_volume_status(op->volume, op); if (error < 0) goto failed_set_error; if (!afs_start_fs_iteration(op, vnode)) goto failed; - _debug("__ VOL %llx __", vnode->volume->vid); + _debug("__ VOL %llx __", op->volume->vid); pick_server: _debug("pick [%lx]", op->untried); @@ -364,7 +333,7 @@ pick_server: _debug("cbi %u", op->index); if (test_bit(op->index, &op->untried)) goto selected_server; - afs_put_cb_interest(afs_v2net(vnode), op->cbi); + afs_put_cb_interest(op->net, op->cbi); op->cbi = NULL; _debug("nocbi"); } @@ -482,26 +451,21 @@ failed: */ bool afs_select_current_fileserver(struct afs_operation *op) { - struct afs_vnode *vnode = op->vnode; struct afs_cb_interest *cbi; struct afs_addr_list *alist; int error = op->ac.error; _enter(""); - cbi = rcu_dereference_protected(vnode->cb_interest, - lockdep_is_held(&vnode->io_lock)); - switch (error) { case SHRT_MAX: + cbi = op->cbi; if (!cbi) { op->error = -ESTALE; op->flags |= AFS_OPERATION_STOP; return false; } - op->cbi = afs_get_cb_interest(cbi); - read_lock(&cbi->server->fs_lock); alist = rcu_dereference_protected(cbi->server->addresses, lockdep_is_held(&cbi->server->fs_lock)); @@ -561,7 +525,7 @@ iterate_address: /* * Dump cursor state in the case of the error being EDESTADDRREQ. 
*/ -static void afs_dump_edestaddrreq(const struct afs_operation *op) +void afs_dump_edestaddrreq(const struct afs_operation *op) { static int count; int i; @@ -573,8 +537,9 @@ static void afs_dump_edestaddrreq(const struct afs_operation *op) rcu_read_lock(); pr_notice("EDESTADDR occurred\n"); - pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n", - op->cb_break, op->cb_break_2, op->flags, op->error); + pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n", + op->file[0].cb_break_before, + op->file[1].cb_break_before, op->flags, op->error); pr_notice("FC: ut=%lx ix=%d ni=%u\n", op->untried, op->index, op->nr_iterations); @@ -606,28 +571,3 @@ static void afs_dump_edestaddrreq(const struct afs_operation *op) op->ac.responded, op->ac.nr_iterations); rcu_read_unlock(); } - -/* - * Tidy up a filesystem cursor and unlock the vnode. - */ -int afs_end_vnode_operation(struct afs_operation *op) -{ - struct afs_net *net = afs_v2net(op->vnode); - - if (op->error == -EDESTADDRREQ || - op->error == -EADDRNOTAVAIL || - op->error == -ENETUNREACH || - op->error == -EHOSTUNREACH) - afs_dump_edestaddrreq(op); - - mutex_unlock(&op->vnode->io_lock); - - afs_end_cursor(&op->ac); - afs_put_cb_interest(net, op->cbi); - afs_put_serverlist(net, op->server_list); - - if (op->error == -ECONNABORTED) - op->error = afs_abort_to_error(op->ac.abort_code); - - return op->error; -} diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 00b87bac4fec..bd4d8e5efe59 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -283,18 +283,19 @@ static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, struct bio_vec *bv, pgoff_t first, pgoff_t last, unsigned offset) { + struct afs_operation *op = call->op; struct page *pages[AFS_BVEC_MAX]; unsigned int nr, n, i, to, bytes = 0; nr = min_t(pgoff_t, last - first + 1, AFS_BVEC_MAX); - n = find_get_pages_contig(call->mapping, first, nr, pages); + n = find_get_pages_contig(op->store.mapping, first, nr, pages); ASSERTCMP(n, ==, nr); msg->msg_flags |= MSG_MORE; for (i = 0; 
i < nr; i++) { to = PAGE_SIZE; if (first + i >= last) { - to = call->last_to; + to = op->store.last_to; msg->msg_flags &= ~MSG_MORE; } bv[i].bv_page = pages[i]; @@ -324,13 +325,14 @@ static void afs_notify_end_request_tx(struct sock *sock, */ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) { + struct afs_operation *op = call->op; struct bio_vec bv[AFS_BVEC_MAX]; unsigned int bytes, nr, loop, offset; - pgoff_t first = call->first, last = call->last; + pgoff_t first = op->store.first, last = op->store.last; int ret; - offset = call->first_offset; - call->first_offset = 0; + offset = op->store.first_offset; + op->store.first_offset = 0; do { afs_load_bvec(call, msg, bv, first, last, offset); @@ -340,7 +342,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) bytes = msg->msg_iter.count; nr = msg->msg_iter.nr_segs; - ret = rxrpc_kernel_send_data(call->net->socket, call->rxcall, msg, + ret = rxrpc_kernel_send_data(op->net->socket, call->rxcall, msg, bytes, afs_notify_end_request_tx); for (loop = 0; loop < nr; loop++) put_page(bv[loop].bv_page); @@ -350,7 +352,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) first += nr; } while (first <= last); - trace_afs_sent_pages(call, call->first, last, first, ret); + trace_afs_sent_pages(call, op->store.first, last, first, ret); return ret; } @@ -385,16 +387,18 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) */ tx_total_len = call->request_size; if (call->send_pages) { - if (call->last == call->first) { - tx_total_len += call->last_to - call->first_offset; + struct afs_operation *op = call->op; + + if (op->store.last == op->store.first) { + tx_total_len += op->store.last_to - op->store.first_offset; } else { /* It looks mathematically like you should be able to * combine the following lines with the ones above, but * unsigned arithmetic is fun when it wraps... 
*/ - tx_total_len += PAGE_SIZE - call->first_offset; - tx_total_len += call->last_to; - tx_total_len += (call->last - call->first - 1) * PAGE_SIZE; + tx_total_len += PAGE_SIZE - op->store.first_offset; + tx_total_len += op->store.last_to; + tx_total_len += (op->store.last - op->store.first - 1) * PAGE_SIZE; } } diff --git a/fs/afs/server.c b/fs/afs/server.c index 3008f2ecfeee..1c1e315094ae 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -424,10 +424,7 @@ static void __afs_put_server(struct afs_net *net, struct afs_server *server) afs_dec_servers_outstanding(net); } -/* - * destroy a dead server - */ -static void afs_destroy_server(struct afs_net *net, struct afs_server *server) +static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server) { struct afs_addr_list *alist = rcu_access_pointer(server->addresses); struct afs_addr_cursor ac = { @@ -436,8 +433,16 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) .error = 0, }; + afs_fs_give_up_all_callbacks(net, server, &ac, NULL); +} + +/* + * destroy a dead server + */ +static void afs_destroy_server(struct afs_net *net, struct afs_server *server) +{ if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) - afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + afs_give_up_callbacks(net, server); afs_put_server(net, server, afs_server_trace_destroy); } @@ -571,7 +576,8 @@ void afs_purge_servers(struct afs_net *net) /* * Get an update for a server's address list. 
*/ -static noinline bool afs_update_server_record(struct afs_operation *fc, struct afs_server *server) +static noinline bool afs_update_server_record(struct afs_operation *op, + struct afs_server *server) { struct afs_addr_list *alist, *discard; @@ -580,18 +586,17 @@ static noinline bool afs_update_server_record(struct afs_operation *fc, struct a trace_afs_server(server, atomic_read(&server->ref), atomic_read(&server->active), afs_server_trace_update); - alist = afs_vl_lookup_addrs(fc->vnode->volume->cell, fc->key, - &server->uuid); + alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid); if (IS_ERR(alist)) { if ((PTR_ERR(alist) == -ERESTARTSYS || PTR_ERR(alist) == -EINTR) && - !(fc->flags & AFS_OPERATION_INTR) && + (op->flags & AFS_OPERATION_UNINTR) && server->addresses) { _leave(" = t [intr]"); return true; } - fc->error = PTR_ERR(alist); - _leave(" = f [%d]", fc->error); + op->error = PTR_ERR(alist); + _leave(" = f [%d]", op->error); return false; } @@ -613,7 +618,7 @@ static noinline bool afs_update_server_record(struct afs_operation *fc, struct a /* * See if a server's address list needs updating. */ -bool afs_check_server_record(struct afs_operation *fc, struct afs_server *server) +bool afs_check_server_record(struct afs_operation *op, struct afs_server *server) { bool success; int ret, retries = 0; @@ -633,7 +638,7 @@ retry: update: if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) { clear_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); - success = afs_update_server_record(fc, server); + success = afs_update_server_record(op, server); clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags); wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING); _leave(" = %d", success); @@ -642,10 +647,10 @@ update: wait: ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING, - (fc->flags & AFS_OPERATION_INTR) ? - TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); + (op->flags & AFS_OPERATION_UNINTR) ? 
+ TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE); if (ret == -ERESTARTSYS) { - fc->error = ret; + op->error = ret; _leave(" = f [intr]"); return false; } diff --git a/fs/afs/super.c b/fs/afs/super.c index 9f412d7e7edf..c4bb314a22ae 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -373,7 +373,7 @@ static int afs_validate_fc(struct fs_context *fc) ctx->key = key; if (ctx->volume) { - afs_put_volume(ctx->cell, ctx->volume); + afs_put_volume(ctx->net, ctx->volume); ctx->volume = NULL; } @@ -421,7 +421,6 @@ static int afs_set_super(struct super_block *sb, struct fs_context *fc) static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) { struct afs_super_info *as = AFS_FS_S(sb); - struct afs_iget_data iget_data; struct inode *inode = NULL; int ret; @@ -446,13 +445,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) } else { sprintf(sb->s_id, "%llu", as->volume->vid); afs_activate_volume(as->volume); - iget_data.fid.vid = as->volume->vid; - iget_data.fid.vnode = 1; - iget_data.fid.vnode_hi = 0; - iget_data.fid.unique = 1; - iget_data.cb_v_break = as->volume->cb_v_break; - iget_data.cb_s_break = 0; - inode = afs_iget(sb, ctx->key, &iget_data, NULL, NULL, NULL); + inode = afs_root_iget(sb, ctx->key); } if (IS_ERR(inode)) @@ -496,7 +489,7 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) as->dyn_root = true; } else { as->cell = afs_get_cell(ctx->cell); - as->volume = __afs_get_volume(ctx->volume); + as->volume = afs_get_volume(ctx->volume); } } return as; @@ -505,8 +498,9 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) static void afs_destroy_sbi(struct afs_super_info *as) { if (as) { - afs_put_volume(as->cell, as->volume); - afs_put_cell(afs_net(as->net_ns), as->cell); + struct afs_net *net = afs_net(as->net_ns); + afs_put_volume(net, as->volume); + afs_put_cell(net, as->cell); put_net(as->net_ns); kfree(as); } @@ -592,7 +586,7 @@ static void afs_free_fc(struct fs_context *fc) struct 
afs_fs_context *ctx = fc->fs_private; afs_destroy_sbi(fc->s_fs_info); - afs_put_volume(ctx->cell, ctx->volume); + afs_put_volume(ctx->net, ctx->volume); afs_put_cell(ctx->net, ctx->cell); key_put(ctx->key); kfree(ctx); @@ -709,17 +703,32 @@ static void afs_destroy_inode(struct inode *inode) atomic_dec(&afs_count_active_inodes); } +static void afs_get_volume_status_success(struct afs_operation *op) +{ + struct afs_volume_status *vs = &op->volstatus.vs; + struct kstatfs *buf = op->volstatus.buf; + + if (vs->max_quota == 0) + buf->f_blocks = vs->part_max_blocks; + else + buf->f_blocks = vs->max_quota; + buf->f_bavail = buf->f_bfree = buf->f_blocks - vs->blocks_in_use; +} + +static const struct afs_operation_ops afs_get_volume_status_operation = { + .issue_afs_rpc = afs_fs_get_volume_status, + .issue_yfs_rpc = yfs_fs_get_volume_status, + .success = afs_get_volume_status_success, +}; + /* * return information about an AFS volume */ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct afs_super_info *as = AFS_FS_S(dentry->d_sb); - struct afs_operation fc; - struct afs_volume_status vs; + struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - struct key *key; - int ret; buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = AFS_BLOCK_SIZE; @@ -732,31 +741,13 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } - key = afs_request_key(vnode->volume->cell); - if (IS_ERR(key)) - return PTR_ERR(key); + op = afs_alloc_operation(NULL, as->volume); + if (IS_ERR(op)) + return PTR_ERR(op); - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, key, true)) { - fc.flags |= AFS_OPERATION_NO_VSLEEP; - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - afs_fs_get_volume_status(&fc, &vs); - } - - afs_check_for_remote_deletion(&fc, fc.vnode); - ret = afs_end_vnode_operation(&fc); - } - - key_put(key); - - if (ret == 0) { - if (vs.max_quota == 0) - buf->f_blocks = 
vs.part_max_blocks; - else - buf->f_blocks = vs.max_quota; - buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use; - } - - return ret; + afs_op_set_vnode(op, 0, vnode); + op->nr_files = 1; + op->volstatus.buf = buf; + op->ops = &afs_get_volume_status_operation; + return afs_do_sync_operation(op); } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 96351088a578..57d0509f7353 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -166,13 +166,13 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) /* * Drop a reference on a volume record. */ -void afs_put_volume(struct afs_cell *cell, struct afs_volume *volume) +void afs_put_volume(struct afs_net *net, struct afs_volume *volume) { if (volume) { _enter("%s", volume->name); if (atomic_dec_and_test(&volume->usage)) - afs_destroy_volume(cell->net, volume); + afs_destroy_volume(net, volume); } } @@ -280,7 +280,7 @@ error: /* * Make sure the volume record is up to date. */ -int afs_check_volume_status(struct afs_volume *volume, struct afs_operation *fc) +int afs_check_volume_status(struct afs_volume *volume, struct afs_operation *op) { int ret, retries = 0; @@ -298,7 +298,7 @@ retry: update: if (!test_and_set_bit_lock(AFS_VOLUME_UPDATING, &volume->flags)) { clear_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); - ret = afs_update_volume_status(volume, fc->key); + ret = afs_update_volume_status(volume, op->key); if (ret < 0) set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); clear_bit_unlock(AFS_VOLUME_WAIT, &volume->flags); @@ -315,8 +315,8 @@ wait: } ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, - (fc->flags & AFS_OPERATION_INTR) ? - TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); + (op->flags & AFS_OPERATION_UNINTR) ? 
+ TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE); if (ret == -ERESTARTSYS) { _leave(" = %d", ret); return ret; diff --git a/fs/afs/write.c b/fs/afs/write.c index 1a8af44ea36b..97bccde3298b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -348,6 +348,67 @@ static void afs_pages_written_back(struct afs_vnode *vnode, _leave(""); } +/* + * Find a key to use for the writeback. We cached the keys used to author the + * writes on the vnode. *_wbk will contain the last writeback key used or NULL + * and we need to start from there if it's set. + */ +static int afs_get_writeback_key(struct afs_vnode *vnode, + struct afs_wb_key **_wbk) +{ + struct afs_wb_key *wbk = NULL; + struct list_head *p; + int ret = -ENOKEY, ret2; + + spin_lock(&vnode->wb_lock); + if (*_wbk) + p = (*_wbk)->vnode_link.next; + else + p = vnode->wb_keys.next; + + while (p != &vnode->wb_keys) { + wbk = list_entry(p, struct afs_wb_key, vnode_link); + _debug("wbk %u", key_serial(wbk->key)); + ret2 = key_validate(wbk->key); + if (ret2 == 0) { + refcount_inc(&wbk->usage); + _debug("USE WB KEY %u", key_serial(wbk->key)); + break; + } + + wbk = NULL; + if (ret == -ENOKEY) + ret = ret2; + p = p->next; + } + + spin_unlock(&vnode->wb_lock); + if (*_wbk) + afs_put_wb_key(*_wbk); + *_wbk = wbk; + return 0; +} + +static void afs_store_data_success(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; + + afs_vnode_commit_status(op, &op->file[0]); + if (op->error == 0) { + afs_pages_written_back(vnode, op->store.first, op->store.last); + afs_stat_v(vnode, n_stores); + atomic_long_add((op->store.last * PAGE_SIZE + op->store.last_to) - + (op->store.first * PAGE_SIZE + op->store.first_offset), + &afs_v2net(vnode)->n_store_bytes); + } +} + +static const struct afs_operation_ops afs_store_data_operation = { + .issue_afs_rpc = afs_fs_store_data, + .issue_yfs_rpc = yfs_fs_store_data, + .success = afs_store_data_success, +}; + /* * write to a file */ @@ -356,11 +417,9 @@ static int afs_store_data(struct 
address_space *mapping, unsigned offset, unsigned to) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct afs_operation fc; - struct afs_status_cb *scb; + struct afs_operation *op; struct afs_wb_key *wbk = NULL; - struct list_head *p; - int ret = -ENOKEY, ret2; + int ret; _enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x", vnode->volume->name, @@ -369,62 +428,32 @@ static int afs_store_data(struct address_space *mapping, vnode->fid.unique, first, last, offset, to); - scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS); - if (!scb) + ret = afs_get_writeback_key(vnode, &wbk); + if (ret) { + _leave(" = %d [no keys]", ret); + return ret; + } + + op = afs_alloc_operation(wbk->key, vnode->volume); + if (IS_ERR(op)) { + afs_put_wb_key(wbk); return -ENOMEM; + } - spin_lock(&vnode->wb_lock); - p = vnode->wb_keys.next; + afs_op_set_vnode(op, 0, vnode); + op->file[0].dv_delta = 1; + op->store.mapping = mapping; + op->store.first = first; + op->store.last = last; + op->store.first_offset = offset; + op->store.last_to = to; + op->ops = &afs_store_data_operation; - /* Iterate through the list looking for a valid key to use. 
*/ try_next_key: - while (p != &vnode->wb_keys) { - wbk = list_entry(p, struct afs_wb_key, vnode_link); - _debug("wbk %u", key_serial(wbk->key)); - ret2 = key_validate(wbk->key); - if (ret2 == 0) - goto found_key; - if (ret == -ENOKEY) - ret = ret2; - p = p->next; - } + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); - spin_unlock(&vnode->wb_lock); - afs_put_wb_key(wbk); - kfree(scb); - _leave(" = %d [no keys]", ret); - return ret; - -found_key: - refcount_inc(&wbk->usage); - spin_unlock(&vnode->wb_lock); - - _debug("USE WB KEY %u", key_serial(wbk->key)); - - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, wbk->key, false)) { - afs_dataversion_t data_version = vnode->status.data_version + 1; - - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - afs_fs_store_data(&fc, mapping, first, last, offset, to, scb); - } - - afs_check_for_remote_deletion(&fc, vnode); - afs_vnode_commit_status(&fc, vnode, fc.cb_break, - &data_version, scb); - if (fc.ac.error == 0) - afs_pages_written_back(vnode, first, last); - ret = afs_end_vnode_operation(&fc); - } - - switch (ret) { - case 0: - afs_stat_v(vnode, n_stores); - atomic_long_add((last * PAGE_SIZE + to) - - (first * PAGE_SIZE + offset), - &afs_v2net(vnode)->n_store_bytes); - break; + switch (op->error) { case -EACCES: case -EPERM: case -ENOKEY: @@ -432,16 +461,19 @@ found_key: case -EKEYREJECTED: case -EKEYREVOKED: _debug("next"); - spin_lock(&vnode->wb_lock); - p = wbk->vnode_link.next; - afs_put_wb_key(wbk); - goto try_next_key; + + ret = afs_get_writeback_key(vnode, &wbk); + if (ret == 0) { + key_put(op->key); + op->key = key_get(wbk->key); + goto try_next_key; + } + break; } afs_put_wb_key(wbk); - kfree(scb); - _leave(" = %d", ret); - return ret; + _leave(" = %d", op->error); + return afs_put_operation(op); } /* diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index bf645f1c90b0..84f3c4f57531 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -34,6 +34,25 @@ 
ssize_t afs_listxattr(struct dentry *dentry, char *buffer, size_t size) return sizeof(afs_xattr_list); } +/* + * Deal with the result of a successful fetch ACL operation. + */ +static void afs_acl_success(struct afs_operation *op) +{ + afs_vnode_commit_status(op, &op->file[0]); +} + +static void afs_acl_put(struct afs_operation *op) +{ + kfree(op->acl); +} + +static const struct afs_operation_ops afs_fetch_acl_operation = { + .issue_afs_rpc = afs_fs_fetch_acl, + .success = afs_acl_success, + .put = afs_acl_put, +}; + /* * Get a file's ACL. */ @@ -42,37 +61,23 @@ static int afs_xattr_get_acl(const struct xattr_handler *handler, struct inode *inode, const char *name, void *buffer, size_t size) { - struct afs_operation fc; - struct afs_status_cb *scb; + struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_acl *acl = NULL; - struct key *key; - int ret = -ENOMEM; + int ret; - scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS); - if (!scb) - goto error; + op = afs_alloc_operation(NULL, vnode->volume); + if (IS_ERR(op)) + return -ENOMEM; - key = afs_request_key(vnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_scb; - } + afs_op_set_vnode(op, 0, vnode); + op->ops = &afs_fetch_acl_operation; - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, key, true)) { - afs_dataversion_t data_version = vnode->status.data_version; - - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - acl = afs_fs_fetch_acl(&fc, scb); - } - - afs_check_for_remote_deletion(&fc, fc.vnode); - afs_vnode_commit_status(&fc, vnode, fc.cb_break, - &data_version, scb); - ret = afs_end_vnode_operation(&fc); - } + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + acl = op->acl; + op->acl = NULL; + ret = afs_put_operation(op); if (ret == 0) { ret = acl->size; @@ -80,18 +85,37 @@ static int afs_xattr_get_acl(const struct xattr_handler *handler, if (acl->size <= size) memcpy(buffer, acl->data, 
acl->size); else - ret = -ERANGE; + op->error = -ERANGE; } - kfree(acl); } - key_put(key); -error_scb: - kfree(scb); -error: + kfree(acl); return ret; } +static bool afs_make_acl(struct afs_operation *op, + const void *buffer, size_t size) +{ + struct afs_acl *acl; + + acl = kmalloc(sizeof(*acl) + size, GFP_KERNEL); + if (!acl) { + afs_op_nomem(op); + return false; + } + + acl->size = size; + memcpy(acl->data, buffer, size); + op->acl = acl; + return true; +} + +static const struct afs_operation_ops afs_store_acl_operation = { + .issue_afs_rpc = afs_fs_store_acl, + .success = afs_acl_success, + .put = afs_acl_put, +}; + /* * Set a file's AFS3 ACL. */ @@ -100,55 +124,22 @@ static int afs_xattr_set_acl(const struct xattr_handler *handler, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - struct afs_operation fc; - struct afs_status_cb *scb; + struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(inode); - struct afs_acl *acl = NULL; - struct key *key; - int ret = -ENOMEM; if (flags == XATTR_CREATE) return -EINVAL; - scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS); - if (!scb) - goto error; + op = afs_alloc_operation(NULL, vnode->volume); + if (IS_ERR(op)) + return -ENOMEM; - acl = kmalloc(sizeof(*acl) + size, GFP_KERNEL); - if (!acl) - goto error_scb; + afs_op_set_vnode(op, 0, vnode); + if (!afs_make_acl(op, buffer, size)) + return afs_put_operation(op); - key = afs_request_key(vnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_acl; - } - - acl->size = size; - memcpy(acl->data, buffer, size); - - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, key, true)) { - afs_dataversion_t data_version = vnode->status.data_version; - - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - afs_fs_store_acl(&fc, acl, scb); - } - - afs_check_for_remote_deletion(&fc, fc.vnode); - afs_vnode_commit_status(&fc, vnode, fc.cb_break, - &data_version, scb); - ret = 
afs_end_vnode_operation(&fc); - } - - key_put(key); -error_acl: - kfree(acl); -error_scb: - kfree(scb); -error: - return ret; + op->ops = &afs_store_acl_operation; + return afs_do_sync_operation(op); } static const struct xattr_handler afs_xattr_afs_acl_handler = { @@ -157,6 +148,17 @@ static const struct xattr_handler afs_xattr_afs_acl_handler = { .set = afs_xattr_set_acl, }; +static void yfs_acl_put(struct afs_operation *op) +{ + yfs_free_opaque_acl(op->yacl); +} + +static const struct afs_operation_ops yfs_fetch_opaque_acl_operation = { + .issue_yfs_rpc = yfs_fs_fetch_opaque_acl, + .success = afs_acl_success, + /* Don't free op->yacl in .put here */ +}; + /* * Get a file's YFS ACL. */ @@ -165,11 +167,9 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler, struct inode *inode, const char *name, void *buffer, size_t size) { - struct afs_operation fc; - struct afs_status_cb *scb; + struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(inode); struct yfs_acl *yacl = NULL; - struct key *key; char buf[16], *data; int which = 0, dsize, ret = -ENOMEM; @@ -193,75 +193,62 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler, else if (which == 3) yacl->flags |= YFS_ACL_WANT_VOL_ACL; - scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS); - if (!scb) + op = afs_alloc_operation(NULL, vnode->volume); + if (IS_ERR(op)) goto error_yacl; - key = afs_request_key(vnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_scb; - } + afs_op_set_vnode(op, 0, vnode); + op->yacl = yacl; + op->ops = &yfs_fetch_opaque_acl_operation; - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, key, true)) { - afs_dataversion_t data_version = vnode->status.data_version; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + ret = afs_put_operation(op); - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - yfs_fs_fetch_opaque_acl(&fc, yacl, scb); + if (ret == 0) { + switch (which) { + 
case 0: + data = yacl->acl->data; + dsize = yacl->acl->size; + break; + case 1: + data = buf; + dsize = scnprintf(buf, sizeof(buf), "%u", yacl->inherit_flag); + break; + case 2: + data = buf; + dsize = scnprintf(buf, sizeof(buf), "%u", yacl->num_cleaned); + break; + case 3: + data = yacl->vol_acl->data; + dsize = yacl->vol_acl->size; + break; + default: + ret = -EOPNOTSUPP; + goto error_yacl; } - afs_check_for_remote_deletion(&fc, fc.vnode); - afs_vnode_commit_status(&fc, vnode, fc.cb_break, - &data_version, scb); - ret = afs_end_vnode_operation(&fc); - } - - if (ret < 0) - goto error_key; - - switch (which) { - case 0: - data = yacl->acl->data; - dsize = yacl->acl->size; - break; - case 1: - data = buf; - dsize = scnprintf(buf, sizeof(buf), "%u", yacl->inherit_flag); - break; - case 2: - data = buf; - dsize = scnprintf(buf, sizeof(buf), "%u", yacl->num_cleaned); - break; - case 3: - data = yacl->vol_acl->data; - dsize = yacl->vol_acl->size; - break; - default: - ret = -EOPNOTSUPP; - goto error_key; - } - - ret = dsize; - if (size > 0) { - if (dsize > size) { - ret = -ERANGE; - goto error_key; + ret = dsize; + if (size > 0) { + if (dsize <= size) + memcpy(buffer, data, dsize); + else + ret = -ERANGE; } - memcpy(buffer, data, dsize); } -error_key: - key_put(key); -error_scb: - kfree(scb); error_yacl: yfs_free_opaque_acl(yacl); error: return ret; } +static const struct afs_operation_ops yfs_store_opaque_acl2_operation = { + .issue_yfs_rpc = yfs_fs_store_opaque_acl2, + .success = afs_acl_success, + .put = yfs_acl_put, +}; + /* * Set a file's YFS ACL. 
*/ @@ -270,56 +257,23 @@ static int afs_xattr_set_yfs(const struct xattr_handler *handler, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - struct afs_operation fc; - struct afs_status_cb *scb; + struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(inode); - struct afs_acl *acl = NULL; - struct key *key; - int ret = -ENOMEM; if (flags == XATTR_CREATE || strcmp(name, "acl") != 0) return -EINVAL; - scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS); - if (!scb) - goto error; + op = afs_alloc_operation(NULL, vnode->volume); + if (IS_ERR(op)) + return -ENOMEM; - acl = kmalloc(sizeof(*acl) + size, GFP_KERNEL); - if (!acl) - goto error_scb; + afs_op_set_vnode(op, 0, vnode); + if (!afs_make_acl(op, buffer, size)) + return afs_put_operation(op); - acl->size = size; - memcpy(acl->data, buffer, size); - - key = afs_request_key(vnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_acl; - } - - ret = -ERESTARTSYS; - if (afs_begin_vnode_operation(&fc, vnode, key, true)) { - afs_dataversion_t data_version = vnode->status.data_version; - - while (afs_select_fileserver(&fc)) { - fc.cb_break = afs_calc_vnode_cb_break(vnode); - yfs_fs_store_opaque_acl2(&fc, acl, scb); - } - - afs_check_for_remote_deletion(&fc, fc.vnode); - afs_vnode_commit_status(&fc, vnode, fc.cb_break, - &data_version, scb); - ret = afs_end_vnode_operation(&fc); - } - -error_acl: - kfree(acl); - key_put(key); -error_scb: - kfree(scb); -error: - return ret; + op->ops = &yfs_store_opaque_acl2_operation; + return afs_do_sync_operation(op); } static const struct xattr_handler afs_xattr_yfs_handler = { diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 360b4a560ba7..d0cd112a3720 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -17,11 +17,6 @@ static const struct afs_fid afs_zero_fid; -static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi) -{ - call->cbi = afs_get_cb_interest(cbi); -} - #define 
xdr_size(x) (sizeof(*x) / sizeof(__be32)) static void xdr_decode_YFSFid(const __be32 **_bp, struct afs_fid *fid) @@ -79,6 +74,11 @@ static __be32 *xdr_encode_string(__be32 *bp, const char *p, unsigned int len) return bp + len / sizeof(__be32); } +static __be32 *xdr_encode_name(__be32 *bp, const struct qstr *p) +{ + return xdr_encode_string(bp, p->name, p->len); +} + static s64 linux_to_yfs_time(const struct timespec64 *t) { /* Convert to 100ns intervals. */ @@ -336,6 +336,7 @@ static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp, */ static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call) { + struct afs_operation *op = call->op; const __be32 *bp; int ret; @@ -345,9 +346,9 @@ static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_YFSCallBack(&bp, call, call->out_scb); - xdr_decode_YFSVolSync(&bp, call->out_volsync); + xdr_decode_YFSFetchStatus(&bp, call, &op->file[0].scb); + xdr_decode_YFSCallBack(&bp, call, &op->file[0].scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -359,6 +360,7 @@ static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call) */ static int yfs_deliver_status_and_volsync(struct afs_call *call) { + struct afs_operation *op = call->op; const __be32 *bp; int ret; @@ -367,8 +369,8 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_YFSVolSync(&bp, call->out_volsync); + xdr_decode_YFSFetchStatus(&bp, call, &op->file[0].scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -387,44 +389,33 @@ static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = { /* * Fetch the status information for a file. 
*/ -int yfs_fs_fetch_file_status(struct afs_operation *fc, struct afs_status_cb *scb, - struct afs_volsync *volsync) +void yfs_fs_fetch_file_status(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus_vnode, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchStatus_vnode, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid), sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSCallBack) + sizeof(struct yfs_xdr_YFSVolSync)); - if (!call) { - fc->ac.error = -ENOMEM; - return -ENOMEM; - } - - call->key = fc->key; - call->out_scb = scb; - call->out_volsync = volsync; + if (!call) + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSFETCHSTATUS); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -432,7 +423,9 @@ int yfs_fs_fetch_file_status(struct afs_operation *fc, struct afs_status_cb *scb */ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) { - struct afs_read *req = call->read_request; + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[0]; + struct afs_read *req = op->fetch.req; const __be32 *bp; unsigned int size; int ret; @@ -527,12 +520,12 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) return ret; bp = call->buffer; - 
xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_YFSCallBack(&bp, call, call->out_scb); - xdr_decode_YFSVolSync(&bp, call->out_volsync); + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSCallBack(&bp, call, &vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); - req->data_version = call->out_scb->status.data_version; - req->file_size = call->out_scb->status.size; + req->data_version = vp->scb.status.data_version; + req->file_size = vp->scb.status.size; call->unmarshall++; /* Fall through */ @@ -556,12 +549,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) return 0; } -static void yfs_fetch_data_destructor(struct afs_call *call) -{ - afs_put_read(call->read_request); - afs_flat_call_destructor(call); -} - /* * YFS.FetchData64 operation type */ @@ -569,25 +556,24 @@ static const struct afs_call_type yfs_RXYFSFetchData64 = { .name = "YFS.FetchData64", .op = yfs_FS_FetchData64, .deliver = yfs_deliver_fs_fetch_data64, - .destructor = yfs_fetch_data_destructor, + .destructor = afs_flat_call_destructor, }; /* * Fetch data from a file. 
*/ -int yfs_fs_fetch_data(struct afs_operation *fc, struct afs_status_cb *scb, - struct afs_read *req) +void yfs_fs_fetch_data(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; + struct afs_read *req = op->fetch.req; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(",%x,{%llx:%llu},%llx,%llx", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode, + key_serial(op->key), vp->fid.vid, vp->fid.vnode, req->pos, req->len); - call = afs_alloc_flat_call(net, &yfs_RXYFSFetchData64, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchData64, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid) + sizeof(struct yfs_xdr_u64) * 2, @@ -595,27 +581,19 @@ int yfs_fs_fetch_data(struct afs_operation *fc, struct afs_status_cb *scb, sizeof(struct yfs_xdr_YFSCallBack) + sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_scb = scb; - call->out_volsync = NULL; - call->read_request = afs_get_read(req); + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSFETCHDATA64); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFSFid(bp, &vp->fid); bp = xdr_encode_u64(bp, req->pos); bp = xdr_encode_u64(bp, req->len); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -623,6 +601,9 @@ int yfs_fs_fetch_data(struct afs_operation *fc, struct afs_status_cb *scb, */ static int yfs_deliver_fs_create_vnode(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; const __be32 *bp; int ret; @@ 
-634,11 +615,11 @@ static int yfs_deliver_fs_create_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_YFSFid(&bp, call->out_fid); - xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); - xdr_decode_YFSCallBack(&bp, call, call->out_scb); - xdr_decode_YFSVolSync(&bp, call->out_volsync); + xdr_decode_YFSFid(&bp, &op->file[1].fid); + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_YFSCallBack(&bp, call, &vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -657,26 +638,20 @@ static const struct afs_call_type afs_RXFSCreateFile = { /* * Create a file. */ -int yfs_fs_create_file(struct afs_operation *fc, - const char *name, - umode_t mode, - struct afs_status_cb *dvnode_scb, - struct afs_fid *newfid, - struct afs_status_cb *new_scb) +void yfs_fs_create_file(struct afs_operation *op) { - struct afs_vnode *dvnode = fc->vnode; + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(dvnode); - size_t namesz, reqsz, rplsz; + size_t reqsz, rplsz; __be32 *bp; _enter(""); - namesz = strlen(name); reqsz = (sizeof(__be32) + sizeof(__be32) + sizeof(struct yfs_xdr_YFSFid) + - xdr_strlen(namesz) + + xdr_strlen(name->len) + sizeof(struct yfs_xdr_YFSStoreStatus) + sizeof(__be32)); rplsz = (sizeof(struct yfs_xdr_YFSFid) + @@ -685,30 +660,22 @@ int yfs_fs_create_file(struct afs_operation *fc, sizeof(struct yfs_xdr_YFSCallBack) + sizeof(struct yfs_xdr_YFSVolSync)); - call = afs_alloc_flat_call(net, &afs_RXFSCreateFile, reqsz, rplsz); + call = afs_alloc_flat_call(op->net, &afs_RXFSCreateFile, reqsz, rplsz); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = dvnode_scb; - call->out_fid = newfid; - call->out_scb = new_scb; + return afs_op_nomem(op); 
/* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSCREATEFILE); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &dvnode->fid); - bp = xdr_encode_string(bp, name, namesz); - bp = xdr_encode_YFSStoreStatus_mode(bp, mode); + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + bp = xdr_encode_YFSStoreStatus_mode(bp, op->create.mode); bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */ yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call1(call, &dvnode->fid, name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } static const struct afs_call_type yfs_RXFSMakeDir = { @@ -721,26 +688,20 @@ static const struct afs_call_type yfs_RXFSMakeDir = { /* * Make a directory. */ -int yfs_fs_make_dir(struct afs_operation *fc, - const char *name, - umode_t mode, - struct afs_status_cb *dvnode_scb, - struct afs_fid *newfid, - struct afs_status_cb *new_scb) +void yfs_fs_make_dir(struct afs_operation *op) { - struct afs_vnode *dvnode = fc->vnode; + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(dvnode); - size_t namesz, reqsz, rplsz; + size_t reqsz, rplsz; __be32 *bp; _enter(""); - namesz = strlen(name); reqsz = (sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + sizeof(struct yfs_xdr_YFSFid) + - xdr_strlen(namesz) + + xdr_strlen(name->len) + sizeof(struct yfs_xdr_YFSStoreStatus)); rplsz = (sizeof(struct yfs_xdr_YFSFid) + sizeof(struct yfs_xdr_YFSFetchStatus) + @@ -748,29 +709,21 @@ int yfs_fs_make_dir(struct afs_operation *fc, sizeof(struct yfs_xdr_YFSCallBack) + sizeof(struct yfs_xdr_YFSVolSync)); - call = afs_alloc_flat_call(net, &yfs_RXFSMakeDir, reqsz, rplsz); + call = afs_alloc_flat_call(op->net, 
&yfs_RXFSMakeDir, reqsz, rplsz); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = dvnode_scb; - call->out_fid = newfid; - call->out_scb = new_scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSMAKEDIR); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &dvnode->fid); - bp = xdr_encode_string(bp, name, namesz); - bp = xdr_encode_YFSStoreStatus_mode(bp, mode); + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + bp = xdr_encode_YFSStoreStatus_mode(bp, op->create.mode); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call1(call, &dvnode->fid, name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -778,6 +731,9 @@ int yfs_fs_make_dir(struct afs_operation *fc, */ static int yfs_deliver_fs_remove_file2(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; struct afs_fid fid; const __be32 *bp; int ret; @@ -789,15 +745,24 @@ static int yfs_deliver_fs_remove_file2(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); + xdr_decode_YFSFetchStatus(&bp, call, &dvp->scb); xdr_decode_YFSFid(&bp, &fid); - xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); /* Was deleted if vnode->status.abort_code == VNOVNODE. 
*/ - xdr_decode_YFSVolSync(&bp, call->out_volsync); + xdr_decode_YFSVolSync(&bp, &op->volsync); return 0; } +static void yfs_done_fs_remove_file2(struct afs_call *call) +{ + if (call->error == -ECONNABORTED && + call->abort_code == RX_INVALID_OPERATION) { + set_bit(AFS_SERVER_FL_NO_RM2, &call->server->flags); + call->op->flags |= AFS_OPERATION_DOWNGRADE; + } +} + /* * YFS.RemoveFile2 operation type. */ @@ -805,55 +770,44 @@ static const struct afs_call_type yfs_RXYFSRemoveFile2 = { .name = "YFS.RemoveFile2", .op = yfs_FS_RemoveFile2, .deliver = yfs_deliver_fs_remove_file2, + .done = yfs_done_fs_remove_file2, .destructor = afs_flat_call_destructor, }; /* * Remove a file and retrieve new file status. */ -int yfs_fs_remove_file2(struct afs_operation *fc, struct afs_vnode *vnode, - const char *name, struct afs_status_cb *dvnode_scb, - struct afs_status_cb *vnode_scb) +void yfs_fs_remove_file2(struct afs_operation *op) { - struct afs_vnode *dvnode = fc->vnode; + struct afs_vnode_param *dvp = &op->file[0]; + const struct qstr *name = &op->dentry->d_name; struct afs_call *call; - struct afs_net *net = afs_v2net(dvnode); - size_t namesz; __be32 *bp; _enter(""); - namesz = strlen(name); - - call = afs_alloc_flat_call(net, &yfs_RXYFSRemoveFile2, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRemoveFile2, sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + sizeof(struct yfs_xdr_YFSFid) + - xdr_strlen(namesz), + xdr_strlen(name->len), sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSFid) + sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = dvnode_scb; - call->out_scb = vnode_scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSREMOVEFILE2); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &dvnode->fid); - bp = xdr_encode_string(bp, name, namesz); + bp = xdr_encode_YFSFid(bp, 
&dvp->fid); + bp = xdr_encode_name(bp, name); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call1(call, &dvnode->fid, name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -861,6 +815,8 @@ int yfs_fs_remove_file2(struct afs_operation *fc, struct afs_vnode *vnode, */ static int yfs_deliver_fs_remove(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; const __be32 *bp; int ret; @@ -871,8 +827,8 @@ static int yfs_deliver_fs_remove(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); - xdr_decode_YFSVolSync(&bp, call->out_volsync); + xdr_decode_YFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); return 0; } @@ -886,6 +842,43 @@ static const struct afs_call_type yfs_RXYFSRemoveFile = { .destructor = afs_flat_call_destructor, }; +/* + * Remove a file. 
+ */ +void yfs_fs_remove_file(struct afs_operation *op) +{ + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + if (!test_bit(AFS_SERVER_FL_NO_RM2, &op->cbi->server->flags)) + return yfs_fs_remove_file2(op); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRemoveFile, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSREMOVEFILE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + yfs_check_req(call, bp); + + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); +} + static const struct afs_call_type yfs_RXYFSRemoveDir = { .name = "YFS.RemoveDir", .op = yfs_FS_RemoveDir, @@ -894,48 +887,37 @@ static const struct afs_call_type yfs_RXYFSRemoveDir = { }; /* - * remove a file or directory + * Remove a directory. */ -int yfs_fs_remove(struct afs_operation *fc, struct afs_vnode *vnode, - const char *name, bool isdir, - struct afs_status_cb *dvnode_scb) +void yfs_fs_remove_dir(struct afs_operation *op) { - struct afs_vnode *dvnode = fc->vnode; + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(dvnode); - size_t namesz; __be32 *bp; _enter(""); - namesz = strlen(name); - call = afs_alloc_flat_call( - net, isdir ? 
&yfs_RXYFSRemoveDir : &yfs_RXYFSRemoveFile, - sizeof(__be32) + - sizeof(struct yfs_xdr_RPCFlags) + - sizeof(struct yfs_xdr_YFSFid) + - xdr_strlen(namesz), - sizeof(struct yfs_xdr_YFSFetchStatus) + - sizeof(struct yfs_xdr_YFSVolSync)); + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRemoveDir, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = dvnode_scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; - bp = xdr_encode_u32(bp, isdir ? YFSREMOVEDIR : YFSREMOVEFILE); + bp = xdr_encode_u32(bp, YFSREMOVEDIR); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &dvnode->fid); - bp = xdr_encode_string(bp, name, namesz); + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call1(call, &dvnode->fid, name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -943,6 +925,9 @@ int yfs_fs_remove(struct afs_operation *fc, struct afs_vnode *vnode, */ static int yfs_deliver_fs_link(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; const __be32 *bp; int ret; @@ -953,9 +938,9 @@ static int yfs_deliver_fs_link(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); - xdr_decode_YFSVolSync(&bp, call->out_volsync); + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &dvp->scb); + 
xdr_decode_YFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; } @@ -973,50 +958,39 @@ static const struct afs_call_type yfs_RXYFSLink = { /* * Make a hard link. */ -int yfs_fs_link(struct afs_operation *fc, struct afs_vnode *vnode, - const char *name, - struct afs_status_cb *dvnode_scb, - struct afs_status_cb *vnode_scb) +void yfs_fs_link(struct afs_operation *op) { - struct afs_vnode *dvnode = fc->vnode; + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); - size_t namesz; __be32 *bp; _enter(""); - namesz = strlen(name); - call = afs_alloc_flat_call(net, &yfs_RXYFSLink, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSLink, sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + sizeof(struct yfs_xdr_YFSFid) + - xdr_strlen(namesz) + + xdr_strlen(name->len) + sizeof(struct yfs_xdr_YFSFid), sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = dvnode_scb; - call->out_scb = vnode_scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSLINK); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &dvnode->fid); - bp = xdr_encode_string(bp, name, namesz); - bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call1(call, &vnode->fid, name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call1(call, &vp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1024,6 +998,9 @@ int yfs_fs_link(struct afs_operation *fc, struct 
afs_vnode *vnode, */ static int yfs_deliver_fs_symlink(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; const __be32 *bp; int ret; @@ -1035,10 +1012,10 @@ static int yfs_deliver_fs_symlink(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_YFSFid(&bp, call->out_fid); - xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); - xdr_decode_YFSVolSync(&bp, call->out_volsync); + xdr_decode_YFSFid(&bp, &vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -1057,28 +1034,22 @@ static const struct afs_call_type yfs_RXYFSSymlink = { /* * Create a symbolic link. */ -int yfs_fs_symlink(struct afs_operation *fc, - const char *name, - const char *contents, - struct afs_status_cb *dvnode_scb, - struct afs_fid *newfid, - struct afs_status_cb *vnode_scb) +void yfs_fs_symlink(struct afs_operation *op) { - struct afs_vnode *dvnode = fc->vnode; + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(dvnode); - size_t namesz, contents_sz; + size_t contents_sz; __be32 *bp; _enter(""); - namesz = strlen(name); - contents_sz = strlen(contents); - call = afs_alloc_flat_call(net, &yfs_RXYFSSymlink, + contents_sz = strlen(op->create.symlink); + call = afs_alloc_flat_call(op->net, &yfs_RXYFSSymlink, sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + sizeof(struct yfs_xdr_YFSFid) + - xdr_strlen(namesz) + + xdr_strlen(name->len) + xdr_strlen(contents_sz) + sizeof(struct yfs_xdr_YFSStoreStatus), sizeof(struct yfs_xdr_YFSFid) + @@ -1086,28 +1057,20 @@ int yfs_fs_symlink(struct afs_operation *fc, sizeof(struct yfs_xdr_YFSFetchStatus) + 
sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = dvnode_scb; - call->out_fid = newfid; - call->out_scb = vnode_scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSSYMLINK); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &dvnode->fid); - bp = xdr_encode_string(bp, name, namesz); - bp = xdr_encode_string(bp, contents, contents_sz); + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + bp = xdr_encode_string(bp, op->create.symlink, contents_sz); bp = xdr_encode_YFSStoreStatus_mode(bp, S_IRWXUGO); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call1(call, &dvnode->fid, name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1115,6 +1078,9 @@ int yfs_fs_symlink(struct afs_operation *fc, */ static int yfs_deliver_fs_rename(struct afs_call *call) { + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; const __be32 *bp; int ret; @@ -1128,9 +1094,9 @@ static int yfs_deliver_fs_rename(struct afs_call *call) /* If the two dirs are the same, we have two copies of the same status * report, so we just decode it twice. */ - xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb); - xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_YFSVolSync(&bp, call->out_volsync); + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; } @@ -1148,55 +1114,42 @@ static const struct afs_call_type yfs_RXYFSRename = { /* * Rename a file or directory. 
*/ -int yfs_fs_rename(struct afs_operation *fc, - const char *orig_name, - struct afs_vnode *new_dvnode, - const char *new_name, - struct afs_status_cb *orig_dvnode_scb, - struct afs_status_cb *new_dvnode_scb) +void yfs_fs_rename(struct afs_operation *op) { - struct afs_vnode *orig_dvnode = fc->vnode; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; struct afs_call *call; - struct afs_net *net = afs_v2net(orig_dvnode); - size_t o_namesz, n_namesz; __be32 *bp; _enter(""); - o_namesz = strlen(orig_name); - n_namesz = strlen(new_name); - call = afs_alloc_flat_call(net, &yfs_RXYFSRename, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename, sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + sizeof(struct yfs_xdr_YFSFid) + - xdr_strlen(o_namesz) + + xdr_strlen(orig_name->len) + sizeof(struct yfs_xdr_YFSFid) + - xdr_strlen(n_namesz), + xdr_strlen(new_name->len), sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_dir_scb = orig_dvnode_scb; - call->out_scb = new_dvnode_scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSRENAME); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &orig_dvnode->fid); - bp = xdr_encode_string(bp, orig_name, o_namesz); - bp = xdr_encode_YFSFid(bp, &new_dvnode->fid); - bp = xdr_encode_string(bp, new_name, n_namesz); + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call2(call, &orig_dvnode->fid, orig_name, new_name); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, 
GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1212,27 +1165,23 @@ static const struct afs_call_type yfs_RXYFSStoreData64 = { /* * Store a set of pages to a large file. */ -int yfs_fs_store_data(struct afs_operation *fc, struct address_space *mapping, - pgoff_t first, pgoff_t last, - unsigned offset, unsigned to, - struct afs_status_cb *scb) +void yfs_fs_store_data(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); loff_t size, pos, i_size; __be32 *bp; _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - size = (loff_t)to - (loff_t)offset; - if (first != last) - size += (loff_t)(last - first) << PAGE_SHIFT; - pos = (loff_t)first << PAGE_SHIFT; - pos += offset; + size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset; + if (op->store.first != op->store.last) + size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT; + pos = (loff_t)op->store.first << PAGE_SHIFT; + pos += op->store.first_offset; - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vp->vnode->vfs_inode); if (pos + size > i_size) i_size = size + pos; @@ -1240,7 +1189,7 @@ int yfs_fs_store_data(struct afs_operation *fc, struct address_space *mapping, (unsigned long long)size, (unsigned long long)pos, (unsigned long long)i_size); - call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreData64, sizeof(__be32) + sizeof(__be32) + sizeof(struct yfs_xdr_YFSFid) + @@ -1249,33 +1198,24 @@ int yfs_fs_store_data(struct afs_operation *fc, struct address_space *mapping, sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; + return 
afs_op_nomem(op); - call->key = fc->key; - call->mapping = mapping; - call->first = first; - call->last = last; - call->first_offset = offset; - call->last_to = to; + call->key = op->key; call->send_pages = true; - call->out_scb = scb; /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSSTOREDATA64); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vnode->fid); - bp = xdr_encode_YFSStoreStatus_mtime(bp, &vnode->vfs_inode.i_mtime); + bp = xdr_encode_YFSFid(bp, &vp->fid); + bp = xdr_encode_YFSStoreStatus_mtime(bp, &op->mtime); bp = xdr_encode_u64(bp, pos); bp = xdr_encode_u64(bp, size); bp = xdr_encode_u64(bp, i_size); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1299,18 +1239,17 @@ static const struct afs_call_type yfs_RXYFSStoreData64_as_Status = { * Set the attributes on a file, using YFS.StoreData64 rather than * YFS.StoreStatus so as to alter the file size also. 
*/ -static int yfs_fs_setattr_size(struct afs_operation *fc, struct iattr *attr, - struct afs_status_cb *scb) +static void yfs_fs_setattr_size(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); + struct iattr *attr = op->setattr.attr; __be32 *bp; _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64_as_Status, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreData64_as_Status, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid) + sizeof(struct yfs_xdr_YFSStoreStatus) + @@ -1318,72 +1257,59 @@ static int yfs_fs_setattr_size(struct afs_operation *fc, struct iattr *attr, sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_scb = scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSSTOREDATA64); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFSFid(bp, &vp->fid); bp = xdr_encode_YFS_StoreStatus(bp, attr); bp = xdr_encode_u64(bp, attr->ia_size); /* position of start of write */ bp = xdr_encode_u64(bp, 0); /* size of write */ bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */ yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * Set the attributes on a file, using YFS.StoreData64 if there's a change in * file size, and YFS.StoreStatus otherwise. 
*/ -int yfs_fs_setattr(struct afs_operation *fc, struct iattr *attr, - struct afs_status_cb *scb) +void yfs_fs_setattr(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); + struct iattr *attr = op->setattr.attr; __be32 *bp; if (attr->ia_valid & ATTR_SIZE) - return yfs_fs_setattr_size(fc, attr, scb); + return yfs_fs_setattr_size(op); _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(net, &yfs_RXYFSStoreStatus, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreStatus, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid) + sizeof(struct yfs_xdr_YFSStoreStatus), sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_scb = scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSSTORESTATUS); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFSFid(bp, &vp->fid); bp = xdr_encode_YFS_StoreStatus(bp, attr); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1391,6 +1317,7 @@ int yfs_fs_setattr(struct afs_operation *fc, struct iattr *attr, */ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) { + struct afs_operation *op = call->op; const __be32 *bp; char *p; u32 size; @@ -1412,7 +1339,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_YFSFetchVolumeStatus(&bp, call->out_volstatus); + 
xdr_decode_YFSFetchVolumeStatus(&bp, &op->volstatus.vs); call->unmarshall++; afs_extract_to_tmp(call); /* Fall through */ @@ -1526,17 +1453,15 @@ static const struct afs_call_type yfs_RXYFSGetVolumeStatus = { /* * fetch the status of a volume */ -int yfs_fs_get_volume_status(struct afs_operation *fc, - struct afs_volume_status *vs) +void yfs_fs_get_volume_status(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(""); - call = afs_alloc_flat_call(net, &yfs_RXYFSGetVolumeStatus, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSGetVolumeStatus, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_u64), max_t(size_t, @@ -1544,23 +1469,17 @@ int yfs_fs_get_volume_status(struct afs_operation *fc, sizeof(__be32), AFSOPAQUEMAX + 1)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->out_volstatus = vs; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSGETVOLUMESTATUS); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_u64(bp, vnode->fid.vid); + bp = xdr_encode_u64(bp, vp->fid.vid); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1598,118 +1517,93 @@ static const struct afs_call_type yfs_RXYFSReleaseLock = { /* * Set a lock on a file */ -int yfs_fs_set_lock(struct afs_operation *fc, afs_lock_type_t type, - struct afs_status_cb *scb) +void yfs_fs_set_lock(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(""); - call = afs_alloc_flat_call(net, 
&yfs_RXYFSSetLock, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSSetLock, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid) + sizeof(__be32), sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->lvnode = vnode; - call->out_scb = scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSSETLOCK); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vnode->fid); - bp = xdr_encode_u32(bp, type); + bp = xdr_encode_YFSFid(bp, &vp->fid); + bp = xdr_encode_u32(bp, op->lock.type); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_calli(call, &vnode->fid, type); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_calli(call, &vp->fid, op->lock.type); + afs_make_op_call(op, call, GFP_NOFS); } /* * extend a lock on a file */ -int yfs_fs_extend_lock(struct afs_operation *fc, struct afs_status_cb *scb) +void yfs_fs_extend_lock(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(""); - call = afs_alloc_flat_call(net, &yfs_RXYFSExtendLock, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSExtendLock, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid), sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->lvnode = vnode; - call->out_scb = scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSEXTENDLOCK); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, 
&vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * release a lock on a file */ -int yfs_fs_release_lock(struct afs_operation *fc, struct afs_status_cb *scb) +void yfs_fs_release_lock(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(""); - call = afs_alloc_flat_call(net, &yfs_RXYFSReleaseLock, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSReleaseLock, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid), sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); if (!call) - return -ENOMEM; - - call->key = fc->key; - call->lvnode = vnode; - call->out_scb = scb; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSRELEASELOCK); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1725,45 +1619,33 @@ static const struct afs_call_type yfs_RXYFSFetchStatus = { /* * Fetch the status information for a fid without needing a vnode handle. 
*/ -int yfs_fs_fetch_status(struct afs_operation *fc, - struct afs_net *net, - struct afs_fid *fid, - struct afs_status_cb *scb, - struct afs_volsync *volsync) +void yfs_fs_fetch_status(struct afs_operation *op) { + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; __be32 *bp; _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), fid->vid, fid->vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchStatus, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid), sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSCallBack) + sizeof(struct yfs_xdr_YFSVolSync)); - if (!call) { - fc->ac.error = -ENOMEM; - return -ENOMEM; - } - - call->key = fc->key; - call->out_scb = scb; - call->out_volsync = volsync; + if (!call) + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSFETCHSTATUS); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, fid); + bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, fid); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1771,6 +1653,7 @@ int yfs_fs_fetch_status(struct afs_operation *fc, */ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) { + struct afs_operation *op = call->op; struct afs_status_cb *scb; const __be32 *bp; u32 tmp; @@ -1792,8 +1675,8 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) return ret; tmp = ntohl(call->tmp); - _debug("status count: %u/%u", tmp, call->count2); - if (tmp != call->count2) + _debug("status count: %u/%u", tmp, op->nr_files); + if (tmp != op->nr_files) return afs_protocol_error(call, afs_eproto_ibulkst_count); call->count 
= 0; @@ -1808,12 +1691,23 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) if (ret < 0) return ret; + switch (call->count) { + case 0: + scb = &op->file[0].scb; + break; + case 1: + scb = &op->file[1].scb; + break; + default: + scb = &op->more_files[call->count - 2].scb; + break; + } + bp = call->buffer; - scb = &call->out_scb[call->count]; xdr_decode_YFSFetchStatus(&bp, call, scb); call->count++; - if (call->count < call->count2) + if (call->count < op->nr_files) goto more_counts; call->count = 0; @@ -1830,7 +1724,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) tmp = ntohl(call->tmp); _debug("CB count: %u", tmp); - if (tmp != call->count2) + if (tmp != op->nr_files) return afs_protocol_error(call, afs_eproto_ibulkst_cb_count); call->count = 0; call->unmarshall++; @@ -1845,11 +1739,22 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) return ret; _debug("unmarshall CB array"); + switch (call->count) { + case 0: + scb = &op->file[0].scb; + break; + case 1: + scb = &op->file[1].scb; + break; + default: + scb = &op->more_files[call->count - 2].scb; + break; + } + bp = call->buffer; - scb = &call->out_scb[call->count]; xdr_decode_YFSCallBack(&bp, call, scb); call->count++; - if (call->count < call->count2) + if (call->count < op->nr_files) goto more_cbs; afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync)); @@ -1862,7 +1767,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_YFSVolSync(&bp, call->out_volsync); + xdr_decode_YFSVolSync(&bp, &op->volsync); call->unmarshall++; /* Fall through */ @@ -1888,50 +1793,39 @@ static const struct afs_call_type yfs_RXYFSInlineBulkStatus = { /* * Fetch the status information for up to 1024 files */ -int yfs_fs_inline_bulk_status(struct afs_operation *fc, - struct afs_net *net, - struct afs_fid *fids, - struct afs_status_cb *statuses, - unsigned int nr_fids, - struct afs_volsync *volsync) +void 
yfs_fs_inline_bulk_status(struct afs_operation *op) { + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; struct afs_call *call; __be32 *bp; int i; _enter(",%x,{%llx:%llu},%u", - key_serial(fc->key), fids[0].vid, fids[1].vnode, nr_fids); + key_serial(op->key), vp->fid.vid, vp->fid.vnode, op->nr_files); - call = afs_alloc_flat_call(net, &yfs_RXYFSInlineBulkStatus, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSInlineBulkStatus, sizeof(__be32) + sizeof(__be32) + sizeof(__be32) + - sizeof(struct yfs_xdr_YFSFid) * nr_fids, + sizeof(struct yfs_xdr_YFSFid) * op->nr_files, sizeof(struct yfs_xdr_YFSFetchStatus)); - if (!call) { - fc->ac.error = -ENOMEM; - return -ENOMEM; - } - - call->key = fc->key; - call->out_scb = statuses; - call->out_volsync = volsync; - call->count2 = nr_fids; + if (!call) + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSINLINEBULKSTATUS); bp = xdr_encode_u32(bp, 0); /* RPCFlags */ - bp = xdr_encode_u32(bp, nr_fids); - for (i = 0; i < nr_fids; i++) - bp = xdr_encode_YFSFid(bp, &fids[i]); + bp = xdr_encode_u32(bp, op->nr_files); + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_YFSFid(bp, &vp->fid); + for (i = 0; i < op->nr_files - 2; i++) + bp = xdr_encode_YFSFid(bp, &op->more_files[i].fid); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &fids[0]); - afs_set_fc_call(call, fc); - afs_make_call(&fc->ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1939,7 +1833,9 @@ int yfs_fs_inline_bulk_status(struct afs_operation *fc, */ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) { - struct yfs_acl *yacl = call->out_yacl; + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[0]; + struct yfs_acl *yacl = op->yacl; struct afs_acl *acl; const __be32 *bp; 
unsigned int size; @@ -2029,8 +1925,8 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) bp = call->buffer; yacl->inherit_flag = ntohl(*bp++); yacl->num_cleaned = ntohl(*bp++); - xdr_decode_YFSFetchStatus(&bp, call, call->out_scb); - xdr_decode_YFSVolSync(&bp, call->out_volsync); + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); call->unmarshall++; /* Fall through */ @@ -2065,45 +1961,33 @@ static const struct afs_call_type yfs_RXYFSFetchOpaqueACL = { /* * Fetch the YFS advanced ACLs for a file. */ -struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_operation *fc, - struct yfs_acl *yacl, - struct afs_status_cb *scb) +void yfs_fs_fetch_opaque_acl(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(net, &yfs_RXYFSFetchOpaqueACL, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchOpaqueACL, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid), sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); - if (!call) { - fc->ac.error = -ENOMEM; - return ERR_PTR(-ENOMEM); - } - - call->key = fc->key; - call->out_yacl = yacl; - call->out_scb = scb; - call->out_volsync = NULL; + if (!call) + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSFETCHOPAQUEACL); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); - afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); - afs_make_call(&fc->ac, call, GFP_KERNEL); - return (struct yfs_acl *)afs_wait_for_call_to_complete(call, &fc->ac); + 
trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_KERNEL); } /* @@ -2119,46 +2003,38 @@ static const struct afs_call_type yfs_RXYFSStoreOpaqueACL2 = { /* * Fetch the YFS ACL for a file. */ -int yfs_fs_store_opaque_acl2(struct afs_operation *fc, const struct afs_acl *acl, - struct afs_status_cb *scb) +void yfs_fs_store_opaque_acl2(struct afs_operation *op) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); + struct afs_acl *acl = op->acl; size_t size; __be32 *bp; _enter(",%x,{%llx:%llu},,", - key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(op->key), vp->fid.vid, vp->fid.vnode); size = round_up(acl->size, 4); - call = afs_alloc_flat_call(net, &yfs_RXYFSStoreOpaqueACL2, + call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreOpaqueACL2, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid) + sizeof(__be32) + size, sizeof(struct yfs_xdr_YFSFetchStatus) + sizeof(struct yfs_xdr_YFSVolSync)); - if (!call) { - fc->ac.error = -ENOMEM; - return -ENOMEM; - } - - call->key = fc->key; - call->out_scb = scb; - call->out_volsync = NULL; + if (!call) + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSSTOREOPAQUEACL2); bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFSFid(bp, &vp->fid); bp = xdr_encode_u32(bp, acl->size); memcpy(bp, acl->data, acl->size); if (acl->size != size) memset((void *)bp + acl->size, 0, size - acl->size); yfs_check_req(call, bp); - trace_afs_make_fs_call(call, &vnode->fid); - afs_make_call(&fc->ac, call, GFP_KERNEL); - return afs_wait_for_call_to_complete(call, &fc->ac); + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_KERNEL); } diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index a6d8a9891164..f4d66919fb22 100644 --- a/include/trace/events/afs.h +++ 
b/include/trace/events/afs.h @@ -642,7 +642,7 @@ TRACE_EVENT(afs_make_fs_calli, TRACE_EVENT(afs_make_fs_call1, TP_PROTO(struct afs_call *call, const struct afs_fid *fid, - const char *name), + const struct qstr *name), TP_ARGS(call, fid, name), @@ -654,8 +654,7 @@ TRACE_EVENT(afs_make_fs_call1, ), TP_fast_assign( - int __len = strlen(name); - __len = min(__len, 23); + unsigned int __len = min_t(unsigned int, name->len, 23); __entry->call = call->debug_id; __entry->op = call->operation_ID; if (fid) { @@ -665,7 +664,7 @@ TRACE_EVENT(afs_make_fs_call1, __entry->fid.vnode = 0; __entry->fid.unique = 0; } - memcpy(__entry->name, name, __len); + memcpy(__entry->name, name->name, __len); __entry->name[__len] = 0; ), @@ -680,7 +679,7 @@ TRACE_EVENT(afs_make_fs_call1, TRACE_EVENT(afs_make_fs_call2, TP_PROTO(struct afs_call *call, const struct afs_fid *fid, - const char *name, const char *name2), + const struct qstr *name, const struct qstr *name2), TP_ARGS(call, fid, name, name2), @@ -693,10 +692,8 @@ TRACE_EVENT(afs_make_fs_call2, ), TP_fast_assign( - int __len = strlen(name); - int __len2 = strlen(name2); - __len = min(__len, 23); - __len2 = min(__len2, 23); + unsigned int __len = min_t(unsigned int, name->len, 23); + unsigned int __len2 = min_t(unsigned int, name2->len, 23); __entry->call = call->debug_id; __entry->op = call->operation_ID; if (fid) { @@ -706,9 +703,9 @@ TRACE_EVENT(afs_make_fs_call2, __entry->fid.vnode = 0; __entry->fid.unique = 0; } - memcpy(__entry->name, name, __len); + memcpy(__entry->name, name->name, __len); __entry->name[__len] = 0; - memcpy(__entry->name2, name2, __len2); + memcpy(__entry->name2, name2->name, __len2); __entry->name2[__len2] = 0; ), From 44746355ccb142341f92a0c86fc2e27bfc968b40 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 May 2020 15:52:02 +0100 Subject: [PATCH 348/427] afs: Don't get epoch from a server because it may be ambiguous Don't get the epoch from a server, particularly one that we're looking up by UUID, as 
UUIDs may be ambiguous and may map to more than one server - so we can't draw any conclusions from it. Reported-by: Jeffrey Altman Signed-off-by: David Howells --- fs/afs/cmservice.c | 49 ++-------------------------------------------- fs/afs/internal.h | 7 ------- 2 files changed, 2 insertions(+), 54 deletions(-) diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index ed0fb34d77dd..954030ae7a0f 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -118,8 +118,6 @@ bool afs_cm_incoming_call(struct afs_call *call) { _enter("{%u, CB.OP %u}", call->service_id, call->operation_ID); - call->epoch = rxrpc_kernel_get_epoch(call->net->socket, call->rxcall); - switch (call->operation_ID) { case CBCallBack: call->type = &afs_SRXCBCallBack; @@ -149,49 +147,6 @@ bool afs_cm_incoming_call(struct afs_call *call) } } -/* - * Record a probe to the cache manager from a server. - */ -static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server) -{ - _enter(""); - - if (test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags) && - !afs_is_probing_server(server)) { - if (server->cm_epoch == call->epoch) - return 0; - - if (!server->probe.said_rebooted) { - pr_notice("kAFS: FS rebooted %pU\n", &server->uuid); - server->probe.said_rebooted = true; - } - } - - spin_lock(&server->probe_lock); - - if (!test_and_set_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) { - server->cm_epoch = call->epoch; - server->probe.cm_epoch = call->epoch; - goto out; - } - - if (server->probe.cm_probed && - call->epoch != server->probe.cm_epoch && - !server->probe.said_inconsistent) { - pr_notice("kAFS: FS endpoints inconsistent %pU\n", - &server->uuid); - server->probe.said_inconsistent = true; - } - - if (!server->probe.cm_probed || call->epoch == server->cm_epoch) - server->probe.cm_epoch = server->cm_epoch; - -out: - server->probe.cm_probed = true; - spin_unlock(&server->probe_lock); - return 0; -} - /* * Find the server record by peer address and record a probe to the cache * manager 
from a server. @@ -210,7 +165,7 @@ static int afs_find_cm_server_by_peer(struct afs_call *call) } call->server = server; - return afs_record_cm_probe(call, server); + return 0; } /* @@ -231,7 +186,7 @@ static int afs_find_cm_server_by_uuid(struct afs_call *call, } call->server = server; - return afs_record_cm_probe(call, server); + return 0; } /* diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 4b8ac049fc17..9f024c1bd650 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -124,7 +124,6 @@ struct afs_call { spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ - u32 epoch; unsigned int max_lifespan; /* Maximum lifespan to set if not 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ @@ -491,12 +490,10 @@ struct afs_server { #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ #define AFS_SERVER_FL_IS_YFS 9 /* Server is YFS not AFS */ #define AFS_SERVER_FL_NO_RM2 10 /* Fileserver doesn't support YFS.RemoveFile2 */ -#define AFS_SERVER_FL_HAVE_EPOCH 11 /* ->epoch is valid */ #define AFS_SERVER_FL_NEEDS_UPDATE 12 /* Fileserver address list is out of date */ atomic_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ - u32 cm_epoch; /* Server RxRPC epoch */ unsigned int debug_id; /* Debugging ID for traces */ /* file service access */ @@ -515,15 +512,11 @@ struct afs_server { struct { unsigned int rtt; /* RTT as ktime/64 */ u32 abort_code; - u32 cm_epoch; short error; bool responded:1; bool is_yfs:1; bool not_yfs:1; bool local_failure:1; - bool cm_probed:1; - bool said_rebooted:1; - bool said_inconsistent:1; } probe; }; From 3120c170ef35c1a9be137535e44c7e237e33b78e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 May 2020 16:44:02 +0100 Subject: [PATCH 349/427] afs: Fix handling of CB.ProbeUuid cache manager op The AFS filesystem driver is handling the CB.ProbeUuid 
request incorrectly. The UUID presented in the request is that of the cache manager, not the fileserver, so afs_deliver_cb_probe_uuid() shouldn't be using that UUID to look up the server. Fix this by looking up the server by address instead. Signed-off-by: David Howells --- fs/afs/cmservice.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 954030ae7a0f..bef413818af7 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -464,7 +464,8 @@ static int afs_deliver_cb_probe(struct afs_call *call) } /* - * allow the fileserver to quickly find out if the fileserver has been rebooted + * Allow the fileserver to quickly find out if the cache manager has been + * rebooted. */ static void SRXAFSCB_ProbeUuid(struct work_struct *work) { @@ -536,7 +537,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - return afs_find_cm_server_by_uuid(call, call->request); + return afs_find_cm_server_by_peer(call); } /* From 194d28cf197ca982556be58dcf687b43fd85e9cc Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Apr 2020 13:42:53 +0100 Subject: [PATCH 350/427] afs: Retain more of the VLDB record for alias detection Save more bits from the volume location database record obtained for a server so that we can use this information in cell alias detection. Signed-off-by: David Howells --- fs/afs/internal.h | 3 ++- fs/afs/server_list.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9f024c1bd650..dce03e068cab 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -546,7 +546,7 @@ struct afs_cb_interest { }; /* - * Replaceable server list. + * Replaceable volume server list. 
*/ struct afs_server_entry { struct afs_server *server; @@ -554,6 +554,7 @@ struct afs_server_entry { }; struct afs_server_list { + afs_volid_t vids[AFS_MAXTYPES]; /* Volume IDs */ refcount_t usage; unsigned char nr_servers; unsigned char preferred; /* Preferred server */ diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index b77e50f62459..a35f6951a74a 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -46,6 +46,9 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, refcount_set(&slist->usage, 1); rwlock_init(&slist->lock); + for (i = 0; i < AFS_MAXTYPES; i++) + slist->vids[i] = vldb->vid[i]; + /* Make sure a records exists for each server in the list. */ for (i = 0; i < vldb->nr_servers; i++) { if (!(vldb->fs_mask[i] & type_mask)) From c3e9f888263bb4df11cbd623ceced02081cb2f9f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Apr 2020 17:26:41 +0100 Subject: [PATCH 351/427] afs: Implement client support for the YFSVL.GetCellName RPC op Implement client support for the YFSVL.GetCellName RPC operation by which YFS permits the canonical cell name to be queried from a VL server. 
Signed-off-by: David Howells --- fs/afs/afs.h | 2 +- fs/afs/afs_vl.h | 1 + fs/afs/internal.h | 2 + fs/afs/protocol_yfs.h | 2 +- fs/afs/vlclient.c | 111 +++++++++++++++++++++++++++++++++++++ include/trace/events/afs.h | 4 ++ 6 files changed, 120 insertions(+), 2 deletions(-) diff --git a/fs/afs/afs.h b/fs/afs/afs.h index f8e34406243e..432cb4b23961 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -10,7 +10,7 @@ #include -#define AFS_MAXCELLNAME 64 /* Maximum length of a cell name */ +#define AFS_MAXCELLNAME 256 /* Maximum length of a cell name */ #define AFS_MAXVOLNAME 64 /* Maximum length of a volume name */ #define AFS_MAXNSERVERS 8 /* Maximum servers in a basic volume record */ #define AFS_NMAXNSERVERS 13 /* Maximum servers in a N/U-class volume record */ diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h index e9b8029920ec..9c65ffb8a523 100644 --- a/fs/afs/afs_vl.h +++ b/fs/afs/afs_vl.h @@ -22,6 +22,7 @@ enum AFSVL_Operations { VLGETENTRYBYNAMEU = 527, /* AFS Get VLDB entry by name (UUID-variant) */ VLGETADDRSU = 533, /* AFS Get addrs for fileserver */ YVLGETENDPOINTS = 64002, /* YFS Get endpoints for file/volume server */ + YVLGETCELLNAME = 64014, /* YFS Get actual cell name */ VLGETCAPABILITIES = 65537, /* AFS Get server capabilities */ }; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index dce03e068cab..3606cfa50832 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -116,6 +116,7 @@ struct afs_call { long ret0; /* Value to reply with instead of 0 */ struct afs_addr_list *ret_alist; struct afs_vldb_entry *ret_vldb; + char *ret_str; }; struct afs_operation *op; unsigned int server_index; @@ -1373,6 +1374,7 @@ extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uu extern struct afs_call *afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *, struct afs_vlserver *, unsigned int); extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); +extern char 
*afs_yfsvl_get_cell_name(struct afs_vl_cursor *); /* * vl_probe.c diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h index 32be9c698348..b5bd03b1d3c7 100644 --- a/fs/afs/protocol_yfs.h +++ b/fs/afs/protocol_yfs.h @@ -8,7 +8,7 @@ #define YFS_FS_SERVICE 2500 #define YFS_CM_SERVICE 2501 -#define YFSCBMAX 1024 +#define YFSCBMAX 1024 enum YFS_CM_Operations { YFSCBProbe = 206, /* probe client */ diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index d0c85623ce8f..fd82850cd424 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -645,3 +645,114 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, afs_make_call(&vc->ac, call, GFP_KERNEL); return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac); } + +/* + * Deliver reply data to a YFSVL.GetCellName operation. + */ +static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) +{ + char *cell_name; + u32 namesz, paddedsz; + int ret; + + _enter("{%u,%zu/%u}", + call->unmarshall, iov_iter_count(call->iter), call->count); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; + + /* Fall through - and extract the cell name length */ + case 1: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + namesz = ntohl(call->tmp); + if (namesz > AFS_MAXCELLNAME) + return afs_protocol_error(call, afs_eproto_cellname_len); + paddedsz = (namesz + 3) & ~3; + call->count = namesz; + call->count2 = paddedsz - namesz; + + cell_name = kmalloc(namesz + 1, GFP_KERNEL); + if (!cell_name) + return -ENOMEM; + cell_name[namesz] = 0; + call->ret_str = cell_name; + + afs_extract_begin(call, cell_name, namesz); + call->unmarshall++; + + /* Fall through - and extract cell name */ + case 2: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + afs_extract_discard(call, call->count2); + call->unmarshall++; + + /* Fall through - and extract padding */ + case 3: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; 
+ + call->unmarshall++; + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +static void afs_destroy_yfsvl_get_cell_name(struct afs_call *call) +{ + kfree(call->ret_str); + afs_flat_call_destructor(call); +} + +/* + * VL.GetCapabilities operation type + */ +static const struct afs_call_type afs_YFSVLGetCellName = { + .name = "YFSVL.GetCellName", + .op = afs_YFSVL_GetCellName, + .deliver = afs_deliver_yfsvl_get_cell_name, + .destructor = afs_destroy_yfsvl_get_cell_name, +}; + +/* + * Probe a volume server for the capabilities that it supports. This can + * return up to 196 words. + * + * We use this to probe for service upgrade to determine what the server at the + * other end supports. + */ +char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) +{ + struct afs_call *call; + struct afs_net *net = vc->cell->net; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_YFSVLGetCellName, 1 * 4, 0); + if (!call) + return ERR_PTR(-ENOMEM); + + call->key = vc->key; + call->ret_str = NULL; + call->max_lifespan = AFS_VL_MAX_LIFESPAN; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(YVLGETCELLNAME); + + /* Can't take a ref on server */ + trace_afs_make_vl_call(call); + afs_make_call(&vc->ac, call, GFP_KERNEL); + return (char *)afs_wait_for_call_to_complete(call, &vc->ac); +} diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index f4d66919fb22..f320b3ad54da 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -111,6 +111,7 @@ enum afs_vl_operation { afs_VL_GetEntryByNameU = 527, /* AFS Get Vol Entry By Name operation ID */ afs_VL_GetAddrsU = 533, /* AFS Get FS server addresses */ afs_YFSVL_GetEndpoints = 64002, /* YFS Get FS & Vol server addresses */ + afs_YFSVL_GetCellName = 64014, /* YFS Get actual cell name */ afs_VL_GetCapabilities = 65537, /* AFS Get VL server capabilities */ }; @@ -143,6 +144,7 @@ enum afs_eproto_cause { afs_eproto_bad_status, afs_eproto_cb_count, 
afs_eproto_cb_fid_count, + afs_eproto_cellname_len, afs_eproto_file_type, afs_eproto_ibulkst_cb_count, afs_eproto_ibulkst_count, @@ -316,6 +318,7 @@ enum afs_cb_break_reason { EM(afs_VL_GetEntryByNameU, "VL.GetEntryByNameU") \ EM(afs_VL_GetAddrsU, "VL.GetAddrsU") \ EM(afs_YFSVL_GetEndpoints, "YFSVL.GetEndpoints") \ + EM(afs_YFSVL_GetCellName, "YFSVL.GetCellName") \ E_(afs_VL_GetCapabilities, "VL.GetCapabilities") #define afs_edit_dir_ops \ @@ -345,6 +348,7 @@ enum afs_cb_break_reason { EM(afs_eproto_bad_status, "BadStatus") \ EM(afs_eproto_cb_count, "CbCount") \ EM(afs_eproto_cb_fid_count, "CbFidCount") \ + EM(afs_eproto_cellname_len, "CellNameLen") \ EM(afs_eproto_file_type, "FileTYpe") \ EM(afs_eproto_ibulkst_cb_count, "IBS.CbCount") \ EM(afs_eproto_ibulkst_count, "IBS.FidCount") \ From 8a070a964877c71139cba46202f6f263c2b9419d Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 25 Apr 2020 10:26:02 +0100 Subject: [PATCH 352/427] afs: Detect cell aliases 1 - Cells with root volumes Put in the first phase of cell alias detection. This part handles alias detection for cells that have root.cell volumes (which is expected to be likely). When a cell becomes newly active, it is probed for its root.cell volume, and if it has one, this volume is compared against other root.cell volumes to find out if the list of fileserver UUIDs have any in common - and if that's the case, do the address lists of those fileservers have any addresses in common. If they do, the new cell is adjudged to be an alias of the old cell and the old cell is used instead. Comparing is aided by the server list in struct afs_server_list being sorted in UUID order and the addresses in the fileserver address lists being sorted in address order. The cell then retains the afs_volume object for the root.cell volume, even if it's not mounted for future alias checking. 
This is necessary because: (1) Whilst fileservers have UUIDs that are meant to be globally unique, in practice they are not because cells get cloned without changing the UUIDs - so afs_server records need to be per cell. (2) Sometimes the DNS is used to make cell aliases - but if we don't know they're the same, we may end up with multiple superblocks and multiple afs_server records for the same thing, impairing our ability to deliver callback notifications of third party changes. (3) The fileserver RPC API doesn't contain the cell name, so it can't tell us which cell it's notifying and can't see that a change made to one cell should notify the same client that's also accessed as the other cell. Reported-by: Jeffrey Altman Signed-off-by: David Howells --- fs/afs/Makefile | 1 + fs/afs/cell.c | 3 + fs/afs/internal.h | 17 +++- fs/afs/main.c | 1 + fs/afs/proc.c | 5 +- fs/afs/rotate.c | 8 +- fs/afs/super.c | 21 ++++- fs/afs/vl_alias.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++ fs/afs/volume.c | 9 +- 9 files changed, 287 insertions(+), 13 deletions(-) create mode 100644 fs/afs/vl_alias.c diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 924f02e9d7e7..75c4e4043d1d 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -31,6 +31,7 @@ kafs-y := \ server_list.o \ super.o \ vlclient.o \ + vl_alias.o \ vl_list.o \ vl_probe.o \ vl_rotate.o \ diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 78ba5f932287..212098514ebf 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -164,6 +164,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, INIT_LIST_HEAD(&cell->proc_volumes); rwlock_init(&cell->proc_lock); rwlock_init(&cell->vl_servers_lock); + cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS); /* Provide a VL server list, filling it in if we were given a list of * addresses to use.
@@ -481,7 +482,9 @@ static void afs_cell_destroy(struct rcu_head *rcu) ASSERTCMP(atomic_read(&cell->usage), ==, 0); + afs_put_volume(cell->net, cell->root_volume); afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers)); + afs_put_cell(cell->net, cell->alias_of); key_put(cell->anonymous_key); kfree(cell); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 3606cfa50832..a3ef97d560ca 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -269,6 +269,7 @@ struct afs_net { struct timer_list cells_timer; atomic_t cells_outstanding; seqlock_t cells_lock; + struct mutex cells_alias_lock; struct mutex proc_cells_lock; struct hlist_head proc_cells; @@ -342,8 +343,10 @@ enum afs_cell_state { * for authentication and encryption. The cell name is not typically used in * the protocol. * - * There is no easy way to determine if two cells are aliases or one is a - * subset of another. + * Two cells are determined to be aliases if they have an explicit alias (YFS + * only), share any VL servers in common or have at least one volume in common. + * "In common" means that the address list of the VL servers or the fileservers + * share at least one endpoint. 
*/ struct afs_cell { union { @@ -351,6 +354,8 @@ struct afs_cell { struct rb_node net_node; /* Node in net->cells */ }; struct afs_net *net; + struct afs_cell *alias_of; /* The cell this is an alias of */ + struct afs_volume *root_volume; /* The root.cell volume if there is one */ struct key *anonymous_key; /* anonymous user key for this cell */ struct work_struct manager; /* Manager for init/deinit/dns */ struct hlist_node proc_link; /* /proc cell list link */ @@ -363,6 +368,7 @@ struct afs_cell { unsigned long flags; #define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */ #define AFS_CELL_FL_DO_LOOKUP 1 /* DNS lookup requested */ +#define AFS_CELL_FL_CHECK_ALIAS 2 /* Need to check for aliases */ enum afs_cell_state state; short error; enum dns_record_source dns_source:8; /* Latest source of data from lookup */ @@ -584,7 +590,7 @@ struct afs_volume { #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif - struct afs_server_list *servers; /* List of servers on which volume resides */ + struct afs_server_list __rcu *servers; /* List of servers on which volume resides */ rwlock_t servers_lock; /* Lock for ->servers */ unsigned int servers_seq; /* Incremented each time ->servers changes */ @@ -1376,6 +1382,11 @@ extern struct afs_call *afs_vl_get_capabilities(struct afs_net *, struct afs_add extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *); +/* + * vl_alias.c + */ +extern int afs_cell_detect_alias(struct afs_cell *, struct key *); + /* * vl_probe.c */ diff --git a/fs/afs/main.c b/fs/afs/main.c index 56b52f8dbf15..9c79c91e8005 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -82,6 +82,7 @@ static int __net_init afs_net_init(struct net *net_ns) INIT_WORK(&net->cells_manager, afs_manage_cells); timer_setup(&net->cells_timer, afs_cells_timer, 0); + mutex_init(&net->cells_alias_lock); 
mutex_init(&net->proc_cells_lock); INIT_HLIST_HEAD(&net->proc_cells); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 1d21465a4108..256c3eff8c82 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -38,7 +38,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v) if (v == SEQ_START_TOKEN) { /* display header on line 1 */ - seq_puts(m, "USE TTL SV NAME\n"); + seq_puts(m, "USE TTL SV ST NAME\n"); return 0; } @@ -46,10 +46,11 @@ static int afs_proc_cells_show(struct seq_file *m, void *v) vllist = rcu_dereference(cell->vl_servers); /* display one cell per line on subsequent lines */ - seq_printf(m, "%3u %6lld %2u %s\n", + seq_printf(m, "%3u %6lld %2u %2u %s\n", atomic_read(&cell->usage), cell->dns_expiry - ktime_get_real_seconds(), vllist->nr_servers, + cell->state, cell->name); return 0; } diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 8c8dc2397c5d..d67b8c7eb3b9 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -25,7 +25,9 @@ static bool afs_start_fs_iteration(struct afs_operation *op, int i; read_lock(&op->volume->servers_lock); - op->server_list = afs_get_serverlist(op->volume->servers); + op->server_list = afs_get_serverlist( + rcu_dereference_protected(op->volume->servers, + lockdep_is_held(&op->volume->servers_lock))); read_unlock(&op->volume->servers_lock); op->untried = (1UL << op->server_list->nr_servers) - 1; @@ -173,7 +175,7 @@ bool afs_select_fileserver(struct afs_operation *op) /* If the server list didn't change, then assume that * it's the fileserver having trouble. */ - if (op->volume->servers == op->server_list) { + if (rcu_access_pointer(op->volume->servers) == op->server_list) { op->error = -EREMOTEIO; goto next_server; } @@ -263,7 +265,7 @@ bool afs_select_fileserver(struct afs_operation *op) * * TODO: Retry a few times with sleeps. 
*/ - if (op->volume->servers == op->server_list) { + if (rcu_access_pointer(op->volume->servers) == op->server_list) { op->error = -ENOMEDIUM; goto failed; } diff --git a/fs/afs/super.c b/fs/afs/super.c index c4bb314a22ae..aae6866ed209 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -352,7 +352,9 @@ static int afs_validate_fc(struct fs_context *fc) { struct afs_fs_context *ctx = fc->fs_private; struct afs_volume *volume; + struct afs_cell *cell; struct key *key; + int ret; if (!ctx->dyn_root) { if (ctx->no_cell) { @@ -365,6 +367,7 @@ static int afs_validate_fc(struct fs_context *fc) return -EDESTADDRREQ; } + reget_key: /* We try to do the mount securely. */ key = afs_request_key(ctx->cell); if (IS_ERR(key)) @@ -377,6 +380,21 @@ static int afs_validate_fc(struct fs_context *fc) ctx->volume = NULL; } + if (test_bit(AFS_CELL_FL_CHECK_ALIAS, &ctx->cell->flags)) { + ret = afs_cell_detect_alias(ctx->cell, key); + if (ret < 0) + return ret; + if (ret == 1) { + _debug("switch to alias"); + key_put(ctx->key); + ctx->key = NULL; + cell = afs_get_cell(ctx->cell->alias_of); + afs_put_cell(ctx->net, ctx->cell); + ctx->cell = cell; + goto reget_key; + } + } + volume = afs_create_volume(ctx); if (IS_ERR(volume)) return PTR_ERR(volume); @@ -518,7 +536,8 @@ static void afs_kill_super(struct super_block *sb) * deactivating the superblock. */ if (as->volume) - afs_clear_callback_interests(net, as->volume->servers); + afs_clear_callback_interests( + net, rcu_access_pointer(as->volume->servers)); kill_anon_super(sb); if (as->volume) afs_deactivate_volume(as->volume); diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c new file mode 100644 index 000000000000..d1d91a25fbe0 --- /dev/null +++ b/fs/afs/vl_alias.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* AFS cell alias detection + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include "internal.h" + +/* + * Sample a volume. + */ +static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *key, + const char *name, unsigned int namelen) +{ + struct afs_volume *volume; + struct afs_fs_context fc = { + .type = 0, /* Explicitly leave it to the VLDB */ + .volnamesz = namelen, + .volname = name, + .net = cell->net, + .cell = cell, + .key = key, /* This might need to be something */ + }; + + volume = afs_create_volume(&fc); + _leave(" = %px", volume); + return volume; +} + +/* + * Compare two addresses. + */ +static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a, + const struct sockaddr_rxrpc *srx_b) +{ + short port_a, port_b; + int addr_a, addr_b, diff; + + diff = (short)srx_a->transport_type - (short)srx_b->transport_type; + if (diff) + goto out; + + switch (srx_a->transport_type) { + case AF_INET: { + const struct sockaddr_in *a = &srx_a->transport.sin; + const struct sockaddr_in *b = &srx_b->transport.sin; + addr_a = ntohl(a->sin_addr.s_addr); + addr_b = ntohl(b->sin_addr.s_addr); + diff = addr_a - addr_b; + if (diff == 0) { + port_a = ntohs(a->sin_port); + port_b = ntohs(b->sin_port); + diff = port_a - port_b; + } + break; + } + + case AF_INET6: { + const struct sockaddr_in6 *a = &srx_a->transport.sin6; + const struct sockaddr_in6 *b = &srx_b->transport.sin6; + diff = memcmp(&a->sin6_addr, &b->sin6_addr, 16); + if (diff == 0) { + port_a = ntohs(a->sin6_port); + port_b = ntohs(b->sin6_port); + diff = port_a - port_b; + } + break; + } + + default: + BUG(); + } + +out: + return diff; +} + +/* + * Compare the address lists of a pair of fileservers. 
+ */ +static int afs_compare_fs_alists(const struct afs_server *server_a, + const struct afs_server *server_b) +{ + const struct afs_addr_list *la, *lb; + int a = 0, b = 0, addr_matches = 0; + + la = rcu_dereference(server_a->addresses); + lb = rcu_dereference(server_b->addresses); + + while (a < la->nr_addrs && b < lb->nr_addrs) { + const struct sockaddr_rxrpc *srx_a = &la->addrs[a]; + const struct sockaddr_rxrpc *srx_b = &lb->addrs[b]; + int diff = afs_compare_addrs(srx_a, srx_b); + + if (diff < 0) { + a++; + } else if (diff > 0) { + b++; + } else { + addr_matches++; + a++; + b++; + } + } + + return addr_matches; +} + +/* + * Compare the fileserver lists of two volumes. The server lists are sorted in + * order of ascending UUID. + */ +static int afs_compare_volume_slists(const struct afs_volume *vol_a, + const struct afs_volume *vol_b) +{ + const struct afs_server_list *la, *lb; + int i, a = 0, b = 0, uuid_matches = 0, addr_matches = 0; + + la = rcu_dereference(vol_a->servers); + lb = rcu_dereference(vol_b->servers); + + for (i = 0; i < AFS_MAXTYPES; i++) + if (la->vids[i] != lb->vids[i]) + return 0; + + while (a < la->nr_servers && b < lb->nr_servers) { + const struct afs_server *server_a = la->servers[a].server; + const struct afs_server *server_b = lb->servers[b].server; + int diff = memcmp(&server_a->uuid, &server_b->uuid, sizeof(uuid_t)); + + if (diff < 0) { + a++; + } else if (diff > 0) { + b++; + } else { + uuid_matches++; + addr_matches += afs_compare_fs_alists(server_a, server_b); + a++; + b++; + } + } + + _leave(" = %d [um %d]", addr_matches, uuid_matches); + return addr_matches; +} + +/* + * Compare root.cell volumes. + */ +static int afs_compare_cell_roots(struct afs_cell *cell) +{ + struct afs_cell *p; + + _enter(""); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(p, &cell->net->proc_cells, proc_link) { + if (p == cell || p->alias_of) + continue; + if (!p->root_volume) + continue; /* Ignore cells that don't have a root.cell volume. 
*/ + + if (afs_compare_volume_slists(cell->root_volume, p->root_volume) != 0) + goto is_alias; + } + + rcu_read_unlock(); + _leave(" = 0"); + return 0; + +is_alias: + rcu_read_unlock(); + cell->alias_of = afs_get_cell(p); + return 1; +} + +static int afs_do_cell_detect_alias(struct afs_cell *cell, struct key *key) +{ + struct afs_volume *root_volume; + + _enter("%s", cell->name); + + /* Try and get the root.cell volume for comparison with other cells */ + root_volume = afs_sample_volume(cell, key, "root.cell", 9); + if (!IS_ERR(root_volume)) { + cell->root_volume = root_volume; + return afs_compare_cell_roots(cell); + } + + if (PTR_ERR(root_volume) != -ENOMEDIUM) + return PTR_ERR(root_volume); + + /* Okay, this cell doesn't have an root.cell volume. We need to + * locate some other random volume and use that to check. + */ + return -ENOMEDIUM; +} + +/* + * Check to see if a new cell is an alias of a cell we already have. At this + * point we have the cell's volume server list. + * + * Returns 0 if we didn't detect an alias, 1 if we found an alias and an error + * if we had problems gathering the data required. In the case the we did + * detect an alias, cell->alias_of is set to point to the assumed master. + */ +int afs_cell_detect_alias(struct afs_cell *cell, struct key *key) +{ + struct afs_net *net = cell->net; + int ret; + + if (mutex_lock_interruptible(&net->cells_alias_lock) < 0) + return -ERESTARTSYS; + + if (test_bit(AFS_CELL_FL_CHECK_ALIAS, &cell->flags)) { + ret = afs_do_cell_detect_alias(cell, key); + if (ret >= 0) + clear_bit_unlock(AFS_CELL_FL_CHECK_ALIAS, &cell->flags); + } else { + ret = cell->alias_of ? 
1 : 0; + } + + mutex_unlock(&net->cells_alias_lock); + + if (ret == 1) + pr_notice("kAFS: Cell %s is an alias of %s\n", + cell->name, cell->alias_of->name); + return ret; +} diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 57d0509f7353..d4d9a8fbba3d 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -51,7 +51,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, } refcount_set(&slist->usage, 1); - volume->servers = slist; + rcu_assign_pointer(volume->servers, slist); return volume; error_1: @@ -156,7 +156,7 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) ASSERTCMP(volume->cache, ==, NULL); #endif - afs_put_serverlist(net, volume->servers); + afs_put_serverlist(net, rcu_access_pointer(volume->servers)); afs_put_cell(net, volume->cell); kfree(volume); @@ -256,10 +256,11 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) write_lock(&volume->servers_lock); discard = new; - old = volume->servers; + old = rcu_dereference_protected(volume->servers, + lockdep_is_held(&volume->servers_lock)); if (afs_annotate_server_list(new, old)) { new->seq = volume->servers_seq + 1; - volume->servers = new; + rcu_assign_pointer(volume->servers, new); smp_wmb(); volume->servers_seq++; discard = old; From 6ef350b1842081bef7e4879993f47f052b7007e7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 26 Apr 2020 10:12:27 +0100 Subject: [PATCH 353/427] afs: Detect cell aliases 2 - Cells with no root volumes Implement the second phase of cell alias detection. This part handles alias detection for cells that don't have root.cell volumes and so we have to find some other volume or fileserver to query. We take the first volume from each such cell and attempt to look it up in the new cell. If found, we compare the records, if they are the same, we judge the cell names to be aliases. 
Signed-off-by: David Howells --- fs/afs/vl_alias.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index d1d91a25fbe0..76bfa4dde4a4 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -180,6 +180,94 @@ is_alias: return 1; } +/* + * Query the new cell for a volume from a cell we're already using. + */ +static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, + struct afs_cell *p) +{ + struct afs_volume *volume, *pvol = NULL; + int ret; + + /* Arbitrarily pick the first volume in the list. */ + read_lock(&p->proc_lock); + if (!list_empty(&p->proc_volumes)) + pvol = afs_get_volume(list_first_entry(&p->proc_volumes, + struct afs_volume, proc_link)); + read_unlock(&p->proc_lock); + if (!pvol) + return 0; + + _enter("%s:%s", cell->name, pvol->name); + + /* And see if it's in the new cell. */ + volume = afs_sample_volume(cell, key, pvol->name, pvol->name_len); + if (IS_ERR(volume)) { + afs_put_volume(cell->net, pvol); + if (PTR_ERR(volume) != -ENOMEDIUM) + return PTR_ERR(volume); + /* That volume is not in the new cell, so not an alias */ + return 0; + } + + /* The new cell has a like-named volume also - compare volume ID, + * server and address lists. + */ + ret = 0; + if (pvol->vid == volume->vid) { + rcu_read_lock(); + if (afs_compare_volume_slists(volume, pvol)) + ret = 1; + rcu_read_unlock(); + } + + afs_put_volume(cell->net, volume); + afs_put_volume(cell->net, pvol); + return ret; +} + +/* + * Query the new cell for volumes we know exist in cells we're already using. 
+ */ +static int afs_query_for_alias(struct afs_cell *cell, struct key *key) +{ + struct afs_cell *p; + + _enter("%s", cell->name); + + if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) + return -ERESTARTSYS; + + hlist_for_each_entry(p, &cell->net->proc_cells, proc_link) { + if (p == cell || p->alias_of) + continue; + if (list_empty(&p->proc_volumes)) + continue; + if (p->root_volume) + continue; /* Ignore cells that have a root.cell volume. */ + afs_get_cell(p); + mutex_unlock(&cell->net->proc_cells_lock); + + if (afs_query_for_alias_one(cell, key, p) != 0) + goto is_alias; + + if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) { + afs_put_cell(cell->net, p); + return -ERESTARTSYS; + } + + afs_put_cell(cell->net, p); + } + + mutex_unlock(&cell->net->proc_cells_lock); + _leave(" = 0"); + return 0; + +is_alias: + cell->alias_of = p; /* Transfer our ref */ + return 1; +} + static int afs_do_cell_detect_alias(struct afs_cell *cell, struct key *key) { struct afs_volume *root_volume; @@ -199,7 +287,7 @@ static int afs_do_cell_detect_alias(struct afs_cell *cell, struct key *key) /* Okay, this cell doesn't have an root.cell volume. We need to * locate some other random volume and use that to check. */ - return -ENOMEDIUM; + return afs_query_for_alias(cell, key); } /* From 6dfdf5369c9f0a47920b2f743434c90798f26cd5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Apr 2020 15:01:09 +0100 Subject: [PATCH 354/427] afs: Detect cell aliases 3 - YFS Cells with a canonical cell name op YFS Volume Location servers have an operation by which the cell name may be queried. Use this to find out what a YFS server thinks the canonical cell name should be. 
Signed-off-by: David Howells --- fs/afs/vl_alias.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++ fs/afs/vl_rotate.c | 4 ++++ 2 files changed, 62 insertions(+) diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 76bfa4dde4a4..ac7a59e951ed 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -268,12 +268,70 @@ is_alias: return 1; } +/* + * Look up a VLDB record for a volume. + */ +static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key) +{ + struct afs_vl_cursor vc; + char *cell_name = ERR_PTR(-EDESTADDRREQ); + bool skipped = false, not_skipped = false; + int ret; + + if (!afs_begin_vlserver_operation(&vc, cell, key)) + return ERR_PTR(-ERESTARTSYS); + + while (afs_select_vlserver(&vc)) { + if (!test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) { + vc.ac.error = -EOPNOTSUPP; + skipped = true; + continue; + } + not_skipped = true; + cell_name = afs_yfsvl_get_cell_name(&vc); + } + + ret = afs_end_vlserver_operation(&vc); + if (skipped && !not_skipped) + ret = -EOPNOTSUPP; + return ret < 0 ? 
ERR_PTR(ret) : cell_name; +} + +static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) +{ + struct afs_cell *master; + char *cell_name; + + cell_name = afs_vl_get_cell_name(cell, key); + if (IS_ERR(cell_name)) + return PTR_ERR(cell_name); + + if (strcmp(cell_name, cell->name) == 0) { + kfree(cell_name); + return 0; + } + + master = afs_lookup_cell(cell->net, cell_name, strlen(cell_name), + NULL, false); + kfree(cell_name); + if (IS_ERR(master)) + return PTR_ERR(master); + + cell->alias_of = master; /* Transfer our ref */ + return 1; +} + static int afs_do_cell_detect_alias(struct afs_cell *cell, struct key *key) { struct afs_volume *root_volume; + int ret; _enter("%s", cell->name); + ret = yfs_check_canonical_cell_name(cell, key); + if (ret != -EOPNOTSUPP) + return ret; + /* Try and get the root.cell volume for comparison with other cells */ root_volume = afs_sample_volume(cell, key, "root.cell", 9); if (!IS_ERR(root_volume)) { diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index 72eacc14e6e1..f405ca8b240a 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -151,6 +151,10 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) vc->error = error; vc->flags |= AFS_VL_CURSOR_RETRY; goto next_server; + + case -EOPNOTSUPP: + _debug("notsupp"); + goto next_server; } restart_from_beginning: From cca37d45d547434144409ae648a19b7eb6db5eb4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Apr 2020 17:02:04 +0100 Subject: [PATCH 355/427] afs: Add a tracepoint to track the lifetime of the afs_volume struct Add a tracepoint to track the lifetime of the afs_volume struct. 
Signed-off-by: David Howells --- fs/afs/cell.c | 2 +- fs/afs/fs_operation.c | 4 +-- fs/afs/internal.h | 10 ++----- fs/afs/super.c | 10 ++++--- fs/afs/vl_alias.c | 9 +++--- fs/afs/volume.c | 27 +++++++++++++++--- include/trace/events/afs.h | 56 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 95 insertions(+), 23 deletions(-) diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 212098514ebf..8bfc8a05fd46 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -482,7 +482,7 @@ static void afs_cell_destroy(struct rcu_head *rcu) ASSERTCMP(atomic_read(&cell->usage), ==, 0); - afs_put_volume(cell->net, cell->root_volume); + afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root); afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers)); afs_put_cell(cell->net, cell->alias_of); key_put(cell->anonymous_key); diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index f7a768d12141..f57efd9d2db0 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -36,7 +36,7 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo } op->key = key; - op->volume = afs_get_volume(volume); + op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op); op->net = volume->cell->net; op->cb_v_break = volume->cb_v_break; op->debug_id = atomic_inc_return(&afs_operation_debug_counter); @@ -233,7 +233,7 @@ int afs_put_operation(struct afs_operation *op) afs_end_cursor(&op->ac); afs_put_cb_interest(op->net, op->cbi); afs_put_serverlist(op->net, op->server_list); - afs_put_volume(op->net, op->volume); + afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op); kfree(op); return ret; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a3ef97d560ca..e084936066b0 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1429,17 +1429,11 @@ extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, /* * volume.c */ -static inline struct afs_volume *afs_get_volume(struct afs_volume *volume) -{ - 
if (volume) - atomic_inc(&volume->usage); - return volume; -} - extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern void afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); -extern void afs_put_volume(struct afs_net *, struct afs_volume *); +extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); +extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace); extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* diff --git a/fs/afs/super.c b/fs/afs/super.c index aae6866ed209..f92ccdafc729 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -376,7 +376,8 @@ static int afs_validate_fc(struct fs_context *fc) ctx->key = key; if (ctx->volume) { - afs_put_volume(ctx->net, ctx->volume); + afs_put_volume(ctx->net, ctx->volume, + afs_volume_trace_put_validate_fc); ctx->volume = NULL; } @@ -507,7 +508,8 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) as->dyn_root = true; } else { as->cell = afs_get_cell(ctx->cell); - as->volume = afs_get_volume(ctx->volume); + as->volume = afs_get_volume(ctx->volume, + afs_volume_trace_get_alloc_sbi); } } return as; @@ -517,7 +519,7 @@ static void afs_destroy_sbi(struct afs_super_info *as) { if (as) { struct afs_net *net = afs_net(as->net_ns); - afs_put_volume(net, as->volume); + afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi); afs_put_cell(net, as->cell); put_net(as->net_ns); kfree(as); @@ -605,7 +607,7 @@ static void afs_free_fc(struct fs_context *fc) struct afs_fs_context *ctx = fc->fs_private; afs_destroy_sbi(fc->s_fs_info); - afs_put_volume(ctx->net, ctx->volume); + afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc); afs_put_cell(ctx->net, ctx->cell); key_put(ctx->key); kfree(ctx); diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index ac7a59e951ed..c61dd9410202 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ 
-193,7 +193,8 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, read_lock(&p->proc_lock); if (!list_empty(&p->proc_volumes)) pvol = afs_get_volume(list_first_entry(&p->proc_volumes, - struct afs_volume, proc_link)); + struct afs_volume, proc_link), + afs_volume_trace_get_query_alias); read_unlock(&p->proc_lock); if (!pvol) return 0; @@ -203,7 +204,7 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, /* And see if it's in the new cell. */ volume = afs_sample_volume(cell, key, pvol->name, pvol->name_len); if (IS_ERR(volume)) { - afs_put_volume(cell->net, pvol); + afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias); if (PTR_ERR(volume) != -ENOMEDIUM) return PTR_ERR(volume); /* That volume is not in the new cell, so not an alias */ @@ -221,8 +222,8 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, rcu_read_unlock(); } - afs_put_volume(cell->net, volume); - afs_put_volume(cell->net, pvol); + afs_put_volume(cell->net, volume, afs_volume_trace_put_query_alias); + afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias); return ret; } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index d4d9a8fbba3d..0393f4910a92 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -52,6 +52,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, refcount_set(&slist->usage, 1); rcu_assign_pointer(volume->servers, slist); + trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc); return volume; error_1: @@ -158,20 +159,38 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) afs_put_serverlist(net, rcu_access_pointer(volume->servers)); afs_put_cell(net, volume->cell); + trace_afs_volume(volume->vid, atomic_read(&volume->usage), + afs_volume_trace_free); kfree(volume); _leave(" [destroyed]"); } /* - * Drop a reference on a volume record. + * Get a reference on a volume record. 
*/ -void afs_put_volume(struct afs_net *net, struct afs_volume *volume) +struct afs_volume *afs_get_volume(struct afs_volume *volume, + enum afs_volume_trace reason) { if (volume) { - _enter("%s", volume->name); + int u = atomic_inc_return(&volume->usage); + trace_afs_volume(volume->vid, u, reason); + } + return volume; +} - if (atomic_dec_and_test(&volume->usage)) + +/* + * Drop a reference on a volume record. + */ +void afs_put_volume(struct afs_net *net, struct afs_volume *volume, + enum afs_volume_trace reason) +{ + if (volume) { + afs_volid_t vid = volume->vid; + int u = atomic_dec_return(&volume->usage); + trace_afs_volume(vid, u, reason); + if (u == 0) afs_destroy_volume(net, volume); } } diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index f320b3ad54da..5f0c1cf1ea13 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -50,6 +50,23 @@ enum afs_server_trace { afs_server_trace_update, }; +enum afs_volume_trace { + afs_volume_trace_alloc, + afs_volume_trace_free, + afs_volume_trace_get_alloc_sbi, + afs_volume_trace_get_cell_insert, + afs_volume_trace_get_new_op, + afs_volume_trace_get_query_alias, + afs_volume_trace_put_cell_dup, + afs_volume_trace_put_cell_root, + afs_volume_trace_put_destroy_sbi, + afs_volume_trace_put_free_fc, + afs_volume_trace_put_put_op, + afs_volume_trace_put_query_alias, + afs_volume_trace_put_validate_fc, + afs_volume_trace_remove, +}; + enum afs_fs_operation { afs_FS_FetchData = 130, /* AFS Fetch file data */ afs_FS_FetchACL = 131, /* AFS Fetch file ACL */ @@ -262,6 +279,22 @@ enum afs_cb_break_reason { EM(afs_server_trace_put_uuid_rsq, "PUT u-req") \ E_(afs_server_trace_update, "UPDATE") +#define afs_volume_traces \ + EM(afs_volume_trace_alloc, "ALLOC ") \ + EM(afs_volume_trace_free, "FREE ") \ + EM(afs_volume_trace_get_alloc_sbi, "GET sbi-alloc ") \ + EM(afs_volume_trace_get_cell_insert, "GET cell-insrt") \ + EM(afs_volume_trace_get_new_op, "GET op-new ") \ + 
EM(afs_volume_trace_get_query_alias, "GET cell-alias") \ + EM(afs_volume_trace_put_cell_dup, "PUT cell-dup ") \ + EM(afs_volume_trace_put_cell_root, "PUT cell-root ") \ + EM(afs_volume_trace_put_destroy_sbi, "PUT sbi-destry") \ + EM(afs_volume_trace_put_free_fc, "PUT fc-free ") \ + EM(afs_volume_trace_put_put_op, "PUT op-put ") \ + EM(afs_volume_trace_put_query_alias, "PUT cell-alias") \ + EM(afs_volume_trace_put_validate_fc, "PUT fc-validat") \ + E_(afs_volume_trace_remove, "REMOVE ") + #define afs_fs_operations \ EM(afs_FS_FetchData, "FS.FetchData") \ EM(afs_FS_FetchStatus, "FS.FetchStatus") \ @@ -1302,6 +1335,29 @@ TRACE_EVENT(afs_server, __entry->active) ); +TRACE_EVENT(afs_volume, + TP_PROTO(afs_volid_t vid, int ref, enum afs_volume_trace reason), + + TP_ARGS(vid, ref, reason), + + TP_STRUCT__entry( + __field(afs_volid_t, vid ) + __field(int, ref ) + __field(enum afs_volume_trace, reason ) + ), + + TP_fast_assign( + __entry->vid = vid; + __entry->ref = ref; + __entry->reason = reason; + ), + + TP_printk("V=%llx %s u=%d", + __entry->vid, + __print_symbolic(__entry->reason, afs_volume_traces), + __entry->ref) + ); + #endif /* _TRACE_AFS_H */ /* This part must be outside protection */ From 20325960f8750165964a6891a733e4cc15d19076 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 Apr 2020 01:03:49 +0100 Subject: [PATCH 356/427] afs: Reorganise volume and server trees to be rooted on the cell Reorganise afs_volume objects such that they're in a tree keyed on volume ID, rooted on an afs_cell object rather than being in multiple trees, each of which is rooted on an afs_server object. afs_server structs become per-cell and acquire a pointer to the cell. The process of breaking a callback then starts with finding the server by its network address, following that to the cell and then looking up each volume ID in the volume tree.
This is simpler than the afs_vol_interest/afs_cb_interest N:M mapping web and allows those structs and the code for maintaining them to be simplified or removed. It does make a couple of things a bit more tricky, though: (1) Operations now start with a volume, not a server, so there can be more than one answer as to whether or not the server we'll end up using supports the FS.InlineBulkStatus RPC. (2) CB RPC operations that specify the server UUID. There's still a tree of servers by UUID on the afs_net struct, but the UUIDs in it aren't guaranteed unique. Signed-off-by: David Howells --- fs/afs/callback.c | 286 +++++++----------------------------------- fs/afs/cell.c | 7 +- fs/afs/dir.c | 45 +++++-- fs/afs/fs_operation.c | 11 +- fs/afs/fsclient.c | 7 +- fs/afs/inode.c | 59 ++++----- fs/afs/internal.h | 78 ++++-------- fs/afs/proc.c | 15 +-- fs/afs/rotate.c | 128 +++---------------- fs/afs/rxrpc.c | 5 +- fs/afs/security.c | 8 +- fs/afs/server.c | 13 +- fs/afs/server_list.c | 30 +---- fs/afs/super.c | 8 +- fs/afs/vl_alias.c | 14 +-- fs/afs/volume.c | 84 +++++++++++-- fs/afs/yfsclient.c | 2 +- 17 files changed, 257 insertions(+), 543 deletions(-) diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 282dbac84435..b4cb9bb63f0a 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -20,185 +20,6 @@ #include #include "internal.h" -/* - * Create volume and callback interests on a server. 
- */ -static struct afs_cb_interest *afs_create_interest(struct afs_server *server, - struct afs_vnode *vnode) -{ - struct afs_vol_interest *new_vi, *vi; - struct afs_cb_interest *new; - struct rb_node *parent, **pp; - - new_vi = kzalloc(sizeof(struct afs_vol_interest), GFP_KERNEL); - if (!new_vi) - return NULL; - - new = kzalloc(sizeof(struct afs_cb_interest), GFP_KERNEL); - if (!new) { - kfree(new_vi); - return NULL; - } - - new_vi->usage = 1; - new_vi->vid = vnode->volume->vid; - INIT_HLIST_HEAD(&new_vi->cb_interests); - - refcount_set(&new->usage, 1); - new->sb = vnode->vfs_inode.i_sb; - new->server = afs_get_server(server, afs_server_trace_get_new_cbi); - INIT_HLIST_NODE(&new->cb_vlink); - - write_seqlock(&server->cb_break_lock); - - pp = &server->cb_volumes.rb_node; - while ((parent = *pp)) { - vi = rb_entry(parent, struct afs_vol_interest, srv_node); - if (vi->vid < new_vi->vid) { - pp = &(*pp)->rb_left; - } else if (vi->vid > new_vi->vid) { - pp = &(*pp)->rb_right; - } else { - vi->usage++; - goto found_vi; - } - } - - vi = new_vi; - new_vi = NULL; - rb_link_node_rcu(&vi->srv_node, parent, pp); - rb_insert_color(&vi->srv_node, &server->cb_volumes); - -found_vi: - new->vol_interest = vi; - hlist_add_head(&new->cb_vlink, &vi->cb_interests); - - write_sequnlock(&server->cb_break_lock); - kfree(new_vi); - return new; -} - -/* - * Set up an interest-in-callbacks record for a volume on a server and - * register it with the server. - * - Called with vnode->io_lock held. 
- */ -int afs_register_server_cb_interest(struct afs_vnode *vnode, - struct afs_server_list *slist, - unsigned int index) -{ - struct afs_server_entry *entry = &slist->servers[index]; - struct afs_cb_interest *cbi, *vcbi, *new, *old; - struct afs_server *server = entry->server; - -again: - vcbi = rcu_dereference_protected(vnode->cb_interest, - lockdep_is_held(&vnode->io_lock)); - if (vcbi && likely(vcbi == entry->cb_interest)) - return 0; - - read_lock(&slist->lock); - cbi = afs_get_cb_interest(entry->cb_interest); - read_unlock(&slist->lock); - - if (vcbi) { - if (vcbi == cbi) { - afs_put_cb_interest(afs_v2net(vnode), cbi); - return 0; - } - - /* Use a new interest in the server list for the same server - * rather than an old one that's still attached to a vnode. - */ - if (cbi && vcbi->server == cbi->server) { - write_seqlock(&vnode->cb_lock); - old = rcu_dereference_protected(vnode->cb_interest, - lockdep_is_held(&vnode->cb_lock.lock)); - rcu_assign_pointer(vnode->cb_interest, cbi); - write_sequnlock(&vnode->cb_lock); - afs_put_cb_interest(afs_v2net(vnode), old); - return 0; - } - - /* Re-use the one attached to the vnode. */ - if (!cbi && vcbi->server == server) { - write_lock(&slist->lock); - if (entry->cb_interest) { - write_unlock(&slist->lock); - afs_put_cb_interest(afs_v2net(vnode), cbi); - goto again; - } - - entry->cb_interest = cbi; - write_unlock(&slist->lock); - return 0; - } - } - - if (!cbi) { - new = afs_create_interest(server, vnode); - if (!new) - return -ENOMEM; - - write_lock(&slist->lock); - if (!entry->cb_interest) { - entry->cb_interest = afs_get_cb_interest(new); - cbi = new; - new = NULL; - } else { - cbi = afs_get_cb_interest(entry->cb_interest); - } - write_unlock(&slist->lock); - afs_put_cb_interest(afs_v2net(vnode), new); - } - - ASSERT(cbi); - - /* Change the server the vnode is using. This entails scrubbing any - * interest the vnode had in the previous server it was using. 
- */ - write_seqlock(&vnode->cb_lock); - - old = rcu_dereference_protected(vnode->cb_interest, - lockdep_is_held(&vnode->cb_lock.lock)); - rcu_assign_pointer(vnode->cb_interest, cbi); - vnode->cb_s_break = cbi->server->cb_s_break; - vnode->cb_v_break = vnode->volume->cb_v_break; - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); - - write_sequnlock(&vnode->cb_lock); - afs_put_cb_interest(afs_v2net(vnode), old); - return 0; -} - -/* - * Remove an interest on a server. - */ -void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi) -{ - struct afs_vol_interest *vi; - - if (cbi && refcount_dec_and_test(&cbi->usage)) { - if (!hlist_unhashed(&cbi->cb_vlink)) { - write_seqlock(&cbi->server->cb_break_lock); - - hlist_del_init(&cbi->cb_vlink); - vi = cbi->vol_interest; - cbi->vol_interest = NULL; - if (--vi->usage == 0) - rb_erase(&vi->srv_node, &cbi->server->cb_volumes); - else - vi = NULL; - - write_sequnlock(&cbi->server->cb_break_lock); - if (vi) - kfree_rcu(vi, rcu); - afs_put_server(net, cbi->server, afs_server_trace_put_cbi); - } - kfree_rcu(cbi, rcu); - } -} - /* * allow the fileserver to request callback state (re-)initialisation */ @@ -236,12 +57,12 @@ void afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reason } /* - * Look up a volume interest by volume ID under RCU conditions. + * Look up a volume by volume ID under RCU conditions. */ -static struct afs_vol_interest *afs_lookup_vol_interest_rcu(struct afs_server *server, - afs_volid_t vid) +static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, + afs_volid_t vid) { - struct afs_vol_interest *vi = NULL; + struct afs_volume *volume = NULL; struct rb_node *p; int seq = 0; @@ -250,28 +71,25 @@ static struct afs_vol_interest *afs_lookup_vol_interest_rcu(struct afs_server *s * under just the RCU read lock, so we have to check for * changes. 
*/ - read_seqbegin_or_lock(&server->cb_break_lock, &seq); + read_seqbegin_or_lock(&cell->volume_lock, &seq); - p = rcu_dereference_raw(server->cb_volumes.rb_node); + p = rcu_dereference_raw(cell->volumes.rb_node); while (p) { - vi = rb_entry(p, struct afs_vol_interest, srv_node); + volume = rb_entry(p, struct afs_volume, cell_node); - if (vi->vid < vid) + if (volume->vid < vid) p = rcu_dereference_raw(p->rb_left); - else if (vi->vid > vid) + else if (volume->vid > vid) p = rcu_dereference_raw(p->rb_right); else break; - /* We want to repeat the search, this time with the - * lock properly locked. - */ - vi = NULL; + volume = NULL; } - } while (need_seqretry(&server->cb_break_lock, seq)); + } while (need_seqretry(&cell->volume_lock, seq)); - done_seqretry(&server->cb_break_lock, seq); - return vi; + done_seqretry(&cell->volume_lock, seq); + return volume; } /* @@ -280,42 +98,37 @@ static struct afs_vol_interest *afs_lookup_vol_interest_rcu(struct afs_server *s * - the backing file is changed * - a lock is released */ -static void afs_break_one_callback(struct afs_server *server, - struct afs_fid *fid, - struct afs_vol_interest *vi) +static void afs_break_one_callback(struct afs_volume *volume, + struct afs_fid *fid) { - struct afs_cb_interest *cbi; + struct super_block *sb; struct afs_vnode *vnode; struct inode *inode; - /* Step through all interested superblocks. There may be more than one - * because of cell aliasing. - */ - hlist_for_each_entry_rcu(cbi, &vi->cb_interests, cb_vlink) { - if (fid->vnode == 0 && fid->unique == 0) { - /* The callback break applies to an entire volume. */ - struct afs_super_info *as = AFS_FS_S(cbi->sb); - struct afs_volume *volume = as->volume; + if (fid->vnode == 0 && fid->unique == 0) { + /* The callback break applies to an entire volume. 
*/ + write_lock(&volume->cb_v_break_lock); + volume->cb_v_break++; + trace_afs_cb_break(fid, volume->cb_v_break, + afs_cb_break_for_volume_callback, false); + write_unlock(&volume->cb_v_break_lock); + return; + } - write_lock(&volume->cb_v_break_lock); - volume->cb_v_break++; - trace_afs_cb_break(fid, volume->cb_v_break, - afs_cb_break_for_volume_callback, false); - write_unlock(&volume->cb_v_break_lock); - } else { - /* See if we can find a matching inode - even an I_NEW - * inode needs to be marked as it can have its callback - * broken before we finish setting up the local inode. - */ - inode = find_inode_rcu(cbi->sb, fid->vnode, - afs_ilookup5_test_by_fid, fid); - if (inode) { - vnode = AFS_FS_I(inode); - afs_break_callback(vnode, afs_cb_break_for_callback); - } else { - trace_afs_cb_miss(fid, afs_cb_break_for_callback); - } - } + /* See if we can find a matching inode - even an I_NEW inode needs to + * be marked as it can have its callback broken before we finish + * setting up the local inode. + */ + sb = rcu_dereference(volume->sb); + if (!sb) + return; + + inode = find_inode_rcu(sb, fid->vnode, afs_ilookup5_test_by_fid, fid); + if (inode) { + vnode = AFS_FS_I(inode); + afs_break_callback(vnode, afs_cb_break_for_callback); + } else { + trace_afs_cb_miss(fid, afs_cb_break_for_callback); } } @@ -324,11 +137,11 @@ static void afs_break_some_callbacks(struct afs_server *server, size_t *_count) { struct afs_callback_break *residue = cbb; - struct afs_vol_interest *vi; + struct afs_volume *volume; afs_volid_t vid = cbb->fid.vid; size_t i; - vi = afs_lookup_vol_interest_rcu(server, vid); + volume = afs_lookup_volume_rcu(server->cell, vid); /* TODO: Find all matching volumes if we couldn't match the server and * break them anyway. 
@@ -341,8 +154,8 @@ static void afs_break_some_callbacks(struct afs_server *server, cbb->fid.vnode, cbb->fid.unique); --*_count; - if (vi) - afs_break_one_callback(server, &cbb->fid, vi); + if (volume) + afs_break_one_callback(volume, &cbb->fid); } else { *residue++ = *cbb; } @@ -367,16 +180,3 @@ void afs_break_callbacks(struct afs_server *server, size_t count, rcu_read_unlock(); return; } - -/* - * Clear the callback interests in a server list. - */ -void afs_clear_callback_interests(struct afs_net *net, struct afs_server_list *slist) -{ - int i; - - for (i = 0; i < slist->nr_servers; i++) { - afs_put_cb_interest(net, slist->servers[i].cb_interest); - slist->servers[i].cb_interest = NULL; - } -} diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 8bfc8a05fd46..005921e3b38d 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -161,8 +161,11 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, atomic_set(&cell->usage, 2); INIT_WORK(&cell->manager, afs_manage_cell); - INIT_LIST_HEAD(&cell->proc_volumes); - rwlock_init(&cell->proc_lock); + cell->volumes = RB_ROOT; + INIT_HLIST_HEAD(&cell->proc_volumes); + seqlock_init(&cell->volume_lock); + cell->fs_servers = RB_ROOT; + seqlock_init(&cell->fs_lock); rwlock_init(&cell->vl_servers_lock); cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 0d3cf3af0352..25cbe0aeeec5 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -702,6 +702,37 @@ static const struct afs_operation_ops afs_fetch_status_operation = { .success = afs_do_lookup_success, }; +/* + * See if we know that the server we expect to use doesn't support + * FS.InlineBulkStatus. 
+ */ +static bool afs_server_supports_ibulk(struct afs_vnode *dvnode) +{ + struct afs_server_list *slist; + struct afs_volume *volume = dvnode->volume; + struct afs_server *server; + bool ret = true; + int i; + + if (!test_bit(AFS_VOLUME_MAYBE_NO_IBULK, &volume->flags)) + return true; + + rcu_read_lock(); + slist = rcu_dereference(volume->servers); + + for (i = 0; i < slist->nr_servers; i++) { + server = slist->servers[i].server; + if (server == dvnode->cb_server) { + if (test_bit(AFS_SERVER_FL_NO_IBULK, &server->flags)) + ret = false; + break; + } + } + + rcu_read_unlock(); + return ret; +} + /* * Do a lookup in a directory. We make use of bulk lookup to query a slew of * files in one go and create inodes for them. The inode of the file we were @@ -711,10 +742,8 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, struct key *key) { struct afs_lookup_cookie *cookie; - struct afs_cb_interest *dcbi; struct afs_vnode_param *vp; struct afs_operation *op; - struct afs_server *server; struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct inode *inode = NULL, *ti; afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version); @@ -734,16 +763,8 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, cookie->nr_fids = 2; /* slot 0 is saved for the fid we actually want * and slot 1 for the directory */ - read_seqlock_excl(&dvnode->cb_lock); - dcbi = rcu_dereference_protected(dvnode->cb_interest, - lockdep_is_held(&dvnode->cb_lock.lock)); - if (dcbi) { - server = dcbi->server; - if (server && - test_bit(AFS_SERVER_FL_NO_IBULK, &server->flags)) - cookie->one_only = true; - } - read_sequnlock_excl(&dvnode->cb_lock); + if (!afs_server_supports_ibulk(dvnode)) + cookie->one_only = true; /* search the directory */ ret = afs_dir_iterate(dir, &cookie->ctx, key, &data_version); diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index f57efd9d2db0..2d2dff5688a4 100644 --- a/fs/afs/fs_operation.c +++ 
b/fs/afs/fs_operation.c @@ -143,12 +143,6 @@ bool afs_begin_vnode_operation(struct afs_operation *op) if (!afs_get_io_locks(op)) return false; - read_seqlock_excl(&vnode->cb_lock); - op->cbi = afs_get_cb_interest( - rcu_dereference_protected(vnode->cb_interest, - lockdep_is_held(&vnode->cb_lock.lock))); - read_sequnlock_excl(&vnode->cb_lock); - afs_prepare_vnode(op, &op->file[0], 0); afs_prepare_vnode(op, &op->file[1], 1); op->cb_v_break = op->volume->cb_v_break; @@ -183,8 +177,8 @@ void afs_wait_for_operation(struct afs_operation *op) _enter(""); while (afs_select_fileserver(op)) { - op->cb_s_break = op->cbi->server->cb_s_break; - if (test_bit(AFS_SERVER_FL_IS_YFS, &op->cbi->server->flags) && + op->cb_s_break = op->server->cb_s_break; + if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) && op->ops->issue_yfs_rpc) op->ops->issue_yfs_rpc(op); else @@ -231,7 +225,6 @@ int afs_put_operation(struct afs_operation *op) } afs_end_cursor(&op->ac); - afs_put_cb_interest(op->net, op->cbi); afs_put_serverlist(op->net, op->server_list); afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op); kfree(op); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index b1b45f10583d..acb4d0ca2649 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1893,8 +1893,11 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) static void afs_done_fs_inline_bulk_status(struct afs_call *call) { if (call->error == -ECONNABORTED && - call->abort_code == RX_INVALID_OPERATION) + call->abort_code == RX_INVALID_OPERATION) { set_bit(AFS_SERVER_FL_NO_IBULK, &call->server->flags); + if (call->op) + set_bit(AFS_VOLUME_MAYBE_NO_IBULK, &call->op->volume->flags); + } } /* @@ -1919,7 +1922,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op) __be32 *bp; int i; - if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->cbi->server->flags)) { + if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->server->flags)) { op->error = -ENOTSUPP; return; } diff --git a/fs/afs/inode.c b/fs/afs/inode.c 
index 94675acb6a3a..7dde703df40c 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -71,7 +71,6 @@ static int afs_inode_init_from_status(struct afs_operation *op, struct afs_vnode_param *vp, struct afs_vnode *vnode) { - struct afs_cb_interest *old_cbi = NULL; struct afs_file_status *status = &vp->scb.status; struct inode *inode = AFS_VNODE_TO_I(vnode); struct timespec64 t; @@ -150,18 +149,11 @@ static int afs_inode_init_from_status(struct afs_operation *op, vnode->cb_expires_at = ktime_get_real_seconds(); } else { vnode->cb_expires_at = vp->scb.callback.expires_at; - old_cbi = rcu_dereference_protected(vnode->cb_interest, - lockdep_is_held(&vnode->cb_lock.lock)); - if (op->cbi != old_cbi) - rcu_assign_pointer(vnode->cb_interest, - afs_get_cb_interest(op->cbi)); - else - old_cbi = NULL; + vnode->cb_server = op->server; set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); } write_sequnlock(&vnode->cb_lock); - afs_put_cb_interest(afs_v2net(vnode), old_cbi); return 0; } @@ -255,18 +247,12 @@ static void afs_apply_status(struct afs_operation *op, static void afs_apply_callback(struct afs_operation *op, struct afs_vnode_param *vp) { - struct afs_cb_interest *old; struct afs_callback *cb = &vp->scb.callback; struct afs_vnode *vnode = vp->vnode; - if (!afs_cb_is_broken(vp->cb_break_before, vnode, op->cbi)) { + if (!afs_cb_is_broken(vp->cb_break_before, vnode)) { vnode->cb_expires_at = cb->expires_at; - old = rcu_dereference_protected(vnode->cb_interest, - lockdep_is_held(&vnode->cb_lock.lock)); - if (old != op->cbi) { - rcu_assign_pointer(vnode->cb_interest, afs_get_cb_interest(op->cbi)); - afs_put_cb_interest(afs_v2net(vnode), old); - } + vnode->cb_server = op->server; set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); } } @@ -569,13 +555,31 @@ void afs_zap_data(struct afs_vnode *vnode) invalidate_inode_pages2(vnode->vfs_inode.i_mapping); } +/* + * Get the server reinit counter for a vnode's current server. 
+ */ +static bool afs_get_s_break_rcu(struct afs_vnode *vnode, unsigned int *_s_break) +{ + struct afs_server_list *slist = rcu_dereference(vnode->volume->servers); + struct afs_server *server; + int i; + + for (i = 0; i < slist->nr_servers; i++) { + server = slist->servers[i].server; + if (server == vnode->cb_server) { + *_s_break = READ_ONCE(server->cb_s_break); + return true; + } + } + + return false; +} + /* * Check the validity of a vnode/inode. */ bool afs_check_validity(struct afs_vnode *vnode) { - struct afs_cb_interest *cbi; - struct afs_server *server; struct afs_volume *volume = vnode->volume; enum afs_cb_break_reason need_clear = afs_cb_break_no_break; time64_t now = ktime_get_real_seconds(); @@ -588,11 +592,8 @@ bool afs_check_validity(struct afs_vnode *vnode) cb_v_break = READ_ONCE(volume->cb_v_break); cb_break = vnode->cb_break; - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { - cbi = rcu_dereference(vnode->cb_interest); - server = rcu_dereference(cbi->server); - cb_s_break = READ_ONCE(server->cb_s_break); - + if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && + afs_get_s_break_rcu(vnode, &cb_s_break)) { if (vnode->cb_s_break != cb_s_break || vnode->cb_v_break != cb_v_break) { vnode->cb_s_break = cb_s_break; @@ -739,7 +740,6 @@ int afs_drop_inode(struct inode *inode) */ void afs_evict_inode(struct inode *inode) { - struct afs_cb_interest *cbi; struct afs_vnode *vnode; vnode = AFS_FS_I(inode); @@ -756,15 +756,6 @@ void afs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - write_seqlock(&vnode->cb_lock); - cbi = rcu_dereference_protected(vnode->cb_interest, - lockdep_is_held(&vnode->cb_lock.lock)); - if (cbi) { - afs_put_cb_interest(afs_i2net(inode), cbi); - rcu_assign_pointer(vnode->cb_interest, NULL); - } - write_sequnlock(&vnode->cb_lock); - while (!list_empty(&vnode->wb_keys)) { struct afs_wb_key *wbk = list_entry(vnode->wb_keys.next, struct afs_wb_key, vnode_link); diff --git 
a/fs/afs/internal.h b/fs/afs/internal.h index e084936066b0..c64c2b47ece7 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -103,7 +103,6 @@ struct afs_call { struct afs_net *net; /* The network namespace */ struct afs_server *server; /* The fileserver record if fs op (pins ref) */ struct afs_vlserver *vlserver; /* The vlserver record if vl op */ - struct afs_cb_interest *cbi; /* Callback interest for server used */ void *request; /* request data (first part) */ struct iov_iter def_iter; /* Default buffer/data iterator */ struct iov_iter *iter; /* Iterator currently in use */ @@ -375,9 +374,14 @@ struct afs_cell { enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */ unsigned int dns_lookup_count; /* Counter of DNS lookups */ + /* The volumes belonging to this cell */ + struct rb_root volumes; /* Tree of volumes on this server */ + struct hlist_head proc_volumes; /* procfs volume list */ + seqlock_t volume_lock; /* For volumes */ + /* Active fileserver interaction state. */ - struct list_head proc_volumes; /* procfs volume list */ - rwlock_t proc_lock; + struct rb_root fs_servers; /* afs_server (by server UUID) */ + seqlock_t fs_lock; /* For fs_servers */ /* VL server list. */ rwlock_t vl_servers_lock; /* Lock on vl_servers */ @@ -481,7 +485,8 @@ struct afs_server { }; struct afs_addr_list __rcu *addresses; - struct rb_node uuid_rb; /* Link in net->fs_servers */ + struct afs_cell *cell; /* Cell to which belongs (pins ref) */ + struct rb_node uuid_rb; /* Link in cell->fs_servers */ struct list_head probe_link; /* Link in net->fs_probe_list */ struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ @@ -507,9 +512,7 @@ struct afs_server { rwlock_t fs_lock; /* access lock */ /* callback promise management */ - struct rb_root cb_volumes; /* List of volume interests on this server */ unsigned cb_s_break; /* Break-everything counter. 
*/ - seqlock_t cb_break_lock; /* Volume finding lock */ /* Probe state */ unsigned long probed_at; /* Time last probe was dispatched (jiffies) */ @@ -527,37 +530,11 @@ struct afs_server { } probe; }; -/* - * Volume collation in the server's callback interest list. - */ -struct afs_vol_interest { - struct rb_node srv_node; /* Link in server->cb_volumes */ - struct hlist_head cb_interests; /* List of callback interests on the server */ - union { - struct rcu_head rcu; - afs_volid_t vid; /* Volume ID to match */ - }; - unsigned int usage; -}; - -/* - * Interest by a superblock on a server. - */ -struct afs_cb_interest { - struct hlist_node cb_vlink; /* Link in vol_interest->cb_interests */ - struct afs_vol_interest *vol_interest; - struct afs_server *server; /* Server on which this interest resides */ - struct super_block *sb; /* Superblock on which inodes reside */ - struct rcu_head rcu; - refcount_t usage; -}; - /* * Replaceable volume server list. */ struct afs_server_entry { struct afs_server *server; - struct afs_cb_interest *cb_interest; }; struct afs_server_list { @@ -575,11 +552,16 @@ struct afs_server_list { * Live AFS volume management. 
*/ struct afs_volume { - afs_volid_t vid; /* volume ID */ + union { + struct rcu_head rcu; + afs_volid_t vid; /* volume ID */ + }; atomic_t usage; time64_t update_at; /* Time at which to next update */ struct afs_cell *cell; /* Cell to which belongs (pins ref) */ - struct list_head proc_link; /* Link in cell->vl_proc */ + struct rb_node cell_node; /* Link in cell->volumes */ + struct hlist_node proc_link; /* Link in cell->proc_volumes */ + struct super_block __rcu *sb; /* Superblock on which inodes reside */ unsigned long flags; #define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */ #define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */ @@ -587,6 +569,7 @@ struct afs_volume { #define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */ #define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */ #define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */ +#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif @@ -598,7 +581,6 @@ struct afs_volume { rwlock_t cb_v_break_lock; afs_voltype_t type; /* type of volume */ - short error; char type_force; /* force volume type (suppress R/O -> R/W) */ u8 name_len; u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ @@ -659,11 +641,11 @@ struct afs_vnode { afs_lock_type_t lock_type : 8; /* outstanding callback notification on this file */ - struct afs_cb_interest __rcu *cb_interest; /* Server on which this resides */ + void *cb_server; /* Server with callback/filelock */ unsigned int cb_s_break; /* Mass break counter on ->server */ unsigned int cb_v_break; /* Mass break counter on ->volume */ unsigned int cb_break; /* Break counter on vnode */ - seqlock_t cb_lock; /* Lock for ->cb_interest, ->status, ->cb_*break */ + seqlock_t cb_lock; /* Lock for ->cb_server, ->status, ->cb_*break */ time64_t cb_expires_at; /* time at which callback expires */ }; @@ 
-833,7 +815,7 @@ struct afs_operation { /* Fileserver iteration state */ struct afs_addr_cursor ac; struct afs_server_list *server_list; /* Current server list (pins ref) */ - struct afs_cb_interest *cbi; /* Server on which this resides (pins ref) */ + struct afs_server *server; /* Server we're using (ref pinned by server_list) */ struct afs_call *call; unsigned long untried; /* Bitmask of untried servers */ short index; /* Current server */ @@ -907,29 +889,15 @@ extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break *); -extern int afs_register_server_cb_interest(struct afs_vnode *, - struct afs_server_list *, unsigned int); -extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *); -extern void afs_clear_callback_interests(struct afs_net *, struct afs_server_list *); - -static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest *cbi) -{ - if (cbi) - refcount_inc(&cbi->usage); - return cbi; -} - static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) { return vnode->cb_break + vnode->cb_v_break; } static inline bool afs_cb_is_broken(unsigned int cb_break, - const struct afs_vnode *vnode, - const struct afs_cb_interest *cbi) + const struct afs_vnode *vnode) { - return !cbi || cb_break != (vnode->cb_break + - vnode->volume->cb_v_break); + return cb_break != (vnode->cb_break + vnode->volume->cb_v_break); } /* @@ -1182,7 +1150,6 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} * rotate.c */ extern bool afs_select_fileserver(struct afs_operation *); -extern bool afs_select_current_fileserver(struct afs_operation *); extern void afs_dump_edestaddrreq(const struct afs_operation *); /* @@ -1212,7 +1179,6 @@ static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *c op->type = 
call->type; call->op = op; call->key = op->key; - call->cbi = afs_get_cb_interest(op->cbi); call->intr = !(op->flags & AFS_OPERATION_UNINTR); afs_make_call(&op->ac, call, gfp); } diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 256c3eff8c82..309a7b578255 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -209,11 +209,10 @@ static const char afs_vol_types[3][3] = { */ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); - struct afs_volume *vol = list_entry(v, struct afs_volume, proc_link); + struct afs_volume *vol = hlist_entry(v, struct afs_volume, proc_link); /* Display header on line 1 */ - if (v == &cell->proc_volumes) { + if (v == SEQ_START_TOKEN) { seq_puts(m, "USE VID TY NAME\n"); return 0; } @@ -231,8 +230,8 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) { struct afs_cell *cell = PDE_DATA(file_inode(m->file)); - read_lock(&cell->proc_lock); - return seq_list_start_head(&cell->proc_volumes, *_pos); + rcu_read_lock(); + return seq_hlist_start_head_rcu(&cell->proc_volumes, *_pos); } static void *afs_proc_cell_volumes_next(struct seq_file *m, void *v, @@ -240,15 +239,13 @@ static void *afs_proc_cell_volumes_next(struct seq_file *m, void *v, { struct afs_cell *cell = PDE_DATA(file_inode(m->file)); - return seq_list_next(v, &cell->proc_volumes, _pos); + return seq_hlist_next_rcu(v, &cell->proc_volumes, _pos); } static void afs_proc_cell_volumes_stop(struct seq_file *m, void *v) __releases(cell->proc_lock) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); - - read_unlock(&cell->proc_lock); + rcu_read_unlock(); } static const struct seq_operations afs_proc_cell_volumes_ops = { diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index d67b8c7eb3b9..8d5473cd8ea4 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -21,7 +21,8 @@ static bool afs_start_fs_iteration(struct afs_operation *op, struct afs_vnode *vnode) { - struct afs_cb_interest *cbi; + struct 
afs_server *server; + void *cb_server; int i; read_lock(&op->volume->servers_lock); @@ -33,12 +34,12 @@ static bool afs_start_fs_iteration(struct afs_operation *op, op->untried = (1UL << op->server_list->nr_servers) - 1; op->index = READ_ONCE(op->server_list->preferred); - cbi = rcu_dereference_protected(vnode->cb_interest, - lockdep_is_held(&vnode->io_lock)); - if (cbi) { + cb_server = vnode->cb_server; + if (cb_server) { /* See if the vnode's preferred record is still available */ for (i = 0; i < op->server_list->nr_servers; i++) { - if (op->server_list->servers[i].cb_interest == cbi) { + server = op->server_list->servers[i].server; + if (server == cb_server) { op->index = i; goto found_interest; } @@ -55,14 +56,11 @@ static bool afs_start_fs_iteration(struct afs_operation *op, /* Note that the callback promise is effectively broken */ write_seqlock(&vnode->cb_lock); - ASSERTCMP(cbi, ==, rcu_access_pointer(vnode->cb_interest)); - rcu_assign_pointer(vnode->cb_interest, NULL); + ASSERTCMP(cb_server, ==, vnode->cb_server); + vnode->cb_server = NULL; if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) vnode->cb_break++; write_sequnlock(&vnode->cb_lock); - - afs_put_cb_interest(op->net, cbi); - cbi = NULL; } found_interest: @@ -303,8 +301,7 @@ bool afs_select_fileserver(struct afs_operation *op) restart_from_beginning: _debug("restart"); afs_end_cursor(&op->ac); - afs_put_cb_interest(op->net, op->cbi); - op->cbi = NULL; + op->server = NULL; afs_put_serverlist(op->net, op->server_list); op->server_list = NULL; start: @@ -331,13 +328,12 @@ pick_server: /* Pick the untried server with the lowest RTT. If we have outstanding * callbacks, we stick with the server we're already using if we can. 
*/ - if (op->cbi) { - _debug("cbi %u", op->index); + if (op->server) { + _debug("server %u", op->index); if (test_bit(op->index, &op->untried)) goto selected_server; - afs_put_cb_interest(op->net, op->cbi); - op->cbi = NULL; - _debug("nocbi"); + op->server = NULL; + _debug("no server"); } op->index = -1; @@ -372,19 +368,13 @@ selected_server: _debug("USING SERVER: %pU", &server->uuid); - /* Make sure we've got a callback interest record for this server. We - * have to link it in before we send the request as we can be sent a - * break request before we've finished decoding the reply and - * installing the vnode. - */ - error = afs_register_server_cb_interest(vnode, op->server_list, - op->index); - if (error < 0) - goto failed_set_error; - - op->cbi = afs_get_cb_interest( - rcu_dereference_protected(vnode->cb_interest, - lockdep_is_held(&vnode->io_lock))); + op->server = server; + if (vnode->cb_server != server) { + vnode->cb_server = server; + vnode->cb_s_break = server->cb_s_break; + vnode->cb_v_break = vnode->volume->cb_v_break; + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + } read_lock(&server->fs_lock); alist = rcu_dereference_protected(server->addresses, @@ -446,84 +436,6 @@ failed: return false; } -/* - * Select the same fileserver we used for a vnode before and only that - * fileserver. We use this when we have a lock on that file, which is backed - * only by the fileserver we obtained it from. 
- */ -bool afs_select_current_fileserver(struct afs_operation *op) -{ - struct afs_cb_interest *cbi; - struct afs_addr_list *alist; - int error = op->ac.error; - - _enter(""); - - switch (error) { - case SHRT_MAX: - cbi = op->cbi; - if (!cbi) { - op->error = -ESTALE; - op->flags |= AFS_OPERATION_STOP; - return false; - } - - read_lock(&cbi->server->fs_lock); - alist = rcu_dereference_protected(cbi->server->addresses, - lockdep_is_held(&cbi->server->fs_lock)); - afs_get_addrlist(alist); - read_unlock(&cbi->server->fs_lock); - if (!alist) { - op->error = -ESTALE; - op->flags |= AFS_OPERATION_STOP; - return false; - } - - memset(&op->ac, 0, sizeof(op->ac)); - op->ac.alist = alist; - op->ac.index = -1; - goto iterate_address; - - case 0: - default: - /* Success or local failure. Stop. */ - op->error = error; - op->flags |= AFS_OPERATION_STOP; - _leave(" = f [okay/local %d]", error); - return false; - - case -ECONNABORTED: - op->error = afs_abort_to_error(op->ac.abort_code); - op->flags |= AFS_OPERATION_STOP; - _leave(" = f [abort]"); - return false; - - case -ERFKILL: - case -EADDRNOTAVAIL: - case -ENETUNREACH: - case -EHOSTUNREACH: - case -EHOSTDOWN: - case -ECONNREFUSED: - case -ETIMEDOUT: - case -ETIME: - _debug("no conn"); - op->error = error; - goto iterate_address; - } - -iterate_address: - /* Iterate over the current server's address list to try and find an - * address on which it will respond to us. - */ - if (afs_iterate_addresses(&op->ac)) { - _leave(" = t"); - return true; - } - - afs_end_cursor(&op->ac); - return false; -} - /* * Dump cursor state in the case of the error being EDESTADDRREQ. 
*/ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index bd4d8e5efe59..b7fb5f98f80c 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -184,7 +184,6 @@ void afs_put_call(struct afs_call *call) call->type->destructor(call); afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); - afs_put_cb_interest(call->net, call->cbi); afs_put_addrlist(call->alist); kfree(call->request); @@ -550,9 +549,9 @@ static void afs_deliver_to_call(struct afs_call *call) case 0: afs_queue_call_work(call); if (state == AFS_CALL_CL_PROC_REPLY) { - if (call->cbi) + if (call->op) set_bit(AFS_SERVER_FL_MAY_HAVE_CB, - &call->cbi->server->flags); + &call->op->server->flags); goto call_complete; } ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY); diff --git a/fs/afs/security.c b/fs/afs/security.c index ce9de1e6742b..90d852704328 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -170,8 +170,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, break; } - if (afs_cb_is_broken(cb_break, vnode, - rcu_dereference(vnode->cb_interest))) { + if (afs_cb_is_broken(cb_break, vnode)) { changed = true; break; } @@ -201,7 +200,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, } } - if (afs_cb_is_broken(cb_break, vnode, rcu_dereference(vnode->cb_interest))) + if (afs_cb_is_broken(cb_break, vnode)) goto someone_else_changed_it; /* We need a ref on any permits list we want to copy as we'll have to @@ -281,8 +280,7 @@ found: rcu_read_lock(); spin_lock(&vnode->lock); zap = rcu_access_pointer(vnode->permit_cache); - if (!afs_cb_is_broken(cb_break, vnode, rcu_dereference(vnode->cb_interest)) && - zap == permits) + if (!afs_cb_is_broken(cb_break, vnode) && zap == permits) rcu_assign_pointer(vnode->permit_cache, replacement); else zap = replacement; diff --git a/fs/afs/server.c b/fs/afs/server.c index 1c1e315094ae..c51039a077cd 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -132,11 +132,12 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, 
const uuid_t *uu /* * Install a server record in the namespace tree */ -static struct afs_server *afs_install_server(struct afs_net *net, +static struct afs_server *afs_install_server(struct afs_cell *cell, struct afs_server *candidate) { const struct afs_addr_list *alist; struct afs_server *server; + struct afs_net *net = cell->net; struct rb_node **pp, *p; int diff; @@ -193,11 +194,12 @@ exists: /* * Allocate a new server record and mark it active. */ -static struct afs_server *afs_alloc_server(struct afs_net *net, +static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid, struct afs_addr_list *alist) { struct afs_server *server; + struct afs_net *net = cell->net; _enter(""); @@ -212,11 +214,10 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, server->addr_version = alist->version; server->uuid = *uuid; rwlock_init(&server->fs_lock); - server->cb_volumes = RB_ROOT; - seqlock_init(&server->cb_break_lock); init_waitqueue_head(&server->probe_wq); INIT_LIST_HEAD(&server->probe_link); spin_lock_init(&server->probe_lock); + server->cell = cell; afs_inc_servers_outstanding(net); trace_afs_server(server, 1, 1, afs_server_trace_alloc); @@ -275,13 +276,13 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, if (IS_ERR(alist)) return ERR_CAST(alist); - candidate = afs_alloc_server(cell->net, uuid, alist); + candidate = afs_alloc_server(cell, uuid, alist); if (!candidate) { afs_put_addrlist(alist); return ERR_PTR(-ENOMEM); } - server = afs_install_server(cell->net, candidate); + server = afs_install_server(cell, candidate); if (server != candidate) { afs_put_addrlist(alist); kfree(candidate); diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index a35f6951a74a..ed9056703505 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -14,11 +14,9 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) int i; if (slist && refcount_dec_and_test(&slist->usage)) { - for (i = 
0; i < slist->nr_servers; i++) { - afs_put_cb_interest(net, slist->servers[i].cb_interest); + for (i = 0; i < slist->nr_servers; i++) afs_unuse_server(net, slist->servers[i].server, afs_server_trace_put_slist); - } kfree(slist); } } @@ -127,31 +125,5 @@ changed: } } - /* Keep the old callback interest records where possible so that we - * maintain callback interception. - */ - i = 0; - j = 0; - while (i < old->nr_servers && j < new->nr_servers) { - if (new->servers[j].server == old->servers[i].server) { - struct afs_cb_interest *cbi = old->servers[i].cb_interest; - if (cbi) { - new->servers[j].cb_interest = cbi; - refcount_inc(&cbi->usage); - } - i++; - j++; - continue; - } - - if (new->servers[j].server < old->servers[i].server) { - j++; - continue; - } - - i++; - continue; - } - return true; } diff --git a/fs/afs/super.c b/fs/afs/super.c index f92ccdafc729..c77b11b31233 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -485,6 +485,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) goto error; } else { sb->s_d_op = &afs_fs_dentry_operations; + rcu_assign_pointer(as->volume->sb, sb); } _leave(" = 0"); @@ -529,7 +530,6 @@ static void afs_destroy_sbi(struct afs_super_info *as) static void afs_kill_super(struct super_block *sb) { struct afs_super_info *as = AFS_FS_S(sb); - struct afs_net *net = afs_net(as->net_ns); if (as->dyn_root) afs_dynroot_depopulate(sb); @@ -538,8 +538,7 @@ static void afs_kill_super(struct super_block *sb) * deactivating the superblock. 
*/ if (as->volume) - afs_clear_callback_interests( - net, rcu_access_pointer(as->volume->servers)); + rcu_assign_pointer(as->volume->sb, NULL); kill_anon_super(sb); if (as->volume) afs_deactivate_volume(as->volume); @@ -689,7 +688,6 @@ static struct inode *afs_alloc_inode(struct super_block *sb) vnode->volume = NULL; vnode->lock_key = NULL; vnode->permit_cache = NULL; - RCU_INIT_POINTER(vnode->cb_interest, NULL); #ifdef CONFIG_AFS_FSCACHE vnode->cache = NULL; #endif @@ -719,8 +717,6 @@ static void afs_destroy_inode(struct inode *inode) _debug("DESTROY INODE %p", inode); - ASSERTCMP(rcu_access_pointer(vnode->cb_interest), ==, NULL); - atomic_dec(&afs_count_active_inodes); } diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index c61dd9410202..093895c49c21 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -189,13 +189,13 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, struct afs_volume *volume, *pvol = NULL; int ret; - /* Arbitrarily pick the first volume in the list. */ - read_lock(&p->proc_lock); - if (!list_empty(&p->proc_volumes)) - pvol = afs_get_volume(list_first_entry(&p->proc_volumes, - struct afs_volume, proc_link), + /* Arbitrarily pick a volume from the list. */ + read_seqlock_excl(&p->volume_lock); + if (!RB_EMPTY_ROOT(&p->volumes)) + pvol = afs_get_volume(rb_entry(p->volumes.rb_node, + struct afs_volume, cell_node), afs_volume_trace_get_query_alias); - read_unlock(&p->proc_lock); + read_sequnlock_excl(&p->volume_lock); if (!pvol) return 0; @@ -242,7 +242,7 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key) hlist_for_each_entry(p, &cell->net->proc_cells, proc_link) { if (p == cell || p->alias_of) continue; - if (list_empty(&p->proc_volumes)) + if (RB_EMPTY_ROOT(&p->volumes)) continue; if (p->root_volume) continue; /* Ignore cells that have a root.cell volume. 
*/ diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 0393f4910a92..9bc0509e3634 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -12,6 +12,56 @@ unsigned __read_mostly afs_volume_gc_delay = 10; unsigned __read_mostly afs_volume_record_life = 60 * 60; +/* + * Insert a volume into a cell. If there's an existing volume record, that is + * returned instead with a ref held. + */ +static struct afs_volume *afs_insert_volume_into_cell(struct afs_cell *cell, + struct afs_volume *volume) +{ + struct afs_volume *p; + struct rb_node *parent = NULL, **pp; + + write_seqlock(&cell->volume_lock); + + pp = &cell->volumes.rb_node; + while (*pp) { + parent = *pp; + p = rb_entry(parent, struct afs_volume, cell_node); + if (p->vid < volume->vid) { + pp = &(*pp)->rb_left; + } else if (p->vid > volume->vid) { + pp = &(*pp)->rb_right; + } else { + volume = afs_get_volume(p, afs_volume_trace_get_cell_insert); + goto found; + } + } + + rb_link_node_rcu(&volume->cell_node, parent, pp); + rb_insert_color(&volume->cell_node, &cell->volumes); + hlist_add_head_rcu(&volume->proc_link, &cell->proc_volumes); + +found: + write_sequnlock(&cell->volume_lock); + return volume; + +} + +static void afs_remove_volume_from_cell(struct afs_volume *volume) +{ + struct afs_cell *cell = volume->cell; + + if (!hlist_unhashed(&volume->proc_link)) { + trace_afs_volume(volume->vid, atomic_read(&volume->usage), + afs_volume_trace_remove); + write_seqlock(&cell->volume_lock); + hlist_del_rcu(&volume->proc_link); + rb_erase(&volume->cell_node, &cell->volumes); + write_sequnlock(&cell->volume_lock); + } +} + /* * Allocate a volume record and load it up from a vldb record. 
*/ @@ -39,7 +89,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, volume->name_len = vldb->name_len; atomic_set(&volume->usage, 1); - INIT_LIST_HEAD(&volume->proc_link); + INIT_HLIST_NODE(&volume->proc_link); rwlock_init(&volume->servers_lock); rwlock_init(&volume->cb_v_break_lock); memcpy(volume->name, vldb->name, vldb->name_len + 1); @@ -62,6 +112,25 @@ error_0: return ERR_PTR(ret); } +/* + * Look up or allocate a volume record. + */ +static struct afs_volume *afs_lookup_volume(struct afs_fs_context *params, + struct afs_vldb_entry *vldb, + unsigned long type_mask) +{ + struct afs_volume *candidate, *volume; + + candidate = afs_alloc_volume(params, vldb, type_mask); + if (IS_ERR(candidate)) + return candidate; + + volume = afs_insert_volume_into_cell(params->cell, candidate); + if (volume != candidate) + afs_put_volume(params->net, candidate, afs_volume_trace_put_cell_dup); + return volume; +} + /* * Look up a VLDB record for a volume. */ @@ -139,7 +208,7 @@ struct afs_volume *afs_create_volume(struct afs_fs_context *params) } type_mask = 1UL << params->type; - volume = afs_alloc_volume(params, vldb, type_mask); + volume = afs_lookup_volume(params, vldb, type_mask); error: kfree(vldb); @@ -157,11 +226,12 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) ASSERTCMP(volume->cache, ==, NULL); #endif + afs_remove_volume_from_cell(volume); afs_put_serverlist(net, rcu_access_pointer(volume->servers)); afs_put_cell(net, volume->cell); trace_afs_volume(volume->vid, atomic_read(&volume->usage), afs_volume_trace_free); - kfree(volume); + kfree_rcu(volume, rcu); _leave(" [destroyed]"); } @@ -207,10 +277,6 @@ void afs_activate_volume(struct afs_volume *volume) NULL, 0, volume, 0, true); #endif - - write_lock(&volume->cell->proc_lock); - list_add_tail(&volume->proc_link, &volume->cell->proc_volumes); - write_unlock(&volume->cell->proc_lock); } /* @@ -220,10 +286,6 @@ void afs_deactivate_volume(struct afs_volume 
*volume) { _enter("%s", volume->name); - write_lock(&volume->cell->proc_lock); - list_del_init(&volume->proc_link); - write_unlock(&volume->cell->proc_lock); - #ifdef CONFIG_AFS_FSCACHE fscache_relinquish_cookie(volume->cache, NULL, test_bit(AFS_VOLUME_DELETED, &volume->flags)); diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index d0cd112a3720..b0a6e40b4da3 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -854,7 +854,7 @@ void yfs_fs_remove_file(struct afs_operation *op) _enter(""); - if (!test_bit(AFS_SERVER_FL_NO_RM2, &op->cbi->server->flags)) + if (!test_bit(AFS_SERVER_FL_NO_RM2, &op->server->flags)) return yfs_fs_remove_file2(op); call = afs_alloc_flat_call(op->net, &yfs_RXYFSRemoveFile, From 3c4c4075fc61f5c37a0112b1dc8398025dc3e26a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 May 2020 15:51:30 +0100 Subject: [PATCH 357/427] afs: Fix the by-UUID server tree to allow servers with the same UUID Whilst it shouldn't happen, it is possible for multiple fileservers to share a UUID, particularly if an entire cell has been duplicated, UUIDs and all. In such a case, it's not necessarily possible to map the effect of the CB.InitCallBackState3 incoming RPC to a specific server unambiguously by UUID and thus to a specific cell. Indeed, there's a problem whereby multiple server records may need to occupy the same spot in the rb_tree rooted in the afs_net struct. Fix this by allowing servers to form a list, with the head of the list in the tree. When the front entry in the list is removed, the second in the list just replaces it. afs_init_callback_state() then just goes down the line, poking each server in the list. This means that some servers will be unnecessarily poked, unfortunately. An alternative would be to route by call parameters. 
Reported-by: Jeffrey Altman Signed-off-by: David Howells Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation") --- fs/afs/callback.c | 10 +++++++-- fs/afs/internal.h | 4 +++- fs/afs/server.c | 56 ++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 59 insertions(+), 11 deletions(-) diff --git a/fs/afs/callback.c b/fs/afs/callback.c index b4cb9bb63f0a..7d9b23d981bf 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -21,11 +21,17 @@ #include "internal.h" /* - * allow the fileserver to request callback state (re-)initialisation + * Allow the fileserver to request callback state (re-)initialisation. + * Unfortunately, UUIDs are not guaranteed unique. */ void afs_init_callback_state(struct afs_server *server) { - server->cb_s_break++; + rcu_read_lock(); + do { + server->cb_s_break++; + server = rcu_dereference(server->uuid_next); + } while (0); + rcu_read_unlock(); } /* diff --git a/fs/afs/internal.h b/fs/afs/internal.h index c64c2b47ece7..e0dc14d4d8b9 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -486,7 +486,9 @@ struct afs_server { struct afs_addr_list __rcu *addresses; struct afs_cell *cell; /* Cell to which belongs (pins ref) */ - struct rb_node uuid_rb; /* Link in cell->fs_servers */ + struct rb_node uuid_rb; /* Link in net->fs_servers */ + struct afs_server __rcu *uuid_next; /* Next server with same UUID */ + struct afs_server *uuid_prev; /* Previous server with same UUID */ struct list_head probe_link; /* Link in net->fs_probe_list */ struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ diff --git a/fs/afs/server.c b/fs/afs/server.c index c51039a077cd..88593ffcb54e 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -130,13 +130,15 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu } /* - * Install a server record in the namespace tree + * Install a server record in the namespace tree. 
If there's a clash, we stick + * it into a list anchored on whichever afs_server struct is actually in the + * tree. */ static struct afs_server *afs_install_server(struct afs_cell *cell, struct afs_server *candidate) { const struct afs_addr_list *alist; - struct afs_server *server; + struct afs_server *server, *next; struct afs_net *net = cell->net; struct rb_node **pp, *p; int diff; @@ -153,12 +155,30 @@ static struct afs_server *afs_install_server(struct afs_cell *cell, _debug("- consider %p", p); server = rb_entry(p, struct afs_server, uuid_rb); diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t)); - if (diff < 0) + if (diff < 0) { pp = &(*pp)->rb_left; - else if (diff > 0) + } else if (diff > 0) { pp = &(*pp)->rb_right; - else - goto exists; + } else { + if (server->cell == cell) + goto exists; + + /* We have the same UUID representing servers in + * different cells. Append the new server to the list. + */ + for (;;) { + next = rcu_dereference_protected( + server->uuid_next, + lockdep_is_held(&net->fs_lock.lock)); + if (!next) + break; + server = next; + } + rcu_assign_pointer(server->uuid_next, candidate); + candidate->uuid_prev = server; + server = candidate; + goto added_dup; + } } server = candidate; @@ -166,6 +186,7 @@ static struct afs_server *afs_install_server(struct afs_cell *cell, rb_insert_color(&server->uuid_rb, &net->fs_servers); hlist_add_head_rcu(&server->proc_link, &net->fs_proc); +added_dup: write_seqlock(&net->fs_addr_lock); alist = rcu_dereference_protected(server->addresses, lockdep_is_held(&net->fs_addr_lock.lock)); @@ -453,7 +474,7 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) */ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) { - struct afs_server *server; + struct afs_server *server, *next, *prev; int active; while ((server = gc_list)) { @@ -465,7 +486,26 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) if (active == 0) { 
trace_afs_server(server, atomic_read(&server->ref), active, afs_server_trace_gc); - rb_erase(&server->uuid_rb, &net->fs_servers); + next = rcu_dereference_protected( + server->uuid_next, lockdep_is_held(&net->fs_lock.lock)); + prev = server->uuid_prev; + if (!prev) { + /* The one at the front is in the tree */ + if (!next) { + rb_erase(&server->uuid_rb, &net->fs_servers); + } else { + rb_replace_node_rcu(&server->uuid_rb, + &next->uuid_rb, + &net->fs_servers); + next->uuid_prev = NULL; + } + } else { + /* This server is not at the front */ + rcu_assign_pointer(prev->uuid_next, next); + if (next) + next->uuid_prev = prev; + } + list_del(&server->probe_link); hlist_del_rcu(&server->proc_link); if (!hlist_unhashed(&server->addr4_link)) From f11a016a852f32e9c991baf6a036390eac5b4266 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 1 May 2020 22:06:02 +0100 Subject: [PATCH 358/427] afs: Fix afs_statfs() to not let the values go below zero Fix afs_statfs() so that the values of f_bavail and f_bfree don't go "negative" if the number of blocks in use by a volume exceeds the max quota for that volume.
Signed-off-by: David Howells --- fs/afs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/afs/super.c b/fs/afs/super.c index c77b11b31233..b552357b1d13 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -729,7 +729,10 @@ static void afs_get_volume_status_success(struct afs_operation *op) buf->f_blocks = vs->part_max_blocks; else buf->f_blocks = vs->max_quota; - buf->f_bavail = buf->f_bfree = buf->f_blocks - vs->blocks_in_use; + + if (buf->f_blocks > vs->blocks_in_use) + buf->f_bavail = buf->f_bfree = + buf->f_blocks - vs->blocks_in_use; } static const struct afs_operation_ops afs_get_volume_status_operation = { From f3c130e6e6d15822e1553531f91ecc8f3375bac3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 2 May 2020 13:39:57 +0100 Subject: [PATCH 359/427] afs: Don't use probe running state to make decisions outside probe code Don't use the running state for fileserver probes to make decisions about which server to use as the state is cleared at the start of a probe and also intermediate values might be misleading. Instead, add a separate 'latest known' rtt in the afs_server struct and a flag to indicate if the server is known to be responding and update these as and when we know what to change them to. 
Signed-off-by: David Howells --- fs/afs/fs_probe.c | 18 ++++++++++++------ fs/afs/internal.h | 4 +++- fs/afs/rotate.c | 3 ++- fs/afs/server.c | 1 + 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index 442b5e7944ff..c41cf3b2ab89 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -42,10 +42,13 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server bool responded = server->probe.responded; write_seqlock(&net->fs_lock); - if (responded) + if (responded) { list_add_tail(&server->probe_link, &net->fs_probe_slow); - else + } else { + server->rtt = UINT_MAX; + clear_bit(AFS_SERVER_FL_RESPONDING, &server->flags); list_add_tail(&server->probe_link, &net->fs_probe_fast); + } write_sequnlock(&net->fs_lock); afs_schedule_fs_probe(net, server, !responded); @@ -161,12 +164,14 @@ responded: rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); if (rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; + server->rtt = rtt_us; alist->preferred = index; } smp_wmb(); /* Set rtt before responded. 
*/ server->probe.responded = true; set_bit(index, &alist->responded); + set_bit(AFS_SERVER_FL_RESPONDING, &server->flags); out: spin_unlock(&server->probe_lock); @@ -224,7 +229,7 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) { struct wait_queue_entry *waits; struct afs_server *server; - unsigned int rtt = UINT_MAX; + unsigned int rtt = UINT_MAX, rtt_s; bool have_responders = false; int pref = -1, i; @@ -280,10 +285,11 @@ stop: for (i = 0; i < slist->nr_servers; i++) { if (test_bit(i, &untried)) { server = slist->servers[i].server; - if (server->probe.responded && - server->probe.rtt < rtt) { + rtt_s = READ_ONCE(server->rtt); + if (test_bit(AFS_SERVER_FL_RESPONDING, &server->flags) && + rtt_s < rtt) { pref = i; - rtt = server->probe.rtt; + rtt = rtt_s; } remove_wait_queue(&server->probe_wq, &waits[i]); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index e0dc14d4d8b9..a4fe5d1a8b53 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -496,6 +496,7 @@ struct afs_server { struct afs_server *gc_next; /* Next server in manager's list */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; +#define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ #define AFS_SERVER_FL_NOT_READY 1 /* The record is not ready for use */ #define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */ #define AFS_SERVER_FL_VL_FAIL 3 /* Failed to access VL server */ @@ -508,6 +509,7 @@ struct afs_server { atomic_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ + unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Debugging ID for traces */ /* file service access */ @@ -522,7 +524,7 @@ struct afs_server { atomic_t probe_outstanding; spinlock_t probe_lock; struct { - unsigned int rtt; /* RTT as ktime/64 */ + unsigned int rtt; /* RTT in uS */ u32 abort_code; short error; bool responded:1; diff --git a/fs/afs/rotate.c 
b/fs/afs/rotate.c index 8d5473cd8ea4..14863678ae9e 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -341,7 +341,8 @@ pick_server: for (i = 0; i < op->server_list->nr_servers; i++) { struct afs_server *s = op->server_list->servers[i].server; - if (!test_bit(i, &op->untried) || !s->probe.responded) + if (!test_bit(i, &op->untried) || + !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags)) continue; if (s->probe.rtt < rtt) { op->index = i; diff --git a/fs/afs/server.c b/fs/afs/server.c index 88593ffcb54e..039e3488511c 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -239,6 +239,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, INIT_LIST_HEAD(&server->probe_link); spin_lock_init(&server->probe_lock); server->cell = cell; + server->rtt = UINT_MAX; afs_inc_servers_outstanding(net); trace_afs_server(server, 1, 1, afs_server_trace_alloc); From 32275d3f758f1252511709b77b3bab060a0e1d4f Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 2 May 2020 13:44:50 +0100 Subject: [PATCH 360/427] afs: Show a bit more server state in /proc/net/afs/servers Display more information about the state of a server record, including the flags, rtt and break counter plus the probe state for each server in /proc/net/afs/servers. Rearrange the server flags a bit to make them easier to read at a glance in the proc file.
Signed-off-by: David Howells --- fs/afs/internal.h | 16 ++++++++-------- fs/afs/proc.c | 10 +++++++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a4fe5d1a8b53..af0b7fca87db 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -497,15 +497,15 @@ struct afs_server { time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ -#define AFS_SERVER_FL_NOT_READY 1 /* The record is not ready for use */ -#define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */ -#define AFS_SERVER_FL_VL_FAIL 3 /* Failed to access VL server */ -#define AFS_SERVER_FL_UPDATING 4 -#define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */ +#define AFS_SERVER_FL_UPDATING 1 +#define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */ +#define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */ +#define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */ +#define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */ #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ -#define AFS_SERVER_FL_IS_YFS 9 /* Server is YFS not AFS */ -#define AFS_SERVER_FL_NO_RM2 10 /* Fileserver doesn't support YFS.RemoveFile2 */ -#define AFS_SERVER_FL_NEEDS_UPDATE 12 /* Fileserver address list is out of date */ +#define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */ +#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ +#define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ atomic_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 309a7b578255..22d00cf1913d 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -386,9 +386,13 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) 
&server->uuid, atomic_read(&server->ref), atomic_read(&server->active)); - seq_printf(m, " - ALIST v=%u osp=%u r=%lx f=%lx\n", - alist->version, atomic_read(&server->probe_outstanding), - alist->responded, alist->failed); + seq_printf(m, " - info: fl=%lx rtt=%u brk=%x\n", + server->flags, server->rtt, server->cb_s_break); + seq_printf(m, " - probe: last=%d out=%d\n", + (int)(jiffies - server->probed_at) / HZ, + atomic_read(&server->probe_outstanding)); + seq_printf(m, " - ALIST v=%u rsp=%lx f=%lx\n", + alist->version, alist->responded, alist->failed); for (i = 0; i < alist->nr_addrs; i++) seq_printf(m, " [%x] %pISpc%s\n", i, &alist->addrs[i].transport, From 8409f67b6437c4b327ee95a71081b9c7bfee0b00 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Apr 2020 00:02:46 +0100 Subject: [PATCH 361/427] afs: Adjust the fileserver rotation algorithm to reprobe/retry more quickly Adjust the fileserver rotation algorithm so that if we've tried all the addresses on a server (cumulatively over multiple operations) until we've run out of untried addresses, immediately reprobe all that server's interfaces and retry the op at least once before we move onto the next server. Signed-off-by: David Howells --- fs/afs/fs_probe.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ fs/afs/internal.h | 24 ++++++++++++++---------- fs/afs/rotate.c | 29 +++++++++++++++++++++++++++-- 3 files changed, 88 insertions(+), 12 deletions(-) diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index c41cf3b2ab89..b34f74b0f319 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -338,6 +338,18 @@ static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server afs_put_server(net, server, afs_server_trace_put_probe); } +/* + * Probe a server immediately without waiting for its due time to come + * round. This is used when all of the addresses have been tried. 
+ */ +void afs_probe_fileserver(struct afs_net *net, struct afs_server *server) +{ + write_seqlock(&net->fs_lock); + if (!list_empty(&server->probe_link)) + return afs_dispatch_fs_probe(net, server, true); + write_sequnlock(&net->fs_lock); +} + /* * Probe dispatcher to regularly dispatch probes to keep NAT alive. */ @@ -411,3 +423,38 @@ again: _leave(" [quiesce]"); } } + +/* + * Wait for a probe on a particular fileserver to complete for 2s. + */ +int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr) +{ + struct wait_queue_entry wait; + unsigned long timo = 2 * HZ; + + if (atomic_read(&server->probe_outstanding) == 0) + goto dont_wait; + + init_wait_entry(&wait, 0); + for (;;) { + prepare_to_wait_event(&server->probe_wq, &wait, + is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); + if (timo == 0 || + server->probe.responded || + atomic_read(&server->probe_outstanding) == 0 || + (is_intr && signal_pending(current))) + break; + timo = schedule_timeout(timo); + } + + finish_wait(&server->probe_wq, &wait); + +dont_wait: + if (server->probe.responded) + return 0; + if (is_intr && signal_pending(current)) + return -ERESTARTSYS; + if (timo == 0) + return -ETIME; + return -EDESTADDRREQ; +} diff --git a/fs/afs/internal.h b/fs/afs/internal.h index af0b7fca87db..e1621b0670cc 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -826,16 +826,18 @@ struct afs_operation { unsigned short nr_iterations; /* Number of server iterations */ unsigned int flags; -#define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ -#define AFS_OPERATION_VBUSY 0x0002 /* Set if seen VBUSY */ -#define AFS_OPERATION_VMOVED 0x0004 /* Set if seen VMOVED */ -#define AFS_OPERATION_VNOVOL 0x0008 /* Set if seen VNOVOL */ -#define AFS_OPERATION_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ -#define AFS_OPERATION_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... 
*/ -#define AFS_OPERATION_UNINTR 0x0040 /* Set if op is uninterruptible */ -#define AFS_OPERATION_DOWNGRADE 0x0080 /* Set to retry with downgraded opcode */ -#define AFS_OPERATION_LOCK_0 0x0100 /* Set if have io_lock on file[0] */ -#define AFS_OPERATION_LOCK_1 0x0200 /* Set if have io_lock on file[1] */ +#define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ +#define AFS_OPERATION_VBUSY 0x0002 /* Set if seen VBUSY */ +#define AFS_OPERATION_VMOVED 0x0004 /* Set if seen VMOVED */ +#define AFS_OPERATION_VNOVOL 0x0008 /* Set if seen VNOVOL */ +#define AFS_OPERATION_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ +#define AFS_OPERATION_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ +#define AFS_OPERATION_UNINTR 0x0040 /* Set if op is uninterruptible */ +#define AFS_OPERATION_DOWNGRADE 0x0080 /* Set to retry with downgraded opcode */ +#define AFS_OPERATION_LOCK_0 0x0100 /* Set if have io_lock on file[0] */ +#define AFS_OPERATION_LOCK_1 0x0200 /* Set if have io_lock on file[1] */ +#define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */ +#define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */ }; /* @@ -1055,7 +1057,9 @@ static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n, extern void afs_fileserver_probe_result(struct afs_call *); extern void afs_fs_probe_fileserver(struct afs_net *, struct afs_server *, struct key *, bool); extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long); +extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); +extern int afs_wait_for_one_fs_probe(struct afs_server *, bool); /* * inode.c diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 14863678ae9e..6a0935cb822f 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -369,6 +369,7 @@ selected_server: _debug("USING SERVER: %pU", &server->uuid); + op->flags |= 
AFS_OPERATION_RETRY_SERVER; op->server = server; if (vnode->cb_server != server) { vnode->cb_server = server; @@ -383,6 +384,7 @@ selected_server: afs_get_addrlist(alist); read_unlock(&server->fs_lock); +retry_server: memset(&op->ac, 0, sizeof(op->ac)); if (!op->ac.alist) @@ -398,13 +400,36 @@ iterate_address: * address on which it will respond to us. */ if (!afs_iterate_addresses(&op->ac)) - goto next_server; + goto out_of_addresses; - _debug("address [%u] %u/%u", op->index, op->ac.index, op->ac.alist->nr_addrs); + _debug("address [%u] %u/%u %pISp", + op->index, op->ac.index, op->ac.alist->nr_addrs, + &op->ac.alist->addrs[op->ac.index].transport); _leave(" = t"); return true; +out_of_addresses: + /* We've now had a failure to respond on all of a server's addresses - + * immediately probe them again and consider retrying the server. + */ + afs_probe_fileserver(op->net, op->server); + if (op->flags & AFS_OPERATION_RETRY_SERVER) { + alist = op->ac.alist; + error = afs_wait_for_one_fs_probe( + op->server, !(op->flags & AFS_OPERATION_UNINTR)); + switch (error) { + case 0: + op->flags &= ~AFS_OPERATION_RETRY_SERVER; + goto retry_server; + case -ERESTARTSYS: + goto failed_set_error; + case -ETIME: + case -EDESTADDRREQ: + goto next_server; + } + } + next_server: _debug("next"); afs_end_cursor(&op->ac); From 8e84a61a9c5ce55c5707448bb3c2cc544fccaa21 Mon Sep 17 00:00:00 2001 From: Kenneth D'souza Date: Thu, 4 Jun 2020 21:14:41 +0530 Subject: [PATCH 362/427] cifs: dump Security Type info in DebugData Currently the end user is unaware with what sec type the cifs share is mounted if no sec= option is parsed. With this patch one can easily check from DebugData. 
Example: 1) Name: x.x.x.x Uses: 1 Capability: 0x8001f3fc Session Status: 1 Security type: RawNTLMSSP Signed-off-by: Kenneth D'souza Signed-off-by: Roberto Bergantinos Corpas Signed-off-by: Steve French Acked-by: Aurelien Aptel --- fs/cifs/cifs_debug.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 916567d770f5..3ad1a98fd567 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -221,6 +221,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) struct cifs_ses *ses; struct cifs_tcon *tcon; int i, j; + const char *security_types[] = {"Unspecified", "LANMAN", "NTLM", + "NTLMv2", "RawNTLMSSP", "Kerberos"}; seq_puts(m, "Display Internal CIFS Data Structures for Debugging\n" @@ -375,6 +377,10 @@ skip_rdma: ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->status); } + + seq_printf(m,"Security type: %s\n", + security_types[server->ops->select_sectype(server, ses->sectype)]); + if (server->rdma) seq_printf(m, "RDMA\n\t"); seq_printf(m, "TCP status: %d Instance: %d\n\tLocal Users To " From 7c06514afd38ed7a4b83edfd39ab033c804b4cf3 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 4 Jun 2020 17:23:55 +0200 Subject: [PATCH 363/427] cifs: multichannel: always zero struct cifs_io_parms SMB2_read/SMB2_write check and use cifs_io_parms->server, which might be uninitialized memory. This change makes all callers zero-initialize the struct. 
Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/file.c | 4 ++-- fs/cifs/inode.c | 2 +- fs/cifs/link.c | 6 +++--- fs/cifs/smb2ops.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 226bfa5e9444..de130f3aa452 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1853,7 +1853,7 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, unsigned int xid; struct dentry *dentry = open_file->dentry; struct cifsInodeInfo *cifsi = CIFS_I(d_inode(dentry)); - struct cifs_io_parms io_parms; + struct cifs_io_parms io_parms = {0}; cifs_dbg(FYI, "write %zd bytes to offset %lld of %pd\n", write_size, *offset, dentry); @@ -4014,7 +4014,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) unsigned int xid; char *cur_offset; struct cifsFileInfo *open_file; - struct cifs_io_parms io_parms; + struct cifs_io_parms io_parms = {0}; int buf_type = CIFS_NO_BUFFER; __u32 pid; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b94c6398da94..5416ff339401 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -447,7 +447,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, struct cifs_tcon *tcon; struct cifs_fid fid; struct cifs_open_parms oparms; - struct cifs_io_parms io_parms; + struct cifs_io_parms io_parms = {0}; char buf[24]; unsigned int bytes_read; char *pbuf; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index a25ef35b023e..2072458e6e24 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -308,7 +308,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, int oplock = 0; struct cifs_fid fid; struct cifs_open_parms oparms; - struct cifs_io_parms io_parms; + struct cifs_io_parms io_parms = {0}; int buf_type = CIFS_NO_BUFFER; FILE_ALL_INFO file_info; @@ -352,7 +352,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, int oplock = 0; struct cifs_fid fid; struct cifs_open_parms oparms; - struct cifs_io_parms io_parms; + 
struct cifs_io_parms io_parms = {0}; oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; @@ -389,7 +389,7 @@ smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, int rc; struct cifs_fid fid; struct cifs_open_parms oparms; - struct cifs_io_parms io_parms; + struct cifs_io_parms io_parms = {0}; int buf_type = CIFS_NO_BUFFER; __le16 *utf16_path; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index dec055d7c2f4..49c5c80f5d36 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4584,7 +4584,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); int rc = -EPERM; FILE_ALL_INFO *buf = NULL; - struct cifs_io_parms io_parms; + struct cifs_io_parms io_parms = {0}; __u32 oplock = 0; struct cifs_fid fid; struct cifs_open_parms oparms; From 352d96f3acc6e02099f58a24d5cabce7f8ee061f Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Sun, 31 May 2020 12:38:22 -0500 Subject: [PATCH 364/427] cifs: multichannel: move channel selection above transport layer Move the channel (TCP_Server_Info*) selection from the transport layer to higher in the call stack so that: - credit handling is done with the server that will actually be used to send. * ->wait_mtu_credit * ->set_credits / set_credits * ->add_credits / add_credits * add_credits_and_wake_if - potential reconnection (smb2_reconnect) done when initializing a request is checked and done with the server that will actually be used to send.
To do this: - remove the cifs_pick_channel() call out of compound_send_recv() - select channel and pass it down by adding a cifs_pick_channel(ses) call in: - smb311_posix_mkdir - SMB2_open - SMB2_ioctl - __SMB2_close - query_info - SMB2_change_notify - SMB2_flush - smb2_async_readv (if none provided in context param) - SMB2_read (if none provided in context param) - smb2_async_writev (if none provided in context param) - SMB2_write (if none provided in context param) - SMB2_query_directory - send_set_info - SMB2_oplock_break - SMB311_posix_qfs_info - SMB2_QFS_info - SMB2_QFS_attr - smb2_lockv - SMB2_lease_break - smb2_compound_op - smb2_set_ea - smb2_ioctl_query_info - smb2_query_dir_first - smb2_query_info_compound - smb2_query_symlink - cifs_writepages - cifs_write_from_iter - cifs_send_async_read - cifs_read - cifs_readpages - add TCP_Server_Info *server param argument to: - cifs_send_recv - compound_send_recv - SMB2_open_init - SMB2_query_info_init - SMB2_set_info_init - SMB2_close_init - SMB2_ioctl_init - smb2_ioctl_req_init - SMB2_query_directory_init - SMB2_notify_init - SMB2_flush_init - build_qfs_info_req - smb2_hdr_assemble - smb2_reconnect - fill_small_buf - smb2_plain_req_init - __smb2_plain_req_init The read/write codepath is different than the rest as it is using pages, io iterators and async calls. To deal with those we add a server pointer in the cifs_writedata/cifs_readdata/cifs_io_parms context struct and set it in: - cifs_writepages (wdata) - cifs_write_from_iter (wdata) - cifs_readpages (rdata) - cifs_send_async_read (rdata) The [rw]data->server pointer is eventually copied to cifs_io_parms->server to pass it down to SMB2_read/SMB2_write. If SMB2_read/SMB2_write is called from a different place that doesn't set the server field it will pick a channel. Some places do not pick a channel and just use ses->server or cifs_ses_server(ses). All cifs_ses_server(ses) calls are in codepaths involving negprot/sess.setup.
- SMB2_negotiate (binding channel) - SMB2_sess_alloc_buffer (binding channel) - SMB2_echo (uses provided one) - SMB2_logoff (uses master) - SMB2_tdis (uses master) (list not exhaustive) Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 3 + fs/cifs/cifsproto.h | 2 + fs/cifs/file.c | 32 ++-- fs/cifs/link.c | 2 +- fs/cifs/smb2inode.c | 44 ++++-- fs/cifs/smb2ops.c | 79 ++++++---- fs/cifs/smb2pdu.c | 353 +++++++++++++++++++++++++++----------------- fs/cifs/smb2proto.h | 25 +++- fs/cifs/transport.c | 14 +- 9 files changed, 356 insertions(+), 198 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 4d261fd78fcb..c0cbbd0bbb1d 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1335,6 +1335,7 @@ struct cifs_io_parms { __u64 offset; unsigned int length; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; }; struct cifs_aio_ctx { @@ -1382,6 +1383,7 @@ struct cifs_readdata { struct cifs_readdata *rdata, struct iov_iter *iter); struct kvec iov[2]; + struct TCP_Server_Info *server; #ifdef CONFIG_CIFS_SMB_DIRECT struct smbd_mr *mr; #endif @@ -1408,6 +1410,7 @@ struct cifs_writedata { pid_t pid; unsigned int bytes; int result; + struct TCP_Server_Info *server; #ifdef CONFIG_CIFS_SMB_DIRECT struct smbd_mr *mr; #endif diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 9767f9b5d315..a25a46237f9f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -98,9 +98,11 @@ extern int cifs_call_async(struct TCP_Server_Info *server, const struct cifs_credits *exist_credits); extern struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses); extern int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, struct smb_rqst *rqst, int *resp_buf_type, const int flags, struct kvec *resp_iov); extern int compound_send_recv(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, const int flags, const int num_rqst, struct smb_rqst *rqst, int 
*resp_buf_type, struct kvec *resp_iov); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index de130f3aa452..2ca9b387d216 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2292,8 +2292,6 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, struct address_space *mapping, struct writeback_control *wbc) { int rc; - struct TCP_Server_Info *server = - tlink_tcon(wdata->cfile->tlink)->ses->server; wdata->sync_mode = wbc->sync_mode; wdata->nr_pages = nr_pages; @@ -2305,14 +2303,15 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz; wdata->pid = wdata->cfile->pid; - rc = adjust_credits(server, &wdata->credits, wdata->bytes); + rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes); if (rc) return rc; if (wdata->cfile->invalidHandle) rc = -EAGAIN; else - rc = server->ops->async_writev(wdata, cifs_writedata_release); + rc = wdata->server->ops->async_writev(wdata, + cifs_writedata_release); return rc; } @@ -2349,7 +2348,8 @@ static int cifs_writepages(struct address_space *mapping, range_whole = true; scanned = true; } - server = cifs_sb_master_tcon(cifs_sb)->ses->server; + server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses); + retry: while (!done && index <= end) { unsigned int i, nr_pages, found_pages, wsize; @@ -2403,6 +2403,7 @@ retry: wdata->credits = credits_on_stack; wdata->cfile = cfile; + wdata->server = server; cfile = NULL; if (!wdata->cfile) { @@ -2806,8 +2807,7 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, unsigned int wsize; struct cifs_credits credits; int rc; - struct TCP_Server_Info *server = - tlink_tcon(wdata->cfile->tlink)->ses->server; + struct TCP_Server_Info *server = wdata->server; do { if (wdata->cfile->invalidHandle) { @@ -2893,7 +2893,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, else pid = current->tgid; - server = tlink_tcon(open_file->tlink)->ses->server; + 
server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); xid = get_xid(); do { @@ -2997,6 +2997,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, wdata->nr_pages = nr_pages; wdata->offset = (__u64)offset; wdata->cfile = cifsFileInfo_get(open_file); + wdata->server = server; wdata->pid = pid; wdata->bytes = cur_len; wdata->pagesz = PAGE_SIZE; @@ -3538,8 +3539,10 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata, unsigned int rsize; struct cifs_credits credits; int rc; - struct TCP_Server_Info *server = - tlink_tcon(rdata->cfile->tlink)->ses->server; + struct TCP_Server_Info *server; + + /* XXX: should we pick a new channel here? */ + server = rdata->server; do { if (rdata->cfile->invalidHandle) { @@ -3618,7 +3621,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, size_t start; struct iov_iter direct_iov = ctx->iter; - server = tlink_tcon(open_file->tlink)->ses->server; + server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -3702,6 +3705,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->tailsz = PAGE_SIZE; } + rdata->server = server; rdata->cfile = cifsFileInfo_get(open_file); rdata->nr_pages = npages; rdata->offset = offset; @@ -4031,7 +4035,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) } open_file = file->private_data; tcon = tlink_tcon(open_file->tlink); - server = tcon->ses->server; + server = cifs_pick_channel(tcon->ses); if (!server->ops->sync_read) { free_xid(xid); @@ -4070,6 +4074,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) io_parms.tcon = tcon; io_parms.offset = *offset; io_parms.length = current_read_size; + io_parms.server = server; rc = server->ops->sync_read(xid, &open_file->fid, &io_parms, &bytes_read, &cur_offset, &buf_type); @@ -4372,7 +4377,7 @@ static int 
cifs_readpages(struct file *file, struct address_space *mapping, pid = current->tgid; rc = 0; - server = tlink_tcon(open_file->tlink)->ses->server; + server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", __func__, file, mapping, num_pages); @@ -4443,6 +4448,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, } rdata->cfile = cifsFileInfo_get(open_file); + rdata->server = server; rdata->mapping = mapping; rdata->offset = offset; rdata->bytes = bytes; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 2072458e6e24..c381d2d03ef6 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -450,7 +450,7 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, int rc; struct cifs_fid fid; struct cifs_open_parms oparms; - struct cifs_io_parms io_parms; + struct cifs_io_parms io_parms = {0}; __le16 *utf16_path; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct kvec iov[2]; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index fa86c78384c3..0a116fc490a9 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -74,6 +74,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_fid fid; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server; int num_rqst = 0; int resp_buftype[3]; struct smb2_query_info_rsp *qi_rsp = NULL; @@ -89,6 +90,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst = &vars->rqst[0]; rsp_iov = &vars->rsp_iov[0]; + server = cifs_pick_channel(ses); + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -115,7 +118,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst[num_rqst].rq_iov = &vars->open_iov[0]; rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE; - rc = SMB2_open_init(tcon, &rqst[num_rqst], &oplock, &vars->oparms, + rc = SMB2_open_init(tcon, server, + &rqst[num_rqst], &oplock, &vars->oparms, utf16_path); 
kfree(utf16_path); if (rc) @@ -133,7 +137,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst[num_rqst].rq_nvec = 1; if (cfile) - rc = SMB2_query_info_init(tcon, &rqst[num_rqst], + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], cfile->fid.persistent_fid, cfile->fid.volatile_fid, FILE_ALL_INFORMATION, @@ -141,10 +146,11 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, sizeof(struct smb2_file_all_info) + PATH_MAX * 2, 0, NULL); else { - rc = SMB2_query_info_init(tcon, &rqst[num_rqst], + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], COMPOUND_FID, COMPOUND_FID, - FILE_ALL_INFORMATION, + FILE_ALL_INFORMATION, SMB2_O_INFO_FILE, 0, sizeof(struct smb2_file_all_info) + PATH_MAX * 2, 0, NULL); @@ -177,7 +183,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ data[0] = &delete_pending[0]; - rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], COMPOUND_FID, COMPOUND_FID, current->tgid, FILE_DISPOSITION_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); @@ -194,7 +201,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, size[0] = 8; /* sizeof __le64 */ data[0] = ptr; - rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], COMPOUND_FID, COMPOUND_FID, current->tgid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); @@ -213,13 +221,15 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, data[0] = ptr; if (cfile) - rc = SMB2_set_info_init(tcon, &rqst[num_rqst], + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], cfile->fid.persistent_fid, cfile->fid.volatile_fid, current->tgid, FILE_BASIC_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); else { - rc = SMB2_set_info_init(tcon, &rqst[num_rqst], + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], COMPOUND_FID, 
COMPOUND_FID, current->tgid, FILE_BASIC_INFORMATION, @@ -253,13 +263,15 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, data[1] = (__le16 *)ptr; if (cfile) - rc = SMB2_set_info_init(tcon, &rqst[num_rqst], + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], cfile->fid.persistent_fid, cfile->fid.volatile_fid, current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); else { - rc = SMB2_set_info_init(tcon, &rqst[num_rqst], + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], COMPOUND_FID, COMPOUND_FID, current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); @@ -289,7 +301,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, size[1] = len + 2 /* null */; data[1] = (__le16 *)ptr; - rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], COMPOUND_FID, COMPOUND_FID, current->tgid, FILE_LINK_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); @@ -312,7 +325,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, /* Close */ rqst[num_rqst].rq_iov = &vars->close_iov[0]; rqst[num_rqst].rq_nvec = 1; - rc = SMB2_close_init(tcon, &rqst[num_rqst], COMPOUND_FID, + rc = SMB2_close_init(tcon, server, + &rqst[num_rqst], COMPOUND_FID, COMPOUND_FID, false); smb2_set_related(&rqst[num_rqst]); if (rc) @@ -323,11 +337,13 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, if (cfile) { cifsFileInfo_put(cfile); cfile = NULL; - rc = compound_send_recv(xid, ses, flags, num_rqst - 2, + rc = compound_send_recv(xid, ses, server, + flags, num_rqst - 2, &rqst[1], &resp_buftype[1], &rsp_iov[1]); } else - rc = compound_send_recv(xid, ses, flags, num_rqst, + rc = compound_send_recv(xid, ses, server, + flags, num_rqst, rqst, resp_buftype, rsp_iov); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 49c5c80f5d36..e97eb0050a0e 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -708,7 +708,8 @@ int open_shroot(unsigned int 
xid, struct cifs_tcon *tcon, oparms.fid = pfid; oparms.reconnect = false; - rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &utf16_path); + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, &utf16_path); if (rc) goto oshr_free; smb2_set_next_command(tcon, &rqst[0]); @@ -717,7 +718,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, rqst[1].rq_iov = qi_iov; rqst[1].rq_nvec = 1; - rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, + rc = SMB2_query_info_init(tcon, server, + &rqst[1], COMPOUND_FID, COMPOUND_FID, FILE_ALL_INFORMATION, SMB2_O_INFO_FILE, 0, sizeof(struct smb2_file_all_info) + @@ -727,7 +729,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); - rc = compound_send_recv(xid, ses, flags, 2, rqst, + rc = compound_send_recv(xid, ses, server, + flags, 2, rqst, resp_buftype, rsp_iov); mutex_lock(&tcon->crfid.fid_mutex); @@ -1102,6 +1105,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb) { struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); __le16 *utf16_path = NULL; int ea_name_len = strlen(ea_name); int flags = 0; @@ -1190,7 +1194,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path); + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, utf16_path); if (rc) goto sea_exit; smb2_set_next_command(tcon, &rqst[0]); @@ -1216,7 +1221,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, size[0] = len; data[0] = ea; - rc = SMB2_set_info_init(tcon, &rqst[1], COMPOUND_FID, + rc = SMB2_set_info_init(tcon, server, + &rqst[1], COMPOUND_FID, COMPOUND_FID, current->tgid, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); @@ -1228,10 +1234,12 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, memset(&close_iov, 0, sizeof(close_iov)); rqst[2].rq_iov = 
close_iov; rqst[2].rq_nvec = 1; - rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); + rc = SMB2_close_init(tcon, server, + &rqst[2], COMPOUND_FID, COMPOUND_FID, false); smb2_set_related(&rqst[2]); - rc = compound_send_recv(xid, ses, flags, 3, rqst, + rc = compound_send_recv(xid, ses, server, + flags, 3, rqst, resp_buftype, rsp_iov); /* no need to bump num_remote_opens because handle immediately closed */ @@ -1473,6 +1481,7 @@ smb2_ioctl_query_info(const unsigned int xid, struct smb_rqst *rqst; struct kvec *rsp_iov; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); char __user *arg = (char __user *)p; struct smb_query_info qi; struct smb_query_info __user *pqi; @@ -1505,7 +1514,7 @@ smb2_ioctl_query_info(const unsigned int xid, return -EINVAL; } - if (!ses || !(ses->server)) { + if (!ses || !server) { kfree(vars); return -EIO; } @@ -1552,7 +1561,8 @@ smb2_ioctl_query_info(const unsigned int xid, oparms.desired_access = FILE_READ_ATTRIBUTES | READ_CONTROL; } - rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, path); + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, path); if (rc) goto iqinf_exit; smb2_set_next_command(tcon, &rqst[0]); @@ -1566,7 +1576,8 @@ smb2_ioctl_query_info(const unsigned int xid, rqst[1].rq_iov = &vars->io_iov[0]; rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; - rc = SMB2_ioctl_init(tcon, &rqst[1], + rc = SMB2_ioctl_init(tcon, server, + &rqst[1], COMPOUND_FID, COMPOUND_FID, qi.info_type, true, buffer, qi.output_buffer_length, @@ -1585,7 +1596,8 @@ smb2_ioctl_query_info(const unsigned int xid, size[0] = 8; data[0] = buffer; - rc = SMB2_set_info_init(tcon, &rqst[1], + rc = SMB2_set_info_init(tcon, server, + &rqst[1], COMPOUND_FID, COMPOUND_FID, current->tgid, FILE_END_OF_FILE_INFORMATION, @@ -1595,7 +1607,8 @@ smb2_ioctl_query_info(const unsigned int xid, rqst[1].rq_iov = &vars->qi_iov[0]; rqst[1].rq_nvec = 1; - rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, + 
rc = SMB2_query_info_init(tcon, server, + &rqst[1], COMPOUND_FID, COMPOUND_FID, qi.file_info_class, qi.info_type, qi.additional_information, qi.input_buffer_length, @@ -1615,12 +1628,14 @@ smb2_ioctl_query_info(const unsigned int xid, rqst[2].rq_iov = &vars->close_iov[0]; rqst[2].rq_nvec = 1; - rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); + rc = SMB2_close_init(tcon, server, + &rqst[2], COMPOUND_FID, COMPOUND_FID, false); if (rc) goto iqinf_exit; smb2_set_related(&rqst[2]); - rc = compound_send_recv(xid, ses, flags, 3, rqst, + rc = compound_send_recv(xid, ses, server, + flags, 3, rqst, resp_buftype, rsp_iov); if (rc) goto iqinf_exit; @@ -2172,6 +2187,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct smb2_query_directory_rsp *qd_rsp = NULL; struct smb2_create_rsp *op_rsp = NULL; + struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) @@ -2196,7 +2212,8 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = fid; oparms.reconnect = false; - rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path); + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, utf16_path); if (rc) goto qdf_free; smb2_set_next_command(tcon, &rqst[0]); @@ -2209,7 +2226,8 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rqst[1].rq_iov = qd_iov; rqst[1].rq_nvec = SMB2_QUERY_DIRECTORY_IOV_SIZE; - rc = SMB2_query_directory_init(xid, tcon, &rqst[1], + rc = SMB2_query_directory_init(xid, tcon, server, + &rqst[1], COMPOUND_FID, COMPOUND_FID, 0, srch_inf->info_level); if (rc) @@ -2217,7 +2235,8 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); - rc = compound_send_recv(xid, tcon->ses, flags, 2, rqst, + rc = compound_send_recv(xid, tcon->ses, server, + flags, 2, rqst, resp_buftype, rsp_iov); /* If the open failed 
there is nothing to do */ @@ -2422,6 +2441,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb) { struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); int flags = 0; struct smb_rqst rqst[3]; int resp_buftype[3]; @@ -2452,7 +2472,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path); + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, utf16_path); if (rc) goto qic_exit; smb2_set_next_command(tcon, &rqst[0]); @@ -2461,7 +2482,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, rqst[1].rq_iov = qi_iov; rqst[1].rq_nvec = 1; - rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID, + rc = SMB2_query_info_init(tcon, server, + &rqst[1], COMPOUND_FID, COMPOUND_FID, class, type, 0, output_len, 0, NULL); @@ -2474,12 +2496,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, rqst[2].rq_iov = close_iov; rqst[2].rq_nvec = 1; - rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); + rc = SMB2_close_init(tcon, server, + &rqst[2], COMPOUND_FID, COMPOUND_FID, false); if (rc) goto qic_exit; smb2_set_related(&rqst[2]); - rc = compound_send_recv(xid, ses, flags, 3, rqst, + rc = compound_send_recv(xid, ses, server, + flags, 3, rqst, resp_buftype, rsp_iov); if (rc) { free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); @@ -2811,6 +2835,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct kvec err_iov = {NULL, 0}; struct smb2_err_rsp *err_buf = NULL; struct smb2_symlink_err_rsp *symlink; + struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); unsigned int sub_len; unsigned int sub_offset; unsigned int print_len; @@ -2856,7 +2881,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; 
oparms.reconnect = false; - rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path); + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, utf16_path); if (rc) goto querty_exit; smb2_set_next_command(tcon, &rqst[0]); @@ -2867,7 +2893,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rqst[1].rq_iov = io_iov; rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; - rc = SMB2_ioctl_init(tcon, &rqst[1], fid.persistent_fid, + rc = SMB2_ioctl_init(tcon, server, + &rqst[1], fid.persistent_fid, fid.volatile_fid, FSCTL_GET_REPARSE_POINT, true /* is_fctl */, NULL, 0, CIFSMaxBufSize - @@ -2885,13 +2912,15 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rqst[2].rq_iov = close_iov; rqst[2].rq_nvec = 1; - rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); + rc = SMB2_close_init(tcon, server, + &rqst[2], COMPOUND_FID, COMPOUND_FID, false); if (rc) goto querty_exit; smb2_set_related(&rqst[2]); - rc = compound_send_recv(xid, tcon->ses, flags, 3, rqst, + rc = compound_send_recv(xid, tcon->ses, server, + flags, 3, rqst, resp_buftype, rsp_iov); create_rsp = rsp_iov[0].iov_base; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 12de0af12f75..fe5a8452b213 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -98,14 +98,13 @@ int smb3_encryption_required(const struct cifs_tcon *tcon) static void smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, - const struct cifs_tcon *tcon) + const struct cifs_tcon *tcon, + struct TCP_Server_Info *server) { shdr->ProtocolId = SMB2_PROTO_NUMBER; shdr->StructureSize = cpu_to_le16(64); shdr->Command = smb2_cmd; - if (tcon && tcon->ses && tcon->ses->server) { - struct TCP_Server_Info *server = tcon->ses->server; - + if (server) { spin_lock(&server->req_lock); /* Request up to 10 credits but don't go over the limit. 
*/ if (server->credits >= server->max_credits) @@ -125,8 +124,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ - if ((tcon->ses) && (tcon->ses->server) && - (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + if (server && (server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) shdr->CreditCharge = cpu_to_le16(1); /* else CreditCharge MBZ */ @@ -148,8 +146,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, /* if (tcon->share_flags & SHI1005_FLAGS_DFS) shdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */ - if (tcon->ses && tcon->ses->server && tcon->ses->server->sign && - !smb3_encryption_required(tcon)) + if (server && server->sign && !smb3_encryption_required(tcon)) shdr->Flags |= SMB2_FLAGS_SIGNED; out: return; @@ -267,12 +264,12 @@ static inline int __smb2_reconnect(const struct nls_table *nlsc, #endif static int -smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) +smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + struct TCP_Server_Info *server) { int rc; struct nls_table *nls_codepage; struct cifs_ses *ses; - struct TCP_Server_Info *server; int retries; /* @@ -301,12 +298,10 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) } } if ((!tcon->ses) || (tcon->ses->status == CifsExiting) || - (!tcon->ses->server)) + (!tcon->ses->server) || !server) return -EIO; ses = tcon->ses; - server = ses->server; - retries = server->nr_targets; /* @@ -439,7 +434,9 @@ failed: } static void -fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, +fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, + struct TCP_Server_Info *server, + void *buf, unsigned int *total_len) { struct smb2_sync_pdu *spdu = (struct smb2_sync_pdu *)buf; @@ -452,7 +449,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, */ memset(buf, 0, 256); - 
smb2_hdr_assemble(&spdu->sync_hdr, smb2_command, tcon); + smb2_hdr_assemble(&spdu->sync_hdr, smb2_command, tcon, server); spdu->StructureSize2 = cpu_to_le16(parmsize); *total_len = parmsize + sizeof(struct smb2_sync_hdr); @@ -464,7 +461,8 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, * function must have filled in request_buf pointer. */ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, - void **request_buf, unsigned int *total_len) + struct TCP_Server_Info *server, + void **request_buf, unsigned int *total_len) { /* BB eventually switch this to SMB2 specific small buf size */ if (smb2_command == SMB2_SET_INFO) @@ -476,7 +474,7 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, return -ENOMEM; } - fill_small_buf(smb2_command, tcon, + fill_small_buf(smb2_command, tcon, server, (struct smb2_sync_hdr *)(*request_buf), total_len); @@ -490,27 +488,30 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, } static int smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, + struct TCP_Server_Info *server, void **request_buf, unsigned int *total_len) { int rc; - rc = smb2_reconnect(smb2_command, tcon); + rc = smb2_reconnect(smb2_command, tcon, server); if (rc) return rc; - return __smb2_plain_req_init(smb2_command, tcon, request_buf, + return __smb2_plain_req_init(smb2_command, tcon, server, request_buf, total_len); } static int smb2_ioctl_req_init(u32 opcode, struct cifs_tcon *tcon, + struct TCP_Server_Info *server, void **request_buf, unsigned int *total_len) { /* Skip reconnect only for FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs */ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) { - return __smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf, - total_len); + return __smb2_plain_req_init(SMB2_IOCTL, tcon, server, + request_buf, total_len); } - return smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf, total_len); + return smb2_plain_req_init(SMB2_IOCTL, tcon, 
server, + request_buf, total_len); } /* For explanation of negotiate contexts see MS-SMB2 section 2.2.3.1 */ @@ -858,7 +859,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) return -EIO; } - rc = smb2_plain_req_init(SMB2_NEGOTIATE, NULL, (void **) &req, &total_len); + rc = smb2_plain_req_init(SMB2_NEGOTIATE, NULL, server, + (void **) &req, &total_len); if (rc) return rc; @@ -916,7 +918,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_negotiate_rsp *)rsp_iov.iov_base; /* @@ -1227,8 +1230,9 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) struct TCP_Server_Info *server = cifs_ses_server(ses); unsigned int total_len; - rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, (void **) &req, - &total_len); + rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, server, + (void **) &req, + &total_len); if (rc) return rc; @@ -1305,6 +1309,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) /* BB add code to build os and lm fields */ rc = cifs_send_recv(sess_data->xid, sess_data->ses, + cifs_ses_server(sess_data->ses), &rqst, &sess_data->buf0_type, CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov); @@ -1689,7 +1694,8 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) if (ses->need_reconnect) goto smb2_session_already_dead; - rc = smb2_plain_req_init(SMB2_LOGOFF, NULL, (void **) &req, &total_len); + rc = smb2_plain_req_init(SMB2_LOGOFF, NULL, ses->server, + (void **) &req, &total_len); if (rc) return rc; @@ -1710,7 +1716,8 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, ses->server, + &rqst, &resp_buf_type, flags, &rsp_iov); 
cifs_small_buf_release(req); /* * No tcon so can't do @@ -1751,7 +1758,10 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, __le16 *unc_path = NULL; int flags = 0; unsigned int total_len; - struct TCP_Server_Info *server = ses->server; + struct TCP_Server_Info *server; + + /* always use master channel */ + server = ses->server; cifs_dbg(FYI, "TCON\n"); @@ -1772,8 +1782,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, /* SMB2 TREE_CONNECT request must be called with TreeId == 0 */ tcon->tid = 0; atomic_set(&tcon->num_remote_opens, 0); - rc = smb2_plain_req_init(SMB2_TREE_CONNECT, tcon, (void **) &req, - &total_len); + rc = smb2_plain_req_init(SMB2_TREE_CONNECT, tcon, server, + (void **) &req, &total_len); if (rc) { kfree(unc_path); return rc; @@ -1812,7 +1822,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, /* Need 64 for max size write so ask for more in case not there yet */ req->sync_hdr.CreditRequest = cpu_to_le16(64); - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; trace_smb3_tcon(xid, tcon->tid, ses->Suid, tree, rc); @@ -1897,8 +1908,9 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) close_shroot_lease(&tcon->crfid); - rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req, - &total_len); + rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server, + (void **) &req, + &total_len); if (rc) return rc; @@ -1914,7 +1926,8 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, ses->server, + &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) cifs_stats_fail_inc(tcon, 
SMB2_TREE_DISCONNECT_HE); @@ -2468,6 +2481,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, int flags = 0; unsigned int total_len; __le16 *utf16_path = NULL; + struct TCP_Server_Info *server = cifs_pick_channel(ses); cifs_dbg(FYI, "mkdir\n"); @@ -2476,13 +2490,14 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, if (!utf16_path) return -ENOMEM; - if (!ses || !(ses->server)) { + if (!ses || !server) { rc = -EIO; goto err_free_path; } /* resource #2: request */ - rc = smb2_plain_req_init(SMB2_CREATE, tcon, (void **) &req, &total_len); + rc = smb2_plain_req_init(SMB2_CREATE, tcon, server, + (void **) &req, &total_len); if (rc) goto err_free_path; @@ -2568,7 +2583,8 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); /* resource #4: response buffer */ - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); if (rc) { cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); trace_smb3_posix_mkdir_err(xid, tcon->tid, ses->Suid, @@ -2597,10 +2613,10 @@ err_free_path: } int -SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, +SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, + struct smb_rqst *rqst, __u8 *oplock, struct cifs_open_parms *oparms, __le16 *path) { - struct TCP_Server_Info *server = tcon->ses->server; struct smb2_create_req *req; unsigned int n_iov = 2; __u32 file_attributes = 0; @@ -2611,7 +2627,8 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, __le16 *copy_path; int rc; - rc = smb2_plain_req_init(SMB2_CREATE, tcon, (void **) &req, &total_len); + rc = smb2_plain_req_init(SMB2_CREATE, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -2783,9 +2800,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 
*path, { struct smb_rqst rqst; struct smb2_create_rsp *rsp = NULL; - struct TCP_Server_Info *server; struct cifs_tcon *tcon = oparms->tcon; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); struct kvec iov[SMB2_CREATE_IOV_SIZE]; struct kvec rsp_iov = {NULL, 0}; int resp_buftype = CIFS_NO_BUFFER; @@ -2793,9 +2810,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, int flags = 0; cifs_dbg(FYI, "create/open\n"); - if (ses && (ses->server)) - server = ses->server; - else + if (!ses || !server) return -EIO; if (smb3_encryption_required(tcon)) @@ -2806,14 +2821,16 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, rqst.rq_iov = iov; rqst.rq_nvec = SMB2_CREATE_IOV_SIZE; - rc = SMB2_open_init(tcon, &rqst, oplock, oparms, path); + rc = SMB2_open_init(tcon, server, + &rqst, oplock, oparms, path); if (rc) goto creat_exit; trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid, oparms->create_options, oparms->desired_access); - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; @@ -2865,7 +2882,8 @@ creat_exit: } int -SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, +SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, + struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, __u32 max_response_size) @@ -2876,7 +2894,8 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, int rc; char *in_data_buf; - rc = smb2_ioctl_req_init(opcode, tcon, (void **) &req, &total_len); + rc = smb2_ioctl_req_init(opcode, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -2976,12 +2995,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, struct smb_rqst rqst; struct smb2_ioctl_rsp *rsp = NULL; struct 
cifs_ses *ses; + struct TCP_Server_Info *server; struct kvec iov[SMB2_IOCTL_IOV_SIZE]; struct kvec rsp_iov = {NULL, 0}; int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; - struct TCP_Server_Info *server; cifs_dbg(FYI, "SMB2 IOCTL\n"); @@ -2992,14 +3011,14 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (plen) *plen = 0; - if (tcon) - ses = tcon->ses; - else + if (!tcon) return -EIO; + ses = tcon->ses; if (!ses) return -EIO; - server = ses->server; + + server = cifs_pick_channel(ses); if (!server) return -EIO; @@ -3011,12 +3030,14 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, rqst.rq_iov = iov; rqst.rq_nvec = SMB2_IOCTL_IOV_SIZE; - rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid, opcode, + rc = SMB2_ioctl_init(tcon, server, + &rqst, persistent_fid, volatile_fid, opcode, is_fsctl, in_data, indatalen, max_out_data_len); if (rc) goto ioctl_exit; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; @@ -3104,7 +3125,8 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, } int -SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, +SMB2_close_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, + struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, bool query_attrs) { struct smb2_close_req *req; @@ -3112,7 +3134,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, unsigned int total_len; int rc; - rc = smb2_plain_req_init(SMB2_CLOSE, tcon, (void **) &req, &total_len); + rc = smb2_plain_req_init(SMB2_CLOSE, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -3143,6 +3166,7 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, struct smb_rqst rqst; struct smb2_close_rsp *rsp = NULL; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = 
cifs_pick_channel(ses); struct kvec iov[1]; struct kvec rsp_iov; int resp_buftype = CIFS_NO_BUFFER; @@ -3152,7 +3176,7 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, cifs_dbg(FYI, "Close\n"); - if (!ses || !(ses->server)) + if (!ses || !server) return -EIO; if (smb3_encryption_required(tcon)) @@ -3168,12 +3192,14 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, query_attrs = true; trace_smb3_close_enter(xid, persistent_fid, tcon->tid, ses->Suid); - rc = SMB2_close_init(tcon, &rqst, persistent_fid, volatile_fid, + rc = SMB2_close_init(tcon, server, + &rqst, persistent_fid, volatile_fid, query_attrs); if (rc) goto close_exit; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_close_rsp *)rsp_iov.iov_base; if (rc != 0) { @@ -3273,7 +3299,8 @@ smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length, } int -SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, +SMB2_query_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, + struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u8 info_class, u8 info_type, u32 additional_info, size_t output_len, size_t input_len, void *input) @@ -3283,8 +3310,8 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, unsigned int total_len; int rc; - rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req, - &total_len); + rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -3336,7 +3363,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, if (!ses) return -EIO; - server = ses->server; + server = cifs_pick_channel(ses); if (!server) return -EIO; @@ -3348,7 +3375,8 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = SMB2_query_info_init(tcon, &rqst, persistent_fid, volatile_fid, + rc = 
SMB2_query_info_init(tcon, server, + &rqst, persistent_fid, volatile_fid, info_class, info_type, additional_info, output_len, 0, NULL); if (rc) @@ -3357,7 +3385,8 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid, ses->Suid, info_class, (__u32)info_type); - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; if (rc) { @@ -3442,15 +3471,17 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, static int SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, - struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, - u32 completion_filter, bool watch_tree) + struct cifs_tcon *tcon, struct TCP_Server_Info *server, + u64 persistent_fid, u64 volatile_fid, + u32 completion_filter, bool watch_tree) { struct smb2_change_notify_req *req; struct kvec *iov = rqst->rq_iov; unsigned int total_len; int rc; - rc = smb2_plain_req_init(SMB2_CHANGE_NOTIFY, tcon, (void **) &req, &total_len); + rc = smb2_plain_req_init(SMB2_CHANGE_NOTIFY, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -3477,6 +3508,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u32 completion_filter) { struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); struct smb_rqst rqst; struct kvec iov[1]; struct kvec rsp_iov = {NULL, 0}; @@ -3485,7 +3517,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; cifs_dbg(FYI, "change notify\n"); - if (!ses || !(ses->server)) + if (!ses || !server) return -EIO; if (smb3_encryption_required(tcon)) @@ -3496,14 +3528,16 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = SMB2_notify_init(xid, &rqst, tcon, persistent_fid, volatile_fid, + rc = SMB2_notify_init(xid, 
&rqst, tcon, server, + persistent_fid, volatile_fid, completion_filter, watch_tree); if (rc) goto cnotify_exit; trace_smb3_notify_enter(xid, persistent_fid, tcon->tid, ses->Suid, (u8)watch_tree, completion_filter); - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_CHANGE_NOTIFY_HE); @@ -3593,7 +3627,7 @@ void smb2_reconnect_server(struct work_struct *work) spin_unlock(&cifs_tcp_ses_lock); list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) { - rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon); + rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server); if (!rc) cifs_reopen_persistent_handles(tcon); else @@ -3633,7 +3667,8 @@ SMB2_echo(struct TCP_Server_Info *server) return rc; } - rc = smb2_plain_req_init(SMB2_ECHO, NULL, (void **)&req, &total_len); + rc = smb2_plain_req_init(SMB2_ECHO, NULL, server, + (void **)&req, &total_len); if (rc) return rc; @@ -3660,14 +3695,16 @@ SMB2_flush_free(struct smb_rqst *rqst) int SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst, - struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) + struct cifs_tcon *tcon, struct TCP_Server_Info *server, + u64 persistent_fid, u64 volatile_fid) { struct smb2_flush_req *req; struct kvec *iov = rqst->rq_iov; unsigned int total_len; int rc; - rc = smb2_plain_req_init(SMB2_FLUSH, tcon, (void **) &req, &total_len); + rc = smb2_plain_req_init(SMB2_FLUSH, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -3688,6 +3725,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, struct smb_rqst rqst; struct kvec iov[1]; struct kvec rsp_iov = {NULL, 0}; + struct TCP_Server_Info *server = cifs_pick_channel(ses); int resp_buftype = CIFS_NO_BUFFER; int flags = 0; int rc = 0; @@ -3704,12 +3742,14 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, rqst.rq_iov = iov; 
rqst.rq_nvec = 1; - rc = SMB2_flush_init(xid, &rqst, tcon, persistent_fid, volatile_fid); + rc = SMB2_flush_init(xid, &rqst, tcon, server, + persistent_fid, volatile_fid); if (rc) goto flush_exit; trace_smb3_flush_enter(xid, persistent_fid, tcon->tid, ses->Suid); - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE); @@ -3737,14 +3777,13 @@ smb2_new_read_req(void **buf, unsigned int *total_len, int rc = -EACCES; struct smb2_read_plain_req *req = NULL; struct smb2_sync_hdr *shdr; - struct TCP_Server_Info *server; + struct TCP_Server_Info *server = io_parms->server; - rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, (void **) &req, - total_len); + rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, server, + (void **) &req, total_len); if (rc) return rc; - server = io_parms->tcon->ses->server; if (server == NULL) return -ECONNABORTED; @@ -3773,8 +3812,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { struct smbd_buffer_descriptor_v1 *v1; - bool need_invalidate = - io_parms->tcon->ses->server->dialect == SMB30_PROT_ID; + bool need_invalidate = server->dialect == SMB30_PROT_ID; rdata->mr = smbd_register_mr( server->smbd_conn, rdata->pages, @@ -3831,7 +3869,7 @@ smb2_readv_callback(struct mid_q_entry *mid) { struct cifs_readdata *rdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); - struct TCP_Server_Info *server = tcon->ses->server; + struct TCP_Server_Info *server = rdata->server; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rdata->iov[0].iov_base; struct cifs_credits credits = { .value = 0, .instance = 0 }; @@ -3843,6 +3881,10 @@ smb2_readv_callback(struct mid_q_entry *mid) .rq_pagesz = rdata->pagesz, .rq_tailsz = rdata->tailsz }; + WARN_ONCE(rdata->server != mid->server, + "rdata server %p != mid 
server %p", + rdata->server, mid->server); + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", __func__, mid->mid, mid->mid_state, rdata->result, rdata->bytes); @@ -3920,20 +3962,23 @@ smb2_async_readv(struct cifs_readdata *rdata) struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 1 }; struct TCP_Server_Info *server; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); unsigned int total_len; cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); + if (!rdata->server) + rdata->server = cifs_pick_channel(tcon->ses); + io_parms.tcon = tlink_tcon(rdata->cfile->tlink); + io_parms.server = server = rdata->server; io_parms.offset = rdata->offset; io_parms.length = rdata->bytes; io_parms.persistent_fid = rdata->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->cfile->fid.volatile_fid; io_parms.pid = rdata->pid; - server = io_parms.tcon->ses->server; - rc = smb2_new_read_req( (void **) &buf, &total_len, &io_parms, rdata, 0, 0); if (rc) @@ -3961,7 +4006,7 @@ smb2_async_readv(struct cifs_readdata *rdata) } kref_get(&rdata->refcount); - rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, + rc = cifs_call_async(server, &rqst, cifs_readv_receive, smb2_readv_callback, smb3_handle_read_data, rdata, flags, &rdata->credits); @@ -3993,6 +4038,9 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, int flags = CIFS_LOG_ERROR; struct cifs_ses *ses = io_parms->tcon->ses; + if (!io_parms->server) + io_parms->server = cifs_pick_channel(io_parms->tcon->ses); + *nbytes = 0; rc = smb2_new_read_req((void **)&req, &total_len, io_parms, NULL, 0, 0); if (rc) @@ -4008,7 +4056,8 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, io_parms->server, + &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; if (rc) { @@ -4064,11 
+4113,15 @@ smb2_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); - struct TCP_Server_Info *server = tcon->ses->server; + struct TCP_Server_Info *server = wdata->server; unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; struct cifs_credits credits = { .value = 0, .instance = 0 }; + WARN_ONCE(wdata->server != mid->server, + "wdata server %p != mid server %p", + wdata->server, mid->server); + switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); @@ -4146,12 +4199,16 @@ smb2_async_writev(struct cifs_writedata *wdata, struct smb2_write_req *req = NULL; struct smb2_sync_hdr *shdr; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); - struct TCP_Server_Info *server = tcon->ses->server; + struct TCP_Server_Info *server = wdata->server; struct kvec iov[1]; struct smb_rqst rqst = { }; unsigned int total_len; - rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len); + if (!wdata->server) + server = wdata->server = cifs_pick_channel(tcon->ses); + + rc = smb2_plain_req_init(SMB2_WRITE, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -4290,20 +4347,24 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, struct kvec rsp_iov; int flags = 0; unsigned int total_len; + struct TCP_Server_Info *server; *nbytes = 0; if (n_vec < 1) return rc; - rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, (void **) &req, - &total_len); + if (!io_parms->server) + io_parms->server = cifs_pick_channel(io_parms->tcon->ses); + server = io_parms->server; + if (server == NULL) + return -ECONNABORTED; + + rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, server, + (void **) &req, &total_len); if (rc) return rc; - if (io_parms->tcon->ses->server == NULL) - return -ECONNABORTED; - if (smb3_encryption_required(io_parms->tcon)) flags |= 
CIFS_TRANSFORM_REQ; @@ -4332,7 +4393,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rqst.rq_iov = iov; rqst.rq_nvec = n_vec + 1; - rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst, + rc = cifs_send_recv(xid, io_parms->tcon->ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; @@ -4506,11 +4568,12 @@ num_entries(int infotype, char *bufstart, char *end_of_buf, char **lastentry, * Readdir/FindFirst */ int SMB2_query_directory_init(const unsigned int xid, - struct cifs_tcon *tcon, struct smb_rqst *rqst, + struct cifs_tcon *tcon, + struct TCP_Server_Info *server, + struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, int index, int info_level) { - struct TCP_Server_Info *server = tcon->ses->server; struct smb2_query_directory_req *req; unsigned char *bufptr; __le16 asteriks = cpu_to_le16('*'); @@ -4521,8 +4584,8 @@ int SMB2_query_directory_init(const unsigned int xid, struct kvec *iov = rqst->rq_iov; int len, rc; - rc = smb2_plain_req_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req, - &total_len); + rc = smb2_plain_req_init(SMB2_QUERY_DIRECTORY, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -4665,6 +4728,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, struct kvec rsp_iov; int rc = 0; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); int flags = 0; if (!ses || !(ses->server)) @@ -4678,13 +4742,15 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = SMB2_QUERY_DIRECTORY_IOV_SIZE; - rc = SMB2_query_directory_init(xid, tcon, &rqst, persistent_fid, + rc = SMB2_query_directory_init(xid, tcon, server, + &rqst, persistent_fid, volatile_fid, index, srch_inf->info_level); if (rc) goto qdir_exit; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); rsp = 
(struct smb2_query_directory_rsp *)rsp_iov.iov_base; if (rc) { @@ -4721,17 +4787,19 @@ qdir_exit: } int -SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, - u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class, - u8 info_type, u32 additional_info, - void **data, unsigned int *size) +SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, + struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 pid, + u8 info_class, u8 info_type, u32 additional_info, + void **data, unsigned int *size) { struct smb2_set_info_req *req; struct kvec *iov = rqst->rq_iov; unsigned int i, total_len; int rc; - rc = smb2_plain_req_init(SMB2_SET_INFO, tcon, (void **) &req, &total_len); + rc = smb2_plain_req_init(SMB2_SET_INFO, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -4782,9 +4850,10 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); int flags = 0; - if (!ses || !(ses->server)) + if (!ses || !server) return -EIO; if (!num) @@ -4801,7 +4870,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = num; - rc = SMB2_set_info_init(tcon, &rqst, persistent_fid, volatile_fid, pid, + rc = SMB2_set_info_init(tcon, server, + &rqst, persistent_fid, volatile_fid, pid, info_class, info_type, additional_info, data, size); if (rc) { @@ -4810,7 +4880,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, } - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); SMB2_set_info_free(&rqst); rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base; @@ -4873,6 +4944,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_oplock_break *req = NULL; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = 
cifs_pick_channel(ses); int flags = CIFS_OBREAK_OP; unsigned int total_len; struct kvec iov[1]; @@ -4880,8 +4952,8 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, int resp_buf_type; cifs_dbg(FYI, "SMB2_oplock_break\n"); - rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req, - &total_len); + rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -4902,7 +4974,8 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -4945,8 +5018,10 @@ copy_posix_fs_info_to_kstatfs(FILE_SYSTEM_POSIX_INFO *response_data, } static int -build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, - int outbuf_len, u64 persistent_fid, u64 volatile_fid) +build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, + struct TCP_Server_Info *server, + int level, int outbuf_len, u64 persistent_fid, + u64 volatile_fid) { int rc; struct smb2_query_info_req *req; @@ -4954,11 +5029,11 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, cifs_dbg(FYI, "Query FSInfo level %d\n", level); - if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) + if ((tcon->ses == NULL) || server == NULL) return -EIO; - rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req, - &total_len); + rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -4988,10 +5063,12 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); FILE_SYSTEM_POSIX_INFO *info = NULL; int flags = 0; - rc = build_qfs_info_req(&iov, tcon, FS_POSIX_INFORMATION, + rc = 
build_qfs_info_req(&iov, tcon, server, + FS_POSIX_INFORMATION, sizeof(FILE_SYSTEM_POSIX_INFO), persistent_fid, volatile_fid); if (rc) @@ -5004,7 +5081,8 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -5036,10 +5114,12 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); struct smb2_fs_full_size_info *info = NULL; int flags = 0; - rc = build_qfs_info_req(&iov, tcon, FS_FULL_SIZE_INFORMATION, + rc = build_qfs_info_req(&iov, tcon, server, + FS_FULL_SIZE_INFORMATION, sizeof(struct smb2_fs_full_size_info), persistent_fid, volatile_fid); if (rc) @@ -5052,7 +5132,8 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -5084,6 +5165,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype, max_len, min_len; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); unsigned int rsp_len, offset; int flags = 0; @@ -5104,7 +5186,8 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, return -EINVAL; } - rc = build_qfs_info_req(&iov, tcon, level, max_len, + rc = build_qfs_info_req(&iov, tcon, server, + level, max_len, persistent_fid, volatile_fid); if (rc) return rc; @@ -5116,7 +5199,8 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, 
rqst.rq_iov = &iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -5169,10 +5253,12 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, unsigned int count; int flags = CIFS_NO_RSP_BUF; unsigned int total_len; + struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock); - rc = smb2_plain_req_init(SMB2_LOCK, tcon, (void **) &req, &total_len); + rc = smb2_plain_req_init(SMB2_LOCK, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -5198,7 +5284,8 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 2; - rc = cifs_send_recv(xid, tcon->ses, &rqst, &resp_buf_type, flags, + rc = cifs_send_recv(xid, tcon->ses, server, + &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -5243,10 +5330,11 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, int resp_buf_type; __u64 *please_key_high; __u64 *please_key_low; + struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); cifs_dbg(FYI, "SMB2_lease_break\n"); - rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req, - &total_len); + rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, server, + (void **) &req, &total_len); if (rc) return rc; @@ -5269,7 +5357,8 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); + rc = cifs_send_recv(xid, ses, server, + &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); please_key_low = (__u64 *)lease_key; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 087d5f14320b..71ba74792c9e 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ 
-143,7 +143,9 @@ extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, struct smb2_file_all_info *buf, struct create_posix_rsp *posix, struct kvec *err_iov, int *resp_buftype); -extern int SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, +extern int SMB2_open_init(struct cifs_tcon *tcon, + struct TCP_Server_Info *server, + struct smb_rqst *rqst, __u8 *oplock, struct cifs_open_parms *oparms, __le16 *path); extern void SMB2_open_free(struct smb_rqst *rqst); @@ -151,7 +153,9 @@ extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen, char **out_data, u32 *plen /* returned data len */); -extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, +extern int SMB2_ioctl_init(struct cifs_tcon *tcon, + struct TCP_Server_Info *server, + struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, __u32 max_response_size); @@ -165,19 +169,25 @@ extern int __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_file_network_open_info *pbuf); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); -extern int SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, - u64 persistent_fid, u64 volatile_fid, bool query_attrs); +extern int SMB2_close_init(struct cifs_tcon *tcon, + struct TCP_Server_Info *server, + struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, + bool query_attrs); extern void SMB2_close_free(struct smb_rqst *rqst); extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst, struct cifs_tcon *tcon, + struct TCP_Server_Info *server, u64 persistent_file_id, u64 volatile_file_id); extern void SMB2_flush_free(struct 
smb_rqst *rqst); extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct smb2_file_all_info *data); -extern int SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, +extern int SMB2_query_info_init(struct cifs_tcon *tcon, + struct TCP_Server_Info *server, + struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u8 info_class, u8 info_type, u32 additional_info, size_t output_len, @@ -201,6 +211,7 @@ extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int index, struct cifs_search_info *srch_inf); extern int SMB2_query_directory_init(unsigned int xid, struct cifs_tcon *tcon, + struct TCP_Server_Info *server, struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, int index, int info_level); @@ -208,7 +219,9 @@ extern void SMB2_query_directory_free(struct smb_rqst *rqst); extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 pid, __le64 *eof); -extern int SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, +extern int SMB2_set_info_init(struct cifs_tcon *tcon, + struct TCP_Server_Info *server, + struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class, u8 info_type, u32 additional_info, void **data, unsigned int *size); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 4d4cb26d2ae1..100d04af62b1 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -1020,6 +1020,7 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) int compound_send_recv(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, const int flags, const int num_rqst, struct smb_rqst *rqst, int *resp_buf_type, struct kvec *resp_iov) { @@ -1031,20 +1032,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, }; unsigned int instance; char *buf; - struct TCP_Server_Info 
*server; optype = flags & CIFS_OP_MASK; for (i = 0; i < num_rqst; i++) resp_buf_type[i] = CIFS_NO_BUFFER; /* no response buf yet */ - if ((ses == NULL) || (ses->server == NULL)) { + if (!ses || !ses->server || !server) { cifs_dbg(VFS, "Null session\n"); return -EIO; } - server = cifs_pick_channel(ses); - if (server->tcpStatus == CifsExiting) return -ENOENT; @@ -1239,11 +1237,12 @@ out: int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, struct smb_rqst *rqst, int *resp_buf_type, const int flags, struct kvec *resp_iov) { - return compound_send_recv(xid, ses, flags, 1, rqst, resp_buf_type, - resp_iov); + return compound_send_recv(xid, ses, server, flags, 1, + rqst, resp_buf_type, resp_iov); } int @@ -1278,7 +1277,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, rqst.rq_iov = new_iov; rqst.rq_nvec = n_vec + 1; - rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov); + rc = cifs_send_recv(xid, ses, ses->server, + &rqst, resp_buf_type, flags, resp_iov); if (n_vec + 1 > CIFS_MAX_IOV_SIZE) kfree(new_iov); return rc; From edb161353680e6d488d94cbcaf967745bee98d17 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 31 May 2020 14:36:56 -0500 Subject: [PATCH 365/427] smb3: remove static checker warning Remove static checker warning pointed out by Dan Carpenter: The patch feeaec621c09: "cifs: multichannel: move channel selection above transport layer" from Apr 24, 2020, leads to the following static checker warning: fs/cifs/smb2pdu.c:149 smb2_hdr_assemble() error: we previously assumed 'tcon->ses' could be null (see line 133) Reported-by: Dan Carpenter CC: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index fe5a8452b213..2497e0e428ac 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -85,7 +85,7 @@ static const int smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { int 
smb3_encryption_required(const struct cifs_tcon *tcon) { - if (!tcon) + if (!tcon || !tcon->ses) return 0; if ((tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) || (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)) From 8eec79540d2b9cec385707be45f6e9388b34020f Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Fri, 24 Apr 2020 15:24:05 +0200 Subject: [PATCH 366/427] cifs: multichannel: use pointer for binding channel Add a cifs_chan pointer in struct cifs_ses that points to the channel currently being bound if ses->binding is true. Previously it was always the channel past the established count. This will make reconnecting (and rebinding) a channel easier later on. Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 15 ++++++++++++--- fs/cifs/sess.c | 3 ++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c0cbbd0bbb1d..e133bb3e172f 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1030,6 +1030,7 @@ struct cifs_ses { #define CIFS_MAX_CHANNELS 16 struct cifs_chan chans[CIFS_MAX_CHANNELS]; + struct cifs_chan *binding_chan; size_t chan_count; size_t chan_max; atomic_t chan_seq; /* round robin state */ @@ -1037,23 +1038,31 @@ struct cifs_ses { /* * When binding a new channel, we need to access the channel which isn't fully - * established yet (one past the established count) + * established yet. */ static inline struct cifs_chan *cifs_ses_binding_channel(struct cifs_ses *ses) { if (ses->binding) - return &ses->chans[ses->chan_count]; + return ses->binding_chan; else return NULL; } +/* + * Returns the server pointer of the session. When binding a new + * channel this returns the last channel which isn't fully established + * yet. + * + * This function should be use for negprot/sess.setup codepaths. For + * the other requests see cifs_pick_channel(). 
+ */ static inline struct TCP_Server_Info *cifs_ses_server(struct cifs_ses *ses) { if (ses->binding) - return ses->chans[ses->chan_count].server; + return ses->binding_chan->server; else return ses->server; } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 0ae25cc77fc0..1ffdd7dadc55 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -231,7 +231,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) mutex_lock(&ses->session_mutex); - chan = &ses->chans[ses->chan_count]; + chan = ses->binding_chan = &ses->chans[ses->chan_count]; chan->server = cifs_get_tcp_session(&vol); if (IS_ERR(chan->server)) { rc = PTR_ERR(chan->server); @@ -276,6 +276,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) atomic_set(&ses->chan_seq, 0); out: ses->binding = false; + ses->binding_chan = NULL; mutex_unlock(&ses->session_mutex); if (rc && chan->server) From 2f58967979409ea3ec799343aa35e9007f735a3b Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Fri, 24 Apr 2020 16:55:31 +0200 Subject: [PATCH 367/427] cifs: multichannel: try to rebind when reconnecting a channel first steps in trying to make channels properly reconnect. * add cifs_ses_find_chan() function to find the enclosing cifs_chan struct it belongs to * while we have the session lock and are redoing negprot and sess.setup in smb2_reconnect() redo the binding of channels. 
Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 2 ++ fs/cifs/sess.c | 16 ++++++++++++++++ fs/cifs/smb2pdu.c | 16 ++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index a25a46237f9f..bd92070ca30c 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -593,6 +593,8 @@ void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc); extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, unsigned int *len, unsigned int *offset); +struct cifs_chan * +cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_ses *ses); int cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 1ffdd7dadc55..5d05bd2822d2 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -150,6 +150,22 @@ int cifs_try_adding_channels(struct cifs_ses *ses) return ses->chan_count - old_chan_count; } +/* + * If server is a channel of ses, return the corresponding enclosing + * cifs_chan otherwise return NULL. 
+ */ +struct cifs_chan * +cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server) +{ + int i; + + for (i = 0; i < ses->chan_count; i++) { + if (ses->chans[i].server == server) + return &ses->chans[i]; + } + return NULL; +} + int cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2497e0e428ac..ded96b529a4d 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -375,15 +375,31 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, goto out; } + /* + * If we are reconnecting an extra channel, bind + */ + if (server->is_channel) { + ses->binding = true; + ses->binding_chan = cifs_ses_find_chan(ses, server); + } + rc = cifs_negotiate_protocol(0, tcon->ses); if (!rc && tcon->ses->need_reconnect) { rc = cifs_setup_session(0, tcon->ses, nls_codepage); if ((rc == -EACCES) && !tcon->retry) { rc = -EHOSTDOWN; + ses->binding = false; + ses->binding_chan = NULL; mutex_unlock(&tcon->ses->session_mutex); goto failed; } } + /* + * End of channel binding + */ + ses->binding = false; + ses->binding_chan = NULL; + if (rc || !tcon->need_reconnect) { mutex_unlock(&tcon->ses->session_mutex); goto out; From 331cc667a99c633abbbebeab4675beae713fb331 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 2 Jun 2020 23:30:50 -0500 Subject: [PATCH 368/427] cifs: update internal module version number To 2.27 Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index c9e2e6bbca13..c7a311d28d3d 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -156,5 +156,5 @@ extern int cifs_truncate_page(struct address_space *mapping, loff_t from); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.26" +#define CIFS_VERSION "2.27" #endif /* _CIFSFS_H */ From a1c979f330cb82cae7a3b19464f9815e43060fe3 Mon Sep 17 00:00:00 2001 From: 
Mikulas Patocka Date: Wed, 27 May 2020 07:04:46 -0400 Subject: [PATCH 369/427] dm bufio: delete unused and inefficient dm_bufio_discard_buffers There is no user for this interface. If in future it is needed it can be reimplemented to walk the rbtree of buffers instead of doing block-by-block lookups. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 26 -------------------------- include/linux/dm-bufio.h | 7 ------- 2 files changed, 33 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index bf289be1ee3a..993e624e506c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1358,32 +1358,6 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c } EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); -/* - * Free the specified range of buffers. If a buffer is held by other process, it - * is not freed. If a buffer is dirty, it is discarded without writeback. - * Finally, send the discard request to the device. - */ -int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count) -{ - sector_t i; - - for (i = block; i < block + count; i++) { - struct dm_buffer *b; - dm_bufio_lock(c); - b = __find(c, i); - if (b && likely(!b->hold_count)) { - wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); - wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); - __unlink_buffer(b); - __free_buffer_wake(b); - } - dm_bufio_unlock(c); - } - - return dm_bufio_issue_discard(c, block, count); -} -EXPORT_SYMBOL_GPL(dm_bufio_discard_buffers); - /* * We first delete any other buffer that may be at that new location. 
* diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 07e1f163e299..5ec6bfbde9ae 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -123,13 +123,6 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c); */ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count); -/* - * Free the specified range of buffers. If a buffer is held by other process, it - * is not freed. If a buffer is dirty, it is discarded without writeback. - * Finally, send the discard request to the device. - */ -int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count); - /* * Like dm_bufio_release but also move the buffer to the new * block. dm_bufio_write_dirty_buffers is needed to commit the new block. From 42413b49804b250ced70dac8815388af2d4ad872 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Fri, 5 Jun 2020 08:50:28 +0200 Subject: [PATCH 370/427] ima: Directly free *entry in ima_alloc_init_template() if digests is NULL To support multiple template digests, the static array entry->digest has been replaced with a dynamically allocated array in commit aa724fe18a8a ("ima: Switch to dynamically allocated buffer for template digests"). The array is allocated in ima_alloc_init_template() and if the returned pointer is NULL, ima_free_template_entry() is called. However, (*entry)->template_desc is not yet initialized while it is used by ima_free_template_entry(). This patch fixes the issue by directly freeing *entry without calling ima_free_template_entry(). 
Fixes: aa724fe18a8a ("ima: Switch to dynamically allocated buffer for template digests") Reported-by: syzbot+223310b454ba6b75974e@syzkaller.appspotmail.com Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_api.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index 78e0b0a7723e..bf22de8b7ce0 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -55,8 +55,9 @@ int ima_alloc_init_template(struct ima_event_data *event_data, digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, sizeof(*digests), GFP_NOFS); if (!digests) { - result = -ENOMEM; - goto out; + kfree(*entry); + *entry = NULL; + return -ENOMEM; } (*entry)->digests = digests; From 40e9c5ac4e3d670799a247944c330e5126935a7c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 2 Jun 2020 09:48:10 -0400 Subject: [PATCH 371/427] dm integrity: add status line documentation Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- Documentation/admin-guide/device-mapper/dm-integrity.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst index 8439d2ae689b..9edd45593abd 100644 --- a/Documentation/admin-guide/device-mapper/dm-integrity.rst +++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst @@ -193,6 +193,14 @@ should not be changed when reloading the target because the layout of disk data depend on them and the reloaded target would be non-functional. +Status line: + +1. the number of integrity mismatches +2. provided data sectors - that is the number of sectors that the user + could use +3. 
the current recalculating position (or '-' if we didn't recalculate) + + The layout of the formatted block device: * reserved sectors From 88f878e58879acfdad03e08776c9802f9cd6f26a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 2 Jun 2020 15:34:39 +0200 Subject: [PATCH 372/427] dm bufio: clean up rbtree block ordering dm-bufio uses unnatural ordering in the rb-tree - blocks with smaller numbers were put to the right node and blocks with bigger numbers were put to the left node. Reverse that logic so that it's natural. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 993e624e506c..ff19add97e0b 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -256,7 +256,7 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) if (b->block == block) return b; - n = (b->block < block) ? n->rb_left : n->rb_right; + n = block < b->block ? n->rb_left : n->rb_right; } return NULL; @@ -276,8 +276,8 @@ static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) } parent = *new; - new = (found->block < b->block) ? - &((*new)->rb_left) : &((*new)->rb_right); + new = b->block < found->block ? + &found->node.rb_left : &found->node.rb_right; } rb_link_node(&b->node, parent, new); From 33a180623b6c35f2727daecb63763955af3af1df Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 2 Jun 2020 15:34:40 +0200 Subject: [PATCH 373/427] dm bufio: introduce forget_buffer_locked Introduce a function forget_buffer_locked that forgets a range of buffers. It is more efficient than calling forget_buffer in a loop. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 60 +++++++++++++++++++++++++++++++++++++--- include/linux/dm-bufio.h | 7 +++++ 2 files changed, 63 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ff19add97e0b..95f6c544aa01 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -262,6 +262,29 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) return NULL; } +static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block) +{ + struct rb_node *n = c->buffer_tree.rb_node; + struct dm_buffer *b; + struct dm_buffer *best = NULL; + + while (n) { + b = container_of(n, struct dm_buffer, node); + + if (b->block == block) + return b; + + if (block <= b->block) { + n = n->rb_left; + best = b; + } else { + n = n->rb_right; + } + } + + return best; +} + static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) { struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL; @@ -1434,6 +1457,14 @@ retry: } EXPORT_SYMBOL_GPL(dm_bufio_release_move); +static void forget_buffer_locked(struct dm_buffer *b) +{ + if (likely(!b->hold_count) && likely(!b->state)) { + __unlink_buffer(b); + __free_buffer_wake(b); + } +} + /* * Free the given buffer. 
* @@ -1447,15 +1478,36 @@ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) dm_bufio_lock(c); b = __find(c, block); - if (b && likely(!b->hold_count) && likely(!b->state)) { - __unlink_buffer(b); - __free_buffer_wake(b); - } + if (b) + forget_buffer_locked(b); dm_bufio_unlock(c); } EXPORT_SYMBOL_GPL(dm_bufio_forget); +void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks) +{ + struct dm_buffer *b; + sector_t end_block = block + n_blocks; + + while (block < end_block) { + dm_bufio_lock(c); + + b = __find_next(c, block); + if (b) { + block = b->block + 1; + forget_buffer_locked(b); + } + + dm_bufio_unlock(c); + + if (!b) + break; + } + +} +EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers); + void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n) { c->minimum_buffers = n; diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 5ec6bfbde9ae..29d255fdd5d6 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -136,6 +136,13 @@ void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); */ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block); +/* + * Free the given range of buffers. + * This is just a hint, if the buffer is in use or dirty, this function + * does nothing. + */ +void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks); + /* * Set the minimum number of buffers before cleanup happens. */ From 334b4fc17275667b38ebfd719714dab0edb83c6a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 2 Jun 2020 15:34:41 +0200 Subject: [PATCH 374/427] dm ebs: use dm_bufio_forget_buffers Use dm_bufio_forget_buffers instead of a block-by-block loop that calls dm_bufio_forget. dm_bufio_forget_buffers is faster than the loop because it searches for used buffers using rb-tree. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-ebs-target.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index ae3f5fad3b39..44451276f128 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -167,8 +167,8 @@ static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) sector_t blocks, sector = bio->bi_iter.bi_sector; blocks = __nr_blocks(ec, bio); - for (; blocks--; sector += ec->u_bs) - dm_bufio_forget(ec->bufio, __sector_to_block(ec, sector)); + + dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks); } /* Worker funtion to process incoming bios. */ From 35d0c96e422a484bbc5d4921fa20dcc880bfba2c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:42 +0200 Subject: [PATCH 375/427] dm zoned: add debugging message for reading superblocks Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 4a2e351365c5..ef1524d5928a 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1105,6 +1105,10 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) */ static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) { + dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu", + set, zmd->sb[set].dev->name, + zmd->sb[set].block); + return dmz_rdwr_block(zmd->sb[set].dev, REQ_OP_READ, zmd->sb[set].block, zmd->sb[set].mblk->page); } From 1565929b870fe166c5a57a85d6cb5a2bfe1e6c84 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:43 +0200 Subject: [PATCH 376/427] dm zoned: avoid unnecessary device recalulation for secondary superblock The secondary superblock must reside on the same device as the primary superblock, so there is no need to 
re-calculate the device. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index ef1524d5928a..043ed882970a 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1135,7 +1135,7 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) /* Bad first super block: search for the second one */ zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; zmd->sb[1].zone = zmd->sb[0].zone + 1; - zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone); + zmd->sb[1].dev = zmd->sb[0].dev; for (i = 0; i < zmd->nr_rnd_zones - 1; i++) { if (dmz_read_sb(zmd, 1) != 0) break; @@ -1144,7 +1144,6 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) return 0; } zmd->sb[1].block += zone_nr_blocks; - zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone + i); } dmz_free_mblock(zmd, mblk); @@ -1263,7 +1262,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) if (!zmd->sb[1].zone) zmd->sb[1].zone = zmd->sb[0].zone + zmd->nr_meta_zones; zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); - zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone); + zmd->sb[1].dev = zmd->sb[0].dev; ret = dmz_get_sb(zmd, 1); } else ret = dmz_lookup_secondary_sb(zmd); From c3ff479dde9f77d044c164f3ff5443bbe2b6c72d Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:44 +0200 Subject: [PATCH 377/427] dm zoned: improve logging messages for reclaim Instead of just reporting the errno, add some more verbose debugging message in the reclaim path. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-reclaim.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 571bc1d41bab..fd4d47dfcea1 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -371,8 +371,11 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) /* Get a data zone */ dzone = dmz_get_zone_for_reclaim(zmd, dmz_target_idle(zrc)); - if (!dzone) + if (!dzone) { + DMDEBUG("(%s): No zone found to reclaim", + dmz_metadata_label(zmd)); return -EBUSY; + } start = jiffies; if (dmz_is_cache(dzone) || dmz_is_rnd(dzone)) { @@ -416,6 +419,12 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) } out: if (ret) { + if (ret == -EINTR) + DMDEBUG("(%s): reclaim zone %u interrupted", + dmz_metadata_label(zmd), rzone->id); + else + DMDEBUG("(%s): Failed to reclaim zone %u, err %d", + dmz_metadata_label(zmd), rzone->id, ret); dmz_unlock_zone_reclaim(dzone); return ret; } @@ -519,8 +528,6 @@ static void dmz_reclaim_work(struct work_struct *work) ret = dmz_do_reclaim(zrc); if (ret && ret != -EINTR) { - DMDEBUG("(%s): Reclaim error %d", - dmz_metadata_label(zmd), ret); if (!dmz_check_dev(zmd)) return; } From aec67b4ffa4bea4a02063d9a0f379e5795d6f5dc Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:45 +0200 Subject: [PATCH 378/427] dm zoned: add a 'reserved' zone flag Instead of counting the number of reserved zones in dmz_free_zone(), mark the zone as 'reserved' during allocation and simplify dmz_free_zone(). 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 4 ++-- drivers/md/dm-zoned.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 043ed882970a..0982ab1758a6 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1743,6 +1743,7 @@ next: atomic_inc(&zmd->unmap_nr_rnd); } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) { list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list); + set_bit(DMZ_RESERVED, &dzone->flags); atomic_inc(&zmd->nr_reserved_seq_zones); zmd->nr_seq--; } else { @@ -2168,8 +2169,7 @@ void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone) } else if (dmz_is_rnd(zone)) { list_add_tail(&zone->link, &zmd->unmap_rnd_list); atomic_inc(&zmd->unmap_nr_rnd); - } else if (atomic_read(&zmd->nr_reserved_seq_zones) < - zmd->nr_reserved_seq) { + } else if (dmz_is_reserved(zone)) { list_add_tail(&zone->link, &zmd->reserved_seq_zones_list); atomic_inc(&zmd->nr_reserved_seq_zones); } else { diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 8083607b9535..3451b5a768b4 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -123,6 +123,7 @@ enum { DMZ_META, DMZ_DATA, DMZ_BUF, + DMZ_RESERVED, /* Zone internal state */ DMZ_RECLAIM, @@ -140,6 +141,7 @@ enum { #define dmz_is_offline(z) test_bit(DMZ_OFFLINE, &(z)->flags) #define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags) #define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags) +#define dmz_is_reserved(z) test_bit(DMZ_RESERVED, &(z)->flags) #define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags) #define dmz_reclaim_should_terminate(z) \ test_bit(DMZ_RECLAIM_TERMINATE, &(z)->flags) From a92fbc446d1a93950b7e25bec6ad75dd26f01ba8 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:46 +0200 Subject: [PATCH 379/427] dm zoned: 
convert to xarray The zones array is getting really large, and large arrays tend to wreak havoc with the CPU caches. So convert it to xarray to become more cache friendly. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Colin Ian King # fix leak in dmz_insert Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 122 ++++++++++++++++++++++++--------- 1 file changed, 90 insertions(+), 32 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 0982ab1758a6..b235283a8846 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -172,7 +172,7 @@ struct dmz_metadata { unsigned int nr_chunks; /* Zone information array */ - struct dm_zone *zones; + struct xarray zones; struct dmz_sb sb[3]; unsigned int mblk_primary; @@ -327,6 +327,32 @@ unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd) return atomic_read(&zmd->unmap_nr_seq); } +static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) +{ + return xa_load(&zmd->zones, zone_id); +} + +static struct dm_zone *dmz_insert(struct dmz_metadata *zmd, + unsigned int zone_id) +{ + struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL); + + if (!zone) + return ERR_PTR(-ENOMEM); + + if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) { + kfree(zone); + return ERR_PTR(-EBUSY); + } + + INIT_LIST_HEAD(&zone->link); + atomic_set(&zone->refcount, 0); + zone->id = zone_id; + zone->chunk = DMZ_MAP_UNMAPPED; + + return zone; +} + const char *dmz_metadata_label(struct dmz_metadata *zmd) { return (const char *)zmd->label; @@ -1122,6 +1148,7 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) { unsigned int zone_nr_blocks = zmd->zone_nr_blocks; struct dmz_mblock *mblk; + unsigned int zone_id = zmd->sb[0].zone->id; int i; /* Allocate a block */ @@ -1134,16 +1161,15 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) /* Bad first super block: search for the second one */ 
zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; - zmd->sb[1].zone = zmd->sb[0].zone + 1; + zmd->sb[1].zone = dmz_get(zmd, zone_id + 1); zmd->sb[1].dev = zmd->sb[0].dev; - for (i = 0; i < zmd->nr_rnd_zones - 1; i++) { + for (i = 1; i < zmd->nr_rnd_zones; i++) { if (dmz_read_sb(zmd, 1) != 0) break; - if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) { - zmd->sb[1].zone += i; + if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) return 0; - } zmd->sb[1].block += zone_nr_blocks; + zmd->sb[1].zone = dmz_get(zmd, zone_id + i); } dmz_free_mblock(zmd, mblk); @@ -1259,8 +1285,12 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* Read and check secondary super block */ if (ret == 0) { sb_good[0] = true; - if (!zmd->sb[1].zone) - zmd->sb[1].zone = zmd->sb[0].zone + zmd->nr_meta_zones; + if (!zmd->sb[1].zone) { + unsigned int zone_id = + zmd->sb[0].zone->id + zmd->nr_meta_zones; + + zmd->sb[1].zone = dmz_get(zmd, zone_id); + } zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); zmd->sb[1].dev = zmd->sb[0].dev; ret = dmz_get_sb(zmd, 1); @@ -1341,7 +1371,11 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) struct dmz_metadata *zmd = data; struct dmz_dev *dev = zmd->nr_devs > 1 ? 
&zmd->dev[1] : &zmd->dev[0]; int idx = num + dev->zone_offset; - struct dm_zone *zone = &zmd->zones[idx]; + struct dm_zone *zone; + + zone = dmz_insert(zmd, idx); + if (IS_ERR(zone)) + return PTR_ERR(zone); if (blkz->len != zmd->zone_nr_sectors) { if (zmd->sb_version > 1) { @@ -1353,11 +1387,6 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) return -ENXIO; } - INIT_LIST_HEAD(&zone->link); - atomic_set(&zone->refcount, 0); - zone->id = idx; - zone->chunk = DMZ_MAP_UNMAPPED; - switch (blkz->type) { case BLK_ZONE_TYPE_CONVENTIONAL: set_bit(DMZ_RND, &zone->flags); @@ -1397,18 +1426,17 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) return 0; } -static void dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) +static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) { int idx; sector_t zone_offset = 0; for(idx = 0; idx < dev->nr_zones; idx++) { - struct dm_zone *zone = &zmd->zones[idx]; + struct dm_zone *zone; - INIT_LIST_HEAD(&zone->link); - atomic_set(&zone->refcount, 0); - zone->id = idx; - zone->chunk = DMZ_MAP_UNMAPPED; + zone = dmz_insert(zmd, idx); + if (IS_ERR(zone)) + return PTR_ERR(zone); set_bit(DMZ_CACHE, &zone->flags); zone->wp_block = 0; zmd->nr_cache_zones++; @@ -1420,6 +1448,7 @@ static void dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) } zone_offset += zmd->zone_nr_sectors; } + return 0; } /* @@ -1427,8 +1456,15 @@ static void dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) */ static void dmz_drop_zones(struct dmz_metadata *zmd) { - kfree(zmd->zones); - zmd->zones = NULL; + int idx; + + for(idx = 0; idx < zmd->nr_zones; idx++) { + struct dm_zone *zone = xa_load(&zmd->zones, idx); + + kfree(zone); + xa_erase(&zmd->zones, idx); + } + xa_destroy(&zmd->zones); } /* @@ -1460,20 +1496,25 @@ static int dmz_init_zones(struct dmz_metadata *zmd) DMERR("(%s): No zones found", zmd->devname); return -ENXIO; } - zmd->zones = 
kcalloc(zmd->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); - if (!zmd->zones) - return -ENOMEM; + xa_init(&zmd->zones); DMDEBUG("(%s): Using %zu B for zone information", zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); if (zmd->nr_devs > 1) { - dmz_emulate_zones(zmd, &zmd->dev[0]); + ret = dmz_emulate_zones(zmd, &zmd->dev[0]); + if (ret < 0) { + DMDEBUG("(%s): Failed to emulate zones, error %d", + zmd->devname, ret); + dmz_drop_zones(zmd); + return ret; + } + /* * Primary superblock zone is always at zone 0 when multiple * drives are present. */ - zmd->sb[0].zone = &zmd->zones[0]; + zmd->sb[0].zone = dmz_get(zmd, 0); zoned_dev = &zmd->dev[1]; } @@ -1576,11 +1617,6 @@ static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, return 0; } -static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) -{ - return &zmd->zones[zone_id]; -} - /* * Reset a zone write pointer. */ @@ -1662,6 +1698,11 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) } dzone = dmz_get(zmd, dzone_id); + if (!dzone) { + dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present", + chunk, dzone_id); + return -EIO; + } set_bit(DMZ_DATA, &dzone->flags); dzone->chunk = chunk; dmz_get_zone_weight(zmd, dzone); @@ -1685,6 +1726,11 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) } bzone = dmz_get(zmd, bzone_id); + if (!bzone) { + dmz_zmd_err(zmd, "Chunk %u mapping: buffer zone %u not present", + chunk, bzone_id); + return -EIO; + } if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) { dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u", chunk, bzone_id); @@ -1715,6 +1761,8 @@ next: */ for (i = 0; i < zmd->nr_zones; i++) { dzone = dmz_get(zmd, i); + if (!dzone) + continue; if (dmz_is_meta(dzone)) continue; if (dmz_is_offline(dzone)) @@ -1978,6 +2026,10 @@ again: } else { /* The chunk is already mapped: get the mapping zone */ dzone = dmz_get(zmd, dzone_id); + if (!dzone) { + dzone = ERR_PTR(-EIO); + goto out; + } if (dzone->chunk != chunk) { dzone = 
ERR_PTR(-EIO); goto out; @@ -2794,6 +2846,12 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, /* Set metadata zones starting from sb_zone */ for (i = 0; i < zmd->nr_meta_zones << 1; i++) { zone = dmz_get(zmd, zmd->sb[0].zone->id + i); + if (!zone) { + dmz_zmd_err(zmd, + "metadata zone %u not present", i); + ret = -ENXIO; + goto err; + } if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) { dmz_zmd_err(zmd, "metadata zone %d is not random", i); From 5d2c74f3ddc010b5812e556715f7605201eff101 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:47 +0200 Subject: [PATCH 380/427] dm zoned: allocate temporary superblock for tertiary devices Checking the tertiary superblock just consists of validating UUIDs, crcs, and the generation number; it doesn't have contents which would be required during the actual operation. So allocate a temporary superblock when checking tertiary devices to avoid having to store it together with the 'real' superblocks. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 109 ++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 48 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index b235283a8846..525ac0d80287 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -174,7 +174,7 @@ struct dmz_metadata { /* Zone information array */ struct xarray zones; - struct dmz_sb sb[3]; + struct dmz_sb sb[2]; unsigned int mblk_primary; unsigned int sb_version; u64 sb_gen; @@ -1016,10 +1016,11 @@ err: /* * Check super block. 
*/ -static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) +static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb, + bool tertiary) { - struct dmz_super *sb = zmd->sb[set].sb; - struct dmz_dev *dev = zmd->sb[set].dev; + struct dmz_super *sb = dsb->sb; + struct dmz_dev *dev = dsb->dev; unsigned int nr_meta_zones, nr_data_zones; u32 crc, stored_crc; u64 gen; @@ -1036,7 +1037,7 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) DMZ_META_VER, zmd->sb_version); return -EINVAL; } - if ((zmd->sb_version < 1) && (set == 2)) { + if (zmd->sb_version < 2 && tertiary) { dmz_dev_err(dev, "Tertiary superblocks are not supported"); return -EINVAL; } @@ -1080,7 +1081,7 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) return -ENXIO; } - if (set == 2) { + if (tertiary) { /* * Generation number should be 0, but it doesn't * really matter if it isn't. @@ -1129,14 +1130,13 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) /* * Read the first or second super block from disk. 
*/ -static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) +static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set) { dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu", - set, zmd->sb[set].dev->name, - zmd->sb[set].block); + set, sb->dev->name, sb->block); - return dmz_rdwr_block(zmd->sb[set].dev, REQ_OP_READ, - zmd->sb[set].block, zmd->sb[set].mblk->page); + return dmz_rdwr_block(sb->dev, REQ_OP_READ, + sb->block, sb->mblk->page); } /* @@ -1164,7 +1164,7 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) zmd->sb[1].zone = dmz_get(zmd, zone_id + 1); zmd->sb[1].dev = zmd->sb[0].dev; for (i = 1; i < zmd->nr_rnd_zones; i++) { - if (dmz_read_sb(zmd, 1) != 0) + if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0) break; if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) return 0; @@ -1181,9 +1181,9 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) } /* - * Read the first or second super block from disk. + * Read a super block from disk. 
*/ -static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set) +static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set) { struct dmz_mblock *mblk; int ret; @@ -1193,14 +1193,14 @@ static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set) if (!mblk) return -ENOMEM; - zmd->sb[set].mblk = mblk; - zmd->sb[set].sb = mblk->data; + sb->mblk = mblk; + sb->sb = mblk->data; /* Read super block */ - ret = dmz_read_sb(zmd, set); + ret = dmz_read_sb(zmd, sb, set); if (ret) { dmz_free_mblock(zmd, mblk); - zmd->sb[set].mblk = NULL; + sb->mblk = NULL; return ret; } @@ -1274,13 +1274,13 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* Read and check the primary super block */ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); zmd->sb[0].dev = dmz_zone_to_dev(zmd, zmd->sb[0].zone); - ret = dmz_get_sb(zmd, 0); + ret = dmz_get_sb(zmd, &zmd->sb[0], 0); if (ret) { dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed"); return ret; } - ret = dmz_check_sb(zmd, 0); + ret = dmz_check_sb(zmd, &zmd->sb[0], false); /* Read and check secondary super block */ if (ret == 0) { @@ -1293,7 +1293,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) } zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); zmd->sb[1].dev = zmd->sb[0].dev; - ret = dmz_get_sb(zmd, 1); + ret = dmz_get_sb(zmd, &zmd->sb[1], 1); } else ret = dmz_lookup_secondary_sb(zmd); @@ -1302,7 +1302,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) return ret; } - ret = dmz_check_sb(zmd, 1); + ret = dmz_check_sb(zmd, &zmd->sb[1], false); if (ret == 0) sb_good[1] = true; @@ -1347,20 +1347,40 @@ static int dmz_load_sb(struct dmz_metadata *zmd) "Using super block %u (gen %llu)", zmd->mblk_primary, zmd->sb_gen); - if ((zmd->sb_version > 1) && zmd->sb[2].zone) { - zmd->sb[2].block = dmz_start_block(zmd, zmd->sb[2].zone); - zmd->sb[2].dev = dmz_zone_to_dev(zmd, zmd->sb[2].zone); - ret = dmz_get_sb(zmd, 2); - if (ret) { - dmz_dev_err(zmd->sb[2].dev, - "Read tertiary super block 
failed"); - return ret; + if (zmd->sb_version > 1) { + int i; + struct dmz_sb *sb; + + sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL); + if (!sb) + return -ENOMEM; + for (i = 1; i < zmd->nr_devs; i++) { + sb->block = 0; + sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset); + sb->dev = &zmd->dev[i]; + if (!dmz_is_meta(sb->zone)) { + dmz_dev_err(sb->dev, + "Tertiary super block zone %u not marked as metadata zone", + sb->zone->id); + ret = -EINVAL; + goto out_kfree; + } + ret = dmz_get_sb(zmd, sb, i + 1); + if (ret) { + dmz_dev_err(sb->dev, + "Read tertiary super block failed"); + dmz_free_mblock(zmd, sb->mblk); + goto out_kfree; + } + ret = dmz_check_sb(zmd, sb, true); + dmz_free_mblock(zmd, sb->mblk); + if (ret == -EINVAL) + goto out_kfree; } - ret = dmz_check_sb(zmd, 2); - if (ret == -EINVAL) - return ret; + out_kfree: + kfree(sb); } - return 0; + return ret; } /* @@ -1417,12 +1437,15 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) zmd->sb[0].zone = zone; } } - if (zmd->nr_devs > 1 && !zmd->sb[2].zone) { - /* Tertiary superblock zone */ - zmd->sb[2].zone = zone; + if (zmd->nr_devs > 1 && num == 0) { + /* + * Tertiary superblock zones are always at the + * start of the zoned devices, so mark them + * as metadata zone. 
+ */ + set_bit(DMZ_META, &zone->flags); } } - return 0; } @@ -2860,16 +2883,6 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, } set_bit(DMZ_META, &zone->flags); } - if (zmd->sb[2].zone) { - zone = dmz_get(zmd, zmd->sb[2].zone->id); - if (!zone) { - dmz_zmd_err(zmd, - "Tertiary metadata zone not present"); - ret = -ENXIO; - goto err; - } - set_bit(DMZ_META, &zone->flags); - } /* Load mapping table */ ret = dmz_load_mapping(zmd); if (ret) From 8f22272af7a72763fe9f6b710cdcc380fed80f75 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:48 +0200 Subject: [PATCH 381/427] dm zoned: add device pointer to struct dm_zone Add a pointer, to the containing device, within struct dm_zone and kill dmz_zone_to_dev(). Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 39 +++++++++------------------------- drivers/md/dm-zoned-reclaim.c | 13 +++++------- drivers/md/dm-zoned-target.c | 2 +- drivers/md/dm-zoned.h | 4 +++- 4 files changed, 19 insertions(+), 39 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 525ac0d80287..68d44506e6f3 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -229,16 +229,10 @@ struct dmz_metadata { */ static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone) { - unsigned int zone_id; - if (WARN_ON(!zone)) return 0; - zone_id = zone->id; - if (zmd->nr_devs > 1 && - (zone_id >= zmd->dev[1].zone_offset)) - zone_id -= zmd->dev[1].zone_offset; - return zone_id; + return zone->id - zone->dev->zone_offset; } sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) @@ -255,18 +249,6 @@ sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) return (sector_t)zone_id << zmd->zone_nr_blocks_shift; } -struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone) -{ - if (WARN_ON(!zone)) - return &zmd->dev[0]; - - 
if (zmd->nr_devs > 1 && - zone->id >= zmd->dev[1].zone_offset) - return &zmd->dev[1]; - - return &zmd->dev[0]; -} - unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd) { return zmd->zone_nr_blocks; @@ -333,7 +315,7 @@ static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) } static struct dm_zone *dmz_insert(struct dmz_metadata *zmd, - unsigned int zone_id) + unsigned int zone_id, struct dmz_dev *dev) { struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL); @@ -349,6 +331,7 @@ static struct dm_zone *dmz_insert(struct dmz_metadata *zmd, atomic_set(&zone->refcount, 0); zone->id = zone_id; zone->chunk = DMZ_MAP_UNMAPPED; + zone->dev = dev; return zone; } @@ -1273,7 +1256,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* Read and check the primary super block */ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); - zmd->sb[0].dev = dmz_zone_to_dev(zmd, zmd->sb[0].zone); + zmd->sb[0].dev = zmd->sb[0].zone->dev; ret = dmz_get_sb(zmd, &zmd->sb[0], 0); if (ret) { dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed"); @@ -1393,7 +1376,7 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) int idx = num + dev->zone_offset; struct dm_zone *zone; - zone = dmz_insert(zmd, idx); + zone = dmz_insert(zmd, idx, dev); if (IS_ERR(zone)) return PTR_ERR(zone); @@ -1457,7 +1440,7 @@ static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) for(idx = 0; idx < dev->nr_zones; idx++) { struct dm_zone *zone; - zone = dmz_insert(zmd, idx); + zone = dmz_insert(zmd, idx, dev); if (IS_ERR(zone)) return PTR_ERR(zone); set_bit(DMZ_CACHE, &zone->flags); @@ -1583,7 +1566,7 @@ static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx, */ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) { - struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); + struct dmz_dev *dev = zone->dev; unsigned int noio_flag; int ret; @@ -1620,7 +1603,7 @@ static int 
dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, struct dm_zone *zone) { - struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); + struct dmz_dev *dev = zone->dev; unsigned int wp = 0; int ret; @@ -1657,7 +1640,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) return 0; if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { - struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); + struct dmz_dev *dev = zone->dev; ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, dmz_start_sect(zmd, zone), @@ -2218,9 +2201,7 @@ again: goto again; } if (dmz_is_meta(zone)) { - struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); - - dmz_dev_warn(dev, "Zone %u has metadata", zone->id); + dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id); zone = NULL; goto again; } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index fd4d47dfcea1..e9e3b730e258 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -58,7 +58,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, sector_t block) { struct dmz_metadata *zmd = zrc->metadata; - struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); + struct dmz_dev *dev = zone->dev; sector_t wp_block = zone->wp_block; unsigned int nr_blocks; int ret; @@ -116,7 +116,6 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, struct dm_zone *src_zone, struct dm_zone *dst_zone) { struct dmz_metadata *zmd = zrc->metadata; - struct dmz_dev *src_dev, *dst_dev; struct dm_io_region src, dst; sector_t block = 0, end_block; sector_t nr_blocks; @@ -130,17 +129,15 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, else end_block = dmz_zone_nr_blocks(zmd); src_zone_block = dmz_start_block(zmd, src_zone); - src_dev = dmz_zone_to_dev(zmd, src_zone); dst_zone_block = dmz_start_block(zmd, dst_zone); - dst_dev = dmz_zone_to_dev(zmd, dst_zone); if (dmz_is_seq(dst_zone)) set_bit(DM_KCOPYD_WRITE_SEQ, 
&flags); while (block < end_block) { - if (src_dev->flags & DMZ_BDEV_DYING) + if (src_zone->dev->flags & DMZ_BDEV_DYING) return -EIO; - if (dst_dev->flags & DMZ_BDEV_DYING) + if (dst_zone->dev->flags & DMZ_BDEV_DYING) return -EIO; if (dmz_reclaim_should_terminate(src_zone)) @@ -163,11 +160,11 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, return ret; } - src.bdev = src_dev->bdev; + src.bdev = src_zone->dev->bdev; src.sector = dmz_blk2sect(src_zone_block + block); src.count = dmz_blk2sect(nr_blocks); - dst.bdev = dst_dev->bdev; + dst.bdev = dst_zone->dev->bdev; dst.sector = dmz_blk2sect(dst_zone_block + block); dst.count = src.count; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 2770e293a97b..087dd4801663 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -123,7 +123,7 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone, { struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); - struct dmz_dev *dev = dmz_zone_to_dev(dmz->metadata, zone); + struct dmz_dev *dev = zone->dev; struct bio *clone; if (dev->flags & DMZ_BDEV_DYING) diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 3451b5a768b4..316344bf07bd 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -80,6 +80,9 @@ struct dm_zone { /* For listing the zone depending on its state */ struct list_head link; + /* Device containing this zone */ + struct dmz_dev *dev; + /* Zone type and state */ unsigned long flags; @@ -190,7 +193,6 @@ const char *dmz_metadata_label(struct dmz_metadata *zmd); sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_chunks(struct dmz_metadata *zmd); -struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone); bool dmz_check_dev(struct dmz_metadata *zmd); bool dmz_dev_is_dying(struct dmz_metadata *zmd); From 
18979819b57ecbc598a8e27d925ab4bb9e145cf0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:49 +0200 Subject: [PATCH 382/427] dm zoned: add metadata pointer to struct dmz_dev Add a metadata pointer within struct dmz_dev and use it as argument for blkdev_report_zones() instead of the metadata itself. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 14 +++++++++----- drivers/md/dm-zoned.h | 7 ++++--- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 68d44506e6f3..71f263a78515 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1371,8 +1371,8 @@ static int dmz_load_sb(struct dmz_metadata *zmd) */ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) { - struct dmz_metadata *zmd = data; - struct dmz_dev *dev = zmd->nr_devs > 1 ? &zmd->dev[1] : &zmd->dev[0]; + struct dmz_dev *dev = data; + struct dmz_metadata *zmd = dev->metadata; int idx = num + dev->zone_offset; struct dm_zone *zone; @@ -1495,8 +1495,12 @@ static int dmz_init_zones(struct dmz_metadata *zmd) /* Allocate zone array */ zmd->nr_zones = 0; - for (i = 0; i < zmd->nr_devs; i++) - zmd->nr_zones += zmd->dev[i].nr_zones; + for (i = 0; i < zmd->nr_devs; i++) { + struct dmz_dev *dev = &zmd->dev[i]; + + dev->metadata = zmd; + zmd->nr_zones += dev->nr_zones; + } if (!zmd->nr_zones) { DMERR("(%s): No zones found", zmd->devname); @@ -1531,7 +1535,7 @@ static int dmz_init_zones(struct dmz_metadata *zmd) * first randomly writable zone. 
*/ ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES, - dmz_init_zone, zmd); + dmz_init_zone, zoned_dev); if (ret < 0) { DMDEBUG("(%s): Failed to report zones, error %d", zmd->devname, ret); diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 316344bf07bd..983f5b5e9fa0 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -45,11 +45,15 @@ #define dmz_bio_block(bio) dmz_sect2blk((bio)->bi_iter.bi_sector) #define dmz_bio_blocks(bio) dmz_sect2blk(bio_sectors(bio)) +struct dmz_metadata; +struct dmz_reclaim; + /* * Zoned block device information. */ struct dmz_dev { struct block_device *bdev; + struct dmz_metadata *metadata; char name[BDEVNAME_SIZE]; uuid_t uuid; @@ -170,9 +174,6 @@ enum { #define dmz_dev_debug(dev, format, args...) \ DMDEBUG("(%s): " format, (dev)->name, ## args) -struct dmz_metadata; -struct dmz_reclaim; - /* * Functions defined in dm-zoned-metadata.c */ From f97809aec58995a87a9a30cb45c9a6148377df64 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:50 +0200 Subject: [PATCH 383/427] dm zoned: per-device reclaim Instead of having one reclaim workqueue for the entire set we should be allocating a reclaim workqueue per device; doing so will reduce contention and should boost performance for a multi-device setup. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-reclaim.c | 66 ++++++++++++++++++++++------------- drivers/md/dm-zoned-target.c | 41 ++++++++++++++-------- drivers/md/dm-zoned.h | 38 ++++++++++---------- 3 files changed, 88 insertions(+), 57 deletions(-) diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index e9e3b730e258..09843645248a 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -21,6 +21,8 @@ struct dmz_reclaim { struct dm_kcopyd_throttle kc_throttle; int kc_err; + int dev_idx; + unsigned long flags; /* Last target access time */ @@ -198,8 +200,8 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dmz_metadata *zmd = zrc->metadata; int ret; - DMDEBUG("(%s): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", - dmz_metadata_label(zmd), + DMDEBUG("(%s/%u): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", + dmz_metadata_label(zmd), zrc->dev_idx, dzone->chunk, bzone->id, dmz_weight(bzone), dzone->id, dmz_weight(dzone)); @@ -237,8 +239,8 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dmz_metadata *zmd = zrc->metadata; int ret = 0; - DMDEBUG("(%s): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", - dmz_metadata_label(zmd), + DMDEBUG("(%s/%u): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", + dmz_metadata_label(zmd), zrc->dev_idx, chunk, dzone->id, dmz_weight(dzone), bzone->id, dmz_weight(bzone)); @@ -295,8 +297,8 @@ again: if (!szone) return -ENOSPC; - DMDEBUG("(%s): Chunk %u, move %s zone %u (weight %u) to %s zone %u", - dmz_metadata_label(zmd), chunk, + DMDEBUG("(%s/%u): Chunk %u, move %s zone %u (weight %u) to %s zone %u", + dmz_metadata_label(zmd), zrc->dev_idx, chunk, dmz_is_cache(dzone) ? "cache" : "rnd", dzone->id, dmz_weight(dzone), dmz_is_rnd(szone) ? 
"rnd" : "seq", szone->id); @@ -369,8 +371,8 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) /* Get a data zone */ dzone = dmz_get_zone_for_reclaim(zmd, dmz_target_idle(zrc)); if (!dzone) { - DMDEBUG("(%s): No zone found to reclaim", - dmz_metadata_label(zmd)); + DMDEBUG("(%s/%u): No zone found to reclaim", + dmz_metadata_label(zmd), zrc->dev_idx); return -EBUSY; } @@ -417,24 +419,26 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) out: if (ret) { if (ret == -EINTR) - DMDEBUG("(%s): reclaim zone %u interrupted", - dmz_metadata_label(zmd), rzone->id); + DMDEBUG("(%s/%u): reclaim zone %u interrupted", + dmz_metadata_label(zmd), zrc->dev_idx, + rzone->id); else - DMDEBUG("(%s): Failed to reclaim zone %u, err %d", - dmz_metadata_label(zmd), rzone->id, ret); + DMDEBUG("(%s/%u): Failed to reclaim zone %u, err %d", + dmz_metadata_label(zmd), zrc->dev_idx, + rzone->id, ret); dmz_unlock_zone_reclaim(dzone); return ret; } ret = dmz_flush_metadata(zrc->metadata); if (ret) { - DMDEBUG("(%s): Metadata flush for zone %u failed, err %d", - dmz_metadata_label(zmd), rzone->id, ret); + DMDEBUG("(%s/%u): Metadata flush for zone %u failed, err %d", + dmz_metadata_label(zmd), zrc->dev_idx, rzone->id, ret); return ret; } - DMDEBUG("(%s): Reclaimed zone %u in %u ms", - dmz_metadata_label(zmd), + DMDEBUG("(%s/%u): Reclaimed zone %u in %u ms", + dmz_metadata_label(zmd), zrc->dev_idx, rzone->id, jiffies_to_msecs(jiffies - start)); return 0; } @@ -461,10 +465,20 @@ static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) */ static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap) { - unsigned int nr_reclaim = dmz_nr_rnd_zones(zrc->metadata); + unsigned int nr_reclaim; - if (dmz_nr_cache_zones(zrc->metadata)) + nr_reclaim = dmz_nr_rnd_zones(zrc->metadata); + + if (dmz_nr_cache_zones(zrc->metadata)) { + /* + * The first device in a multi-device + * setup only contains cache zones, so + * never start reclaim there. 
+ */ + if (zrc->dev_idx == 0) + return false; nr_reclaim += dmz_nr_cache_zones(zrc->metadata); + } /* Reclaim when idle */ if (dmz_target_idle(zrc) && nr_reclaim) @@ -488,7 +502,7 @@ static void dmz_reclaim_work(struct work_struct *work) { struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work); struct dmz_metadata *zmd = zrc->metadata; - unsigned int p_unmap; + unsigned int p_unmap, nr_unmap_rnd = 0, nr_rnd = 0; int ret; if (dmz_dev_is_dying(zmd)) @@ -514,8 +528,11 @@ static void dmz_reclaim_work(struct work_struct *work) zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2); } - DMDEBUG("(%s): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", - dmz_metadata_label(zmd), + nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd); + nr_rnd = dmz_nr_rnd_zones(zmd); + + DMDEBUG("(%s/%u): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", + dmz_metadata_label(zmd), zrc->dev_idx, zrc->kc_throttle.throttle, (dmz_target_idle(zrc) ? "Idle" : "Busy"), p_unmap, dmz_nr_unmap_cache_zones(zmd), @@ -536,7 +553,7 @@ static void dmz_reclaim_work(struct work_struct *work) * Initialize reclaim. 
*/ int dmz_ctr_reclaim(struct dmz_metadata *zmd, - struct dmz_reclaim **reclaim) + struct dmz_reclaim **reclaim, int idx) { struct dmz_reclaim *zrc; int ret; @@ -547,6 +564,7 @@ int dmz_ctr_reclaim(struct dmz_metadata *zmd, zrc->metadata = zmd; zrc->atime = jiffies; + zrc->dev_idx = idx; /* Reclaim kcopyd client */ zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle); @@ -558,8 +576,8 @@ int dmz_ctr_reclaim(struct dmz_metadata *zmd, /* Reclaim work */ INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work); - zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM, - dmz_metadata_label(zmd)); + zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s_%d", WQ_MEM_RECLAIM, + dmz_metadata_label(zmd), idx); if (!zrc->wq) { ret = -ENOMEM; goto err; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 087dd4801663..97d63d8e6c19 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -41,6 +41,7 @@ struct dm_chunk_work { */ struct dmz_target { struct dm_dev *ddev[DMZ_MAX_DEVS]; + unsigned int nr_ddevs; unsigned long flags; @@ -50,9 +51,6 @@ struct dmz_target { /* For metadata handling */ struct dmz_metadata *metadata; - /* For reclaim */ - struct dmz_reclaim *reclaim; - /* For chunk work */ struct radix_tree_root chunk_rxtree; struct workqueue_struct *chunk_wq; @@ -404,14 +402,15 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); struct dmz_metadata *zmd = dmz->metadata; struct dm_zone *zone; - int ret; + int i, ret; /* * Write may trigger a zone allocation. So make sure the * allocation can succeed. 
*/ if (bio_op(bio) == REQ_OP_WRITE) - dmz_schedule_reclaim(dmz->reclaim); + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_schedule_reclaim(dmz->dev[i].reclaim); dmz_lock_metadata(zmd); @@ -431,6 +430,7 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, if (zone) { dmz_activate_zone(zone); bioctx->zone = zone; + dmz_reclaim_bio_acc(zone->dev->reclaim); } switch (bio_op(bio)) { @@ -577,7 +577,6 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) bio_list_add(&cw->bio_list, bio); - dmz_reclaim_bio_acc(dmz->reclaim); if (queue_work(dmz->chunk_wq, &cw->work)) dmz_get_chunk_work(cw); out: @@ -822,7 +821,7 @@ static int dmz_fixup_devices(struct dm_target *ti) static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct dmz_target *dmz; - int ret; + int ret, i; /* Check arguments */ if (argc < 1 || argc > 2) { @@ -842,6 +841,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) kfree(dmz); return -ENOMEM; } + dmz->nr_ddevs = argc; ti->private = dmz; /* Get the target zoned block device */ @@ -916,10 +916,12 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD); /* Initialize reclaim */ - ret = dmz_ctr_reclaim(dmz->metadata, &dmz->reclaim); - if (ret) { - ti->error = "Zone reclaim initialization failed"; - goto err_fwq; + for (i = 0; i < dmz->nr_ddevs; i++) { + ret = dmz_ctr_reclaim(dmz->metadata, &dmz->dev[i].reclaim, i); + if (ret) { + ti->error = "Zone reclaim initialization failed"; + goto err_fwq; + } } DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)", @@ -952,11 +954,13 @@ err: static void dmz_dtr(struct dm_target *ti) { struct dmz_target *dmz = ti->private; + int i; flush_workqueue(dmz->chunk_wq); destroy_workqueue(dmz->chunk_wq); - dmz_dtr_reclaim(dmz->reclaim); + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_dtr_reclaim(dmz->dev[i].reclaim); 
cancel_delayed_work_sync(&dmz->flush_work); destroy_workqueue(dmz->flush_wq); @@ -1025,9 +1029,11 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) static void dmz_suspend(struct dm_target *ti) { struct dmz_target *dmz = ti->private; + int i; flush_workqueue(dmz->chunk_wq); - dmz_suspend_reclaim(dmz->reclaim); + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_suspend_reclaim(dmz->dev[i].reclaim); cancel_delayed_work_sync(&dmz->flush_work); } @@ -1037,9 +1043,11 @@ static void dmz_suspend(struct dm_target *ti) static void dmz_resume(struct dm_target *ti) { struct dmz_target *dmz = ti->private; + int i; queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD); - dmz_resume_reclaim(dmz->reclaim); + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_resume_reclaim(dmz->dev[i].reclaim); } static int dmz_iterate_devices(struct dm_target *ti, @@ -1100,7 +1108,10 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv, int r = -EINVAL; if (!strcasecmp(argv[0], "reclaim")) { - dmz_schedule_reclaim(dmz->reclaim); + int i; + + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_schedule_reclaim(dmz->dev[i].reclaim); r = 0; } else DMERR("unrecognized message %s", argv[0]); diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 983f5b5e9fa0..0cc3459f78ce 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -54,6 +54,7 @@ struct dmz_reclaim; struct dmz_dev { struct block_device *bdev; struct dmz_metadata *metadata; + struct dmz_reclaim *reclaim; char name[BDEVNAME_SIZE]; uuid_t uuid; @@ -229,23 +230,6 @@ static inline void dmz_activate_zone(struct dm_zone *zone) atomic_inc(&zone->refcount); } -/* - * Deactivate a zone. This decrement the zone reference counter - * indicating that all BIOs to the zone have completed when the count is 0. - */ -static inline void dmz_deactivate_zone(struct dm_zone *zone) -{ - atomic_dec(&zone->refcount); -} - -/* - * Test if a zone is active, that is, has a refcount > 0. 
- */ -static inline bool dmz_is_active(struct dm_zone *zone) -{ - return atomic_read(&zone->refcount); -} - int dmz_lock_zone_reclaim(struct dm_zone *zone); void dmz_unlock_zone_reclaim(struct dm_zone *zone); struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle); @@ -272,7 +256,7 @@ int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, /* * Functions defined in dm-zoned-reclaim.c */ -int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **zrc); +int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **zrc, int idx); void dmz_dtr_reclaim(struct dmz_reclaim *zrc); void dmz_suspend_reclaim(struct dmz_reclaim *zrc); void dmz_resume_reclaim(struct dmz_reclaim *zrc); @@ -285,4 +269,22 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc); bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev); bool dmz_check_bdev(struct dmz_dev *dmz_dev); +/* + * Deactivate a zone. This decrement the zone reference counter + * indicating that all BIOs to the zone have completed when the count is 0. + */ +static inline void dmz_deactivate_zone(struct dm_zone *zone) +{ + dmz_reclaim_bio_acc(zone->dev->reclaim); + atomic_dec(&zone->refcount); +} + +/* + * Test if a zone is active, that is, has a refcount > 0. + */ +static inline bool dmz_is_active(struct dm_zone *zone) +{ + return atomic_read(&zone->refcount); +} + #endif /* DM_ZONED_H */ From bd82fdabf162fec1404c4e22988b178c4f3dd23b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:51 +0200 Subject: [PATCH 384/427] dm zoned: move random and sequential zones into struct dmz_dev Random and sequential zones should be part of the respective device structure to make arbitration between devices possible. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 139 +++++++++++++++++++-------------- drivers/md/dm-zoned-reclaim.c | 15 ++-- drivers/md/dm-zoned-target.c | 25 ++++-- drivers/md/dm-zoned.h | 18 ++++- 4 files changed, 119 insertions(+), 78 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 71f263a78515..ce17bf3628c6 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -192,21 +192,12 @@ struct dmz_metadata { /* Zone allocation management */ struct mutex map_lock; struct dmz_mblock **map_mblk; - unsigned int nr_rnd; - atomic_t unmap_nr_rnd; - struct list_head unmap_rnd_list; - struct list_head map_rnd_list; unsigned int nr_cache; atomic_t unmap_nr_cache; struct list_head unmap_cache_list; struct list_head map_cache_list; - unsigned int nr_seq; - atomic_t unmap_nr_seq; - struct list_head unmap_seq_list; - struct list_head map_seq_list; - atomic_t nr_reserved_seq_zones; struct list_head reserved_seq_zones_list; @@ -279,14 +270,14 @@ unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) return zmd->nr_chunks; } -unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd) +unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx) { - return zmd->nr_rnd; + return zmd->dev[idx].nr_rnd; } -unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd) +unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx) { - return atomic_read(&zmd->unmap_nr_rnd); + return atomic_read(&zmd->dev[idx].unmap_nr_rnd); } unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd) @@ -299,14 +290,14 @@ unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd) return atomic_read(&zmd->unmap_nr_cache); } -unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd) +unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx) { - return zmd->nr_seq; + return zmd->dev[idx].nr_seq; } -unsigned int dmz_nr_unmap_seq_zones(struct 
dmz_metadata *zmd) +unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx) { - return atomic_read(&zmd->unmap_nr_seq); + return atomic_read(&zmd->dev[idx].unmap_nr_seq); } static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) @@ -1500,6 +1491,14 @@ static int dmz_init_zones(struct dmz_metadata *zmd) dev->metadata = zmd; zmd->nr_zones += dev->nr_zones; + + atomic_set(&dev->unmap_nr_rnd, 0); + INIT_LIST_HEAD(&dev->unmap_rnd_list); + INIT_LIST_HEAD(&dev->map_rnd_list); + + atomic_set(&dev->unmap_nr_seq, 0); + INIT_LIST_HEAD(&dev->unmap_seq_list); + INIT_LIST_HEAD(&dev->map_seq_list); } if (!zmd->nr_zones) { @@ -1720,9 +1719,9 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) if (dmz_is_cache(dzone)) list_add_tail(&dzone->link, &zmd->map_cache_list); else if (dmz_is_rnd(dzone)) - list_add_tail(&dzone->link, &zmd->map_rnd_list); + list_add_tail(&dzone->link, &dzone->dev->map_rnd_list); else - list_add_tail(&dzone->link, &zmd->map_seq_list); + list_add_tail(&dzone->link, &dzone->dev->map_seq_list); /* Check buffer zone */ bzone_id = le32_to_cpu(dmap[e].bzone_id); @@ -1756,7 +1755,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) if (dmz_is_cache(bzone)) list_add_tail(&bzone->link, &zmd->map_cache_list); else - list_add_tail(&bzone->link, &zmd->map_rnd_list); + list_add_tail(&bzone->link, &bzone->dev->map_rnd_list); next: chunk++; e++; @@ -1781,9 +1780,9 @@ next: if (dmz_is_cache(dzone)) zmd->nr_cache++; else if (dmz_is_rnd(dzone)) - zmd->nr_rnd++; + dzone->dev->nr_rnd++; else - zmd->nr_seq++; + dzone->dev->nr_seq++; if (dmz_is_data(dzone)) { /* Already initialized */ @@ -1797,16 +1796,18 @@ next: list_add_tail(&dzone->link, &zmd->unmap_cache_list); atomic_inc(&zmd->unmap_nr_cache); } else if (dmz_is_rnd(dzone)) { - list_add_tail(&dzone->link, &zmd->unmap_rnd_list); - atomic_inc(&zmd->unmap_nr_rnd); + list_add_tail(&dzone->link, + &dzone->dev->unmap_rnd_list); + atomic_inc(&dzone->dev->unmap_nr_rnd); } else if 
(atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) { list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list); set_bit(DMZ_RESERVED, &dzone->flags); atomic_inc(&zmd->nr_reserved_seq_zones); - zmd->nr_seq--; + dzone->dev->nr_seq--; } else { - list_add_tail(&dzone->link, &zmd->unmap_seq_list); - atomic_inc(&zmd->unmap_nr_seq); + list_add_tail(&dzone->link, + &dzone->dev->unmap_seq_list); + atomic_inc(&dzone->dev->unmap_nr_seq); } } @@ -1840,13 +1841,13 @@ static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) list_del_init(&zone->link); if (dmz_is_seq(zone)) { /* LRU rotate sequential zone */ - list_add_tail(&zone->link, &zmd->map_seq_list); + list_add_tail(&zone->link, &zone->dev->map_seq_list); } else if (dmz_is_cache(zone)) { /* LRU rotate cache zone */ list_add_tail(&zone->link, &zmd->map_cache_list); } else { /* LRU rotate random zone */ - list_add_tail(&zone->link, &zmd->map_rnd_list); + list_add_tail(&zone->link, &zone->dev->map_rnd_list); } } @@ -1928,14 +1929,24 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, { struct dm_zone *dzone = NULL; struct dm_zone *zone; - struct list_head *zone_list = &zmd->map_rnd_list; + struct list_head *zone_list; /* If we have cache zones select from the cache zone list */ if (zmd->nr_cache) { zone_list = &zmd->map_cache_list; /* Try to relaim random zones, too, when idle */ - if (idle && list_empty(zone_list)) - zone_list = &zmd->map_rnd_list; + if (idle && list_empty(zone_list)) { + int i; + + for (i = 1; i < zmd->nr_devs; i++) { + zone_list = &zmd->dev[i].map_rnd_list; + if (!list_empty(zone_list)) + break; + } + } + } else { + /* Otherwise the random zones are on the first disk */ + zone_list = &zmd->dev[0].map_rnd_list; } list_for_each_entry(zone, zone_list, link) { @@ -1956,12 +1967,17 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) { struct 
dm_zone *zone; + int i; - list_for_each_entry(zone, &zmd->map_seq_list, link) { - if (!zone->bzone) - continue; - if (dmz_lock_zone_reclaim(zone)) - return zone; + for (i = 0; i < zmd->nr_devs; i++) { + struct dmz_dev *dev = &zmd->dev[i]; + + list_for_each_entry(zone, &dev->map_seq_list, link) { + if (!zone->bzone) + continue; + if (dmz_lock_zone_reclaim(zone)) + return zone; + } } return NULL; @@ -2147,7 +2163,7 @@ again: if (dmz_is_cache(bzone)) list_add_tail(&bzone->link, &zmd->map_cache_list); else - list_add_tail(&bzone->link, &zmd->map_rnd_list); + list_add_tail(&bzone->link, &bzone->dev->map_rnd_list); out: dmz_unlock_map(zmd); @@ -2162,21 +2178,27 @@ struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags) { struct list_head *list; struct dm_zone *zone; + unsigned int dev_idx = 0; +again: if (flags & DMZ_ALLOC_CACHE) list = &zmd->unmap_cache_list; else if (flags & DMZ_ALLOC_RND) - list = &zmd->unmap_rnd_list; + list = &zmd->dev[dev_idx].unmap_rnd_list; else - list = &zmd->unmap_seq_list; + list = &zmd->dev[dev_idx].unmap_seq_list; -again: if (list_empty(list)) { /* * No free zone: return NULL if this is for not reclaim. 
*/ if (!(flags & DMZ_ALLOC_RECLAIM)) return NULL; + if (dev_idx < zmd->nr_devs) { + dev_idx++; + goto again; + } + /* * Fallback to the reserved sequential zones */ @@ -2195,9 +2217,9 @@ again: if (dmz_is_cache(zone)) atomic_dec(&zmd->unmap_nr_cache); else if (dmz_is_rnd(zone)) - atomic_dec(&zmd->unmap_nr_rnd); + atomic_dec(&zone->dev->unmap_nr_rnd); else - atomic_dec(&zmd->unmap_nr_seq); + atomic_dec(&zone->dev->unmap_nr_seq); if (dmz_is_offline(zone)) { dmz_zmd_warn(zmd, "Zone %u is offline", zone->id); @@ -2227,14 +2249,14 @@ void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone) list_add_tail(&zone->link, &zmd->unmap_cache_list); atomic_inc(&zmd->unmap_nr_cache); } else if (dmz_is_rnd(zone)) { - list_add_tail(&zone->link, &zmd->unmap_rnd_list); - atomic_inc(&zmd->unmap_nr_rnd); + list_add_tail(&zone->link, &zone->dev->unmap_rnd_list); + atomic_inc(&zone->dev->unmap_nr_rnd); } else if (dmz_is_reserved(zone)) { list_add_tail(&zone->link, &zmd->reserved_seq_zones_list); atomic_inc(&zmd->nr_reserved_seq_zones); } else { - list_add_tail(&zone->link, &zmd->unmap_seq_list); - atomic_inc(&zmd->unmap_nr_seq); + list_add_tail(&zone->link, &zone->dev->unmap_seq_list); + atomic_inc(&zone->dev->unmap_nr_seq); } wake_up_all(&zmd->free_wq); @@ -2254,9 +2276,9 @@ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone, if (dmz_is_cache(dzone)) list_add_tail(&dzone->link, &zmd->map_cache_list); else if (dmz_is_rnd(dzone)) - list_add_tail(&dzone->link, &zmd->map_rnd_list); + list_add_tail(&dzone->link, &dzone->dev->map_rnd_list); else - list_add_tail(&dzone->link, &zmd->map_seq_list); + list_add_tail(&dzone->link, &dzone->dev->map_seq_list); } /* @@ -2824,18 +2846,11 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, INIT_LIST_HEAD(&zmd->mblk_dirty_list); mutex_init(&zmd->map_lock); - atomic_set(&zmd->unmap_nr_rnd, 0); - INIT_LIST_HEAD(&zmd->unmap_rnd_list); - INIT_LIST_HEAD(&zmd->map_rnd_list); atomic_set(&zmd->unmap_nr_cache, 0); 
INIT_LIST_HEAD(&zmd->unmap_cache_list); INIT_LIST_HEAD(&zmd->map_cache_list); - atomic_set(&zmd->unmap_nr_seq, 0); - INIT_LIST_HEAD(&zmd->unmap_seq_list); - INIT_LIST_HEAD(&zmd->map_seq_list); - atomic_set(&zmd->nr_reserved_seq_zones, 0); INIT_LIST_HEAD(&zmd->reserved_seq_zones_list); @@ -2904,10 +2919,14 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, zmd->nr_data_zones, zmd->nr_chunks); dmz_zmd_debug(zmd, " %u cache zones (%u unmapped)", zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache)); - dmz_zmd_debug(zmd, " %u random zones (%u unmapped)", - zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); - dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)", - zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq)); + for (i = 0; i < zmd->nr_devs; i++) { + dmz_zmd_debug(zmd, " %u random zones (%u unmapped)", + dmz_nr_rnd_zones(zmd, i), + dmz_nr_unmap_rnd_zones(zmd, i)); + dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)", + dmz_nr_seq_zones(zmd, i), + dmz_nr_unmap_seq_zones(zmd, i)); + } dmz_zmd_debug(zmd, " %u reserved sequential data zones", zmd->nr_reserved_seq); dmz_zmd_debug(zmd, "Format:"); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 09843645248a..18edf1b9bf52 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -447,15 +447,14 @@ static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) { struct dmz_metadata *zmd = zrc->metadata; unsigned int nr_cache = dmz_nr_cache_zones(zmd); - unsigned int nr_rnd = dmz_nr_rnd_zones(zmd); unsigned int nr_unmap, nr_zones; if (nr_cache) { nr_zones = nr_cache; nr_unmap = dmz_nr_unmap_cache_zones(zmd); } else { - nr_zones = nr_rnd; - nr_unmap = dmz_nr_unmap_rnd_zones(zmd); + nr_zones = dmz_nr_rnd_zones(zmd, zrc->dev_idx); + nr_unmap = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx); } return nr_unmap * 100 / nr_zones; } @@ -467,7 +466,7 @@ static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap) { unsigned int nr_reclaim; - nr_reclaim = 
dmz_nr_rnd_zones(zrc->metadata); + nr_reclaim = dmz_nr_rnd_zones(zrc->metadata, zrc->dev_idx); if (dmz_nr_cache_zones(zrc->metadata)) { /* @@ -528,8 +527,8 @@ static void dmz_reclaim_work(struct work_struct *work) zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2); } - nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd); - nr_rnd = dmz_nr_rnd_zones(zmd); + nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx); + nr_rnd = dmz_nr_rnd_zones(zmd, zrc->dev_idx); DMDEBUG("(%s/%u): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", dmz_metadata_label(zmd), zrc->dev_idx, @@ -537,8 +536,8 @@ static void dmz_reclaim_work(struct work_struct *work) (dmz_target_idle(zrc) ? "Idle" : "Busy"), p_unmap, dmz_nr_unmap_cache_zones(zmd), dmz_nr_cache_zones(zmd), - dmz_nr_unmap_rnd_zones(zmd), - dmz_nr_rnd_zones(zmd)); + dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx), + dmz_nr_rnd_zones(zmd, zrc->dev_idx)); ret = dmz_do_reclaim(zrc); if (ret && ret != -EINTR) { diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 97d63d8e6c19..aa3d26d16441 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1075,17 +1075,30 @@ static void dmz_status(struct dm_target *ti, status_type_t type, ssize_t sz = 0; char buf[BDEVNAME_SIZE]; struct dmz_dev *dev; + int i; switch (type) { case STATUSTYPE_INFO: - DMEMIT("%u zones %u/%u cache %u/%u random %u/%u sequential", + DMEMIT("%u zones %u/%u cache", dmz_nr_zones(dmz->metadata), dmz_nr_unmap_cache_zones(dmz->metadata), - dmz_nr_cache_zones(dmz->metadata), - dmz_nr_unmap_rnd_zones(dmz->metadata), - dmz_nr_rnd_zones(dmz->metadata), - dmz_nr_unmap_seq_zones(dmz->metadata), - dmz_nr_seq_zones(dmz->metadata)); + dmz_nr_cache_zones(dmz->metadata)); + for (i = 0; i < DMZ_MAX_DEVS; i++) { + if (!dmz->ddev[i]) + continue; + /* + * For a multi-device setup the first device + * contains only cache zones. 
+ */ + if ((i == 0) && + (dmz_nr_cache_zones(dmz->metadata) > 0)) + continue; + DMEMIT(" %u/%u random %u/%u sequential", + dmz_nr_unmap_rnd_zones(dmz->metadata, i), + dmz_nr_rnd_zones(dmz->metadata, i), + dmz_nr_unmap_seq_zones(dmz->metadata, i), + dmz_nr_seq_zones(dmz->metadata, i)); + } break; case STATUSTYPE_TABLE: dev = &dmz->dev[0]; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 0cc3459f78ce..f2a760f62db5 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -67,6 +67,16 @@ struct dmz_dev { unsigned int flags; sector_t zone_nr_sectors; + + unsigned int nr_rnd; + atomic_t unmap_nr_rnd; + struct list_head unmap_rnd_list; + struct list_head map_rnd_list; + + unsigned int nr_seq; + atomic_t unmap_nr_seq; + struct list_head unmap_seq_list; + struct list_head map_seq_list; }; #define dmz_bio_chunk(zmd, bio) ((bio)->bi_iter.bi_sector >> \ @@ -213,10 +223,10 @@ void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd); -unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd); -unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd); -unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd); -unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx); +unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx); +unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx); +unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx); unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd); unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd); unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd); From 4dba12881f882b629774796bb8655f5b1415d803 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:52 +0200 Subject: 
[PATCH 385/427] dm zoned: support arbitrary number of devices Remove the hard-coded limit of two devices and support an unlimited number of additional zoned devices. Signed-off-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 15 ++++- drivers/md/dm-zoned-target.c | 104 +++++++++++++++++++-------------- 2 files changed, 74 insertions(+), 45 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index ce17bf3628c6..49bafc86aa9a 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1525,7 +1525,20 @@ static int dmz_init_zones(struct dmz_metadata *zmd) */ zmd->sb[0].zone = dmz_get(zmd, 0); - zoned_dev = &zmd->dev[1]; + for (i = 1; i < zmd->nr_devs; i++) { + zoned_dev = &zmd->dev[i]; + + ret = blkdev_report_zones(zoned_dev->bdev, 0, + BLK_ALL_ZONES, + dmz_init_zone, zoned_dev); + if (ret < 0) { + DMDEBUG("(%s): Failed to report zones, error %d", + zmd->devname, ret); + dmz_drop_zones(zmd); + return ret; + } + } + return 0; } /* diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index aa3d26d16441..fc30c78dffdb 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -13,8 +13,6 @@ #define DMZ_MIN_BIOS 8192 -#define DMZ_MAX_DEVS 2 - /* * Zone BIO context. */ @@ -40,10 +38,10 @@ struct dm_chunk_work { * Target descriptor. 
*/ struct dmz_target { - struct dm_dev *ddev[DMZ_MAX_DEVS]; + struct dm_dev **ddev; unsigned int nr_ddevs; - unsigned long flags; + unsigned int flags; /* Zoned block device information */ struct dmz_dev *dev; @@ -764,7 +762,7 @@ static void dmz_put_zoned_device(struct dm_target *ti) struct dmz_target *dmz = ti->private; int i; - for (i = 0; i < DMZ_MAX_DEVS; i++) { + for (i = 0; i < dmz->nr_ddevs; i++) { if (dmz->ddev[i]) { dm_put_device(ti, dmz->ddev[i]); dmz->ddev[i] = NULL; @@ -777,21 +775,35 @@ static int dmz_fixup_devices(struct dm_target *ti) struct dmz_target *dmz = ti->private; struct dmz_dev *reg_dev, *zoned_dev; struct request_queue *q; + sector_t zone_nr_sectors = 0; + int i; /* - * When we have two devices, the first one must be a regular block - * device and the second a zoned block device. + * When we have more than on devices, the first one must be a + * regular block device and the others zoned block devices. */ - if (dmz->ddev[0] && dmz->ddev[1]) { + if (dmz->nr_ddevs > 1) { reg_dev = &dmz->dev[0]; if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) { ti->error = "Primary disk is not a regular device"; return -EINVAL; } - zoned_dev = &dmz->dev[1]; - if (zoned_dev->flags & DMZ_BDEV_REGULAR) { - ti->error = "Secondary disk is not a zoned device"; - return -EINVAL; + for (i = 1; i < dmz->nr_ddevs; i++) { + zoned_dev = &dmz->dev[i]; + if (zoned_dev->flags & DMZ_BDEV_REGULAR) { + ti->error = "Secondary disk is not a zoned device"; + return -EINVAL; + } + q = bdev_get_queue(zoned_dev->bdev); + if (zone_nr_sectors && + zone_nr_sectors != blk_queue_zone_sectors(q)) { + ti->error = "Zone nr sectors mismatch"; + return -EINVAL; + } + zone_nr_sectors = blk_queue_zone_sectors(q); + zoned_dev->zone_nr_sectors = zone_nr_sectors; + zoned_dev->nr_zones = + blkdev_nr_zones(zoned_dev->bdev->bd_disk); } } else { reg_dev = NULL; @@ -800,17 +812,24 @@ static int dmz_fixup_devices(struct dm_target *ti) ti->error = "Disk is not a zoned device"; return -EINVAL; } + q = 
bdev_get_queue(zoned_dev->bdev); + zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q); + zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk); } - q = bdev_get_queue(zoned_dev->bdev); - zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q); - zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk); if (reg_dev) { - reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors; + sector_t zone_offset; + + reg_dev->zone_nr_sectors = zone_nr_sectors; reg_dev->nr_zones = DIV_ROUND_UP_SECTOR_T(reg_dev->capacity, reg_dev->zone_nr_sectors); - zoned_dev->zone_offset = reg_dev->nr_zones; + reg_dev->zone_offset = 0; + zone_offset = reg_dev->nr_zones; + for (i = 1; i < dmz->nr_ddevs; i++) { + dmz->dev[i].zone_offset = zone_offset; + zone_offset += dmz->dev[i].nr_zones; + } } return 0; } @@ -824,7 +843,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) int ret, i; /* Check arguments */ - if (argc < 1 || argc > 2) { + if (argc < 1) { ti->error = "Invalid argument count"; return -EINVAL; } @@ -835,32 +854,31 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Unable to allocate the zoned target descriptor"; return -ENOMEM; } - dmz->dev = kcalloc(2, sizeof(struct dmz_dev), GFP_KERNEL); + dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL); if (!dmz->dev) { ti->error = "Unable to allocate the zoned device descriptors"; kfree(dmz); return -ENOMEM; } + dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL); + if (!dmz->ddev) { + ti->error = "Unable to allocate the dm device descriptors"; + ret = -ENOMEM; + goto err; + } dmz->nr_ddevs = argc; + ti->private = dmz; /* Get the target zoned block device */ - ret = dmz_get_zoned_device(ti, argv[0], 0, argc); - if (ret) - goto err; - - if (argc == 2) { - ret = dmz_get_zoned_device(ti, argv[1], 1, argc); - if (ret) { - dmz_put_zoned_device(ti); - goto err; - } + for (i = 0; i < argc; i++) { + ret = dmz_get_zoned_device(ti, argv[i], i, argc); + 
if (ret) + goto err_dev; } ret = dmz_fixup_devices(ti); - if (ret) { - dmz_put_zoned_device(ti); - goto err; - } + if (ret) + goto err_dev; /* Initialize metadata */ ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata, @@ -1056,13 +1074,13 @@ static int dmz_iterate_devices(struct dm_target *ti, struct dmz_target *dmz = ti->private; unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata); sector_t capacity; - int r; + int i, r; - capacity = dmz->dev[0].capacity & ~(zone_nr_sectors - 1); - r = fn(ti, dmz->ddev[0], 0, capacity, data); - if (!r && dmz->ddev[1]) { - capacity = dmz->dev[1].capacity & ~(zone_nr_sectors - 1); - r = fn(ti, dmz->ddev[1], 0, capacity, data); + for (i = 0; i < dmz->nr_ddevs; i++) { + capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1); + r = fn(ti, dmz->ddev[i], 0, capacity, data); + if (r) + break; } return r; } @@ -1083,9 +1101,7 @@ static void dmz_status(struct dm_target *ti, status_type_t type, dmz_nr_zones(dmz->metadata), dmz_nr_unmap_cache_zones(dmz->metadata), dmz_nr_cache_zones(dmz->metadata)); - for (i = 0; i < DMZ_MAX_DEVS; i++) { - if (!dmz->ddev[i]) - continue; + for (i = 0; i < dmz->nr_ddevs; i++) { /* * For a multi-device setup the first device * contains only cache zones. 
@@ -1104,8 +1120,8 @@ static void dmz_status(struct dm_target *ti, status_type_t type, dev = &dmz->dev[0]; format_dev_t(buf, dev->bdev->bd_dev); DMEMIT("%s", buf); - if (dmz->dev[1].bdev) { - dev = &dmz->dev[1]; + for (i = 1; i < dmz->nr_ddevs; i++) { + dev = &dmz->dev[i]; format_dev_t(buf, dev->bdev->bd_dev); DMEMIT(" %s", buf); } From 22c1ef66c4cbb82baf81a28abedfe8ad20ad9126 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:53 +0200 Subject: [PATCH 386/427] dm zoned: allocate zone by device index When allocating a zone, pass in an indicator on which device the zone should be allocated; this increases performance for a multi-device setup because reclaim will now allocate zones on the device for which reclaim is running. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 17 +++++++++++------ drivers/md/dm-zoned-reclaim.c | 3 ++- drivers/md/dm-zoned.h | 3 ++- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 49bafc86aa9a..1a6cdab3e4ef 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -2050,7 +2050,7 @@ again: goto out; /* Allocate a random zone */ - dzone = dmz_alloc_zone(zmd, alloc_flags); + dzone = dmz_alloc_zone(zmd, 0, alloc_flags); if (!dzone) { if (dmz_dev_is_dying(zmd)) { dzone = ERR_PTR(-EIO); @@ -2156,7 +2156,7 @@ again: goto out; /* Allocate a random zone */ - bzone = dmz_alloc_zone(zmd, alloc_flags); + bzone = dmz_alloc_zone(zmd, 0, alloc_flags); if (!bzone) { if (dmz_dev_is_dying(zmd)) { bzone = ERR_PTR(-EIO); @@ -2187,11 +2187,12 @@ out: * Get an unmapped (free) zone. * This must be called with the mapping lock held. 
*/ -struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags) +struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx, + unsigned long flags) { struct list_head *list; struct dm_zone *zone; - unsigned int dev_idx = 0; + int i = 0; again: if (flags & DMZ_ALLOC_CACHE) @@ -2207,8 +2208,12 @@ again: */ if (!(flags & DMZ_ALLOC_RECLAIM)) return NULL; - if (dev_idx < zmd->nr_devs) { - dev_idx++; + /* + * Try to allocate from other devices + */ + if (i < zmd->nr_devs) { + dev_idx = (dev_idx + 1) % zmd->nr_devs; + i++; goto again; } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 18edf1b9bf52..5a04e34d17a9 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -288,7 +288,8 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) /* Get a free random or sequential zone */ dmz_lock_map(zmd); again: - szone = dmz_alloc_zone(zmd, alloc_flags | DMZ_ALLOC_RECLAIM); + szone = dmz_alloc_zone(zmd, zrc->dev_idx, + alloc_flags | DMZ_ALLOC_RECLAIM); if (!szone && alloc_flags == DMZ_ALLOC_SEQ && dmz_nr_cache_zones(zmd)) { alloc_flags = DMZ_ALLOC_RND; goto again; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index f2a760f62db5..ec020bb1caf7 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -214,7 +214,8 @@ bool dmz_dev_is_dying(struct dmz_metadata *zmd); #define DMZ_ALLOC_SEQ 0x04 #define DMZ_ALLOC_RECLAIM 0x10 -struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags); +struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, + unsigned int dev_idx, unsigned long flags); void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone); void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone, From 69875d443bc3bb1b2e1f77fe3da5ad5c8c729aa2 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:54 +0200 Subject: [PATCH 387/427] dm zoned: select reclaim zone based on device index 
per-device reclaim should select zones on that device only. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 50 ++++++++++++++-------------------- drivers/md/dm-zoned-reclaim.c | 3 +- drivers/md/dm-zoned-target.c | 1 + drivers/md/dm-zoned.h | 5 +++- 4 files changed, 27 insertions(+), 32 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 1a6cdab3e4ef..0cb90799b8ce 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1938,7 +1938,7 @@ static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) * Select a cache or random write zone for reclaim. */ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, - bool idle) + unsigned int idx, bool idle) { struct dm_zone *dzone = NULL; struct dm_zone *zone; @@ -1948,24 +1948,17 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, if (zmd->nr_cache) { zone_list = &zmd->map_cache_list; /* Try to relaim random zones, too, when idle */ - if (idle && list_empty(zone_list)) { - int i; - - for (i = 1; i < zmd->nr_devs; i++) { - zone_list = &zmd->dev[i].map_rnd_list; - if (!list_empty(zone_list)) - break; - } - } - } else { - /* Otherwise the random zones are on the first disk */ - zone_list = &zmd->dev[0].map_rnd_list; - } + if (idle && list_empty(zone_list)) + zone_list = &zmd->dev[idx].map_rnd_list; + } else + zone_list = &zmd->dev[idx].map_rnd_list; list_for_each_entry(zone, zone_list, link) { - if (dmz_is_buf(zone)) + if (dmz_is_buf(zone)) { dzone = zone->bzone; - else + if (dzone->dev->dev_idx != idx) + continue; + } else dzone = zone; if (dmz_lock_zone_reclaim(dzone)) return dzone; @@ -1977,20 +1970,16 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, /* * Select a buffered sequential zone for reclaim. 
*/ -static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) +static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd, + unsigned int idx) { struct dm_zone *zone; - int i; - for (i = 0; i < zmd->nr_devs; i++) { - struct dmz_dev *dev = &zmd->dev[i]; - - list_for_each_entry(zone, &dev->map_seq_list, link) { - if (!zone->bzone) - continue; - if (dmz_lock_zone_reclaim(zone)) - return zone; - } + list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) { + if (!zone->bzone) + continue; + if (dmz_lock_zone_reclaim(zone)) + return zone; } return NULL; @@ -1999,7 +1988,8 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) /* * Select a zone for reclaim. */ -struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle) +struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, + unsigned int dev_idx, bool idle) { struct dm_zone *zone; @@ -2013,9 +2003,9 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle) */ dmz_lock_map(zmd); if (list_empty(&zmd->reserved_seq_zones_list)) - zone = dmz_get_seq_zone_for_reclaim(zmd); + zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx); else - zone = dmz_get_rnd_zone_for_reclaim(zmd, idle); + zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle); dmz_unlock_map(zmd); return zone; diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 5a04e34d17a9..2261b4dd60b7 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -370,7 +370,8 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) int ret; /* Get a data zone */ - dzone = dmz_get_zone_for_reclaim(zmd, dmz_target_idle(zrc)); + dzone = dmz_get_zone_for_reclaim(zmd, zrc->dev_idx, + dmz_target_idle(zrc)); if (!dzone) { DMDEBUG("(%s/%u): No zone found to reclaim", dmz_metadata_label(zmd), zrc->dev_idx); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index fc30c78dffdb..a907a9446c0b 100644 --- 
a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -738,6 +738,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path, dev = &dmz->dev[idx]; } dev->bdev = bdev; + dev->dev_idx = idx; (void)bdevname(dev->bdev, dev->name); dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index ec020bb1caf7..22f11440b423 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -61,6 +61,8 @@ struct dmz_dev { sector_t capacity; + unsigned int dev_idx; + unsigned int nr_zones; unsigned int zone_offset; @@ -243,7 +245,8 @@ static inline void dmz_activate_zone(struct dm_zone *zone) int dmz_lock_zone_reclaim(struct dm_zone *zone); void dmz_unlock_zone_reclaim(struct dm_zone *zone); -struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle); +struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, + unsigned int dev_idx, bool idle); struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op); From 2094045fe5b5dda98c4ec6cb1ac7b12ba4382856 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:55 +0200 Subject: [PATCH 388/427] dm zoned: prefer full zones for reclaim Prefer full zones when selecting the next zone for reclaim. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 0cb90799b8ce..59a34895f5a8 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1941,7 +1941,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, unsigned int idx, bool idle) { struct dm_zone *dzone = NULL; - struct dm_zone *zone; + struct dm_zone *zone, *last = NULL; struct list_head *zone_list; /* If we have cache zones select from the cache zone list */ @@ -1958,6 +1958,13 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, dzone = zone->bzone; if (dzone->dev->dev_idx != idx) continue; + if (!last) { + last = dzone; + continue; + } + if (last->weight < dzone->weight) + continue; + dzone = last; } else dzone = zone; if (dmz_lock_zone_reclaim(dzone)) From 27d49ac1dd751897506ba51df7226fc0ce7ef681 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:56 +0200 Subject: [PATCH 389/427] dm zoned: check superblock location When specifying several devices the superblock location must be checked to ensure the devices are specified in the correct order. 
Signed-off-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 59a34895f5a8..314ce31a2c43 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -997,7 +997,7 @@ static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb, struct dmz_dev *dev = dsb->dev; unsigned int nr_meta_zones, nr_data_zones; u32 crc, stored_crc; - u64 gen; + u64 gen, sb_block; if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", @@ -1026,6 +1026,14 @@ static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb, return -ENXIO; } + sb_block = le64_to_cpu(sb->sb_block); + if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift ) { + dmz_dev_err(dev, "Invalid superblock position " + "(is %llu expected %llu)", + sb_block, + (u64)dsb->zone->id << zmd->zone_nr_blocks_shift); + return -EINVAL; + } if (zmd->sb_version > 1) { uuid_t sb_uuid; From a862e4e2154289fc12aa9e70f33614d9c70f3be4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 26 May 2020 16:06:56 -0400 Subject: [PATCH 390/427] dm mpath: simplify __must_push_back Remove micro-optimization that infers device is between presuspend and resume (was done purely to avoid call to dm_noflush_suspending, which isn't expensive anyway). Remove flags argument since they are no longer checked. And remove must_push_back_bio() since it was simply a call to __must_push_back(). 
Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 95f16d816585..4c34d037aa35 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -457,33 +457,15 @@ do { \ /* * Check whether bios must be queued in the device-mapper core rather * than here in the target. - * - * If MPATHF_QUEUE_IF_NO_PATH and MPATHF_SAVED_QUEUE_IF_NO_PATH hold - * the same value then we are not between multipath_presuspend() - * and multipath_resume() calls and we have no need to check - * for the DMF_NOFLUSH_SUSPENDING flag. */ -static bool __must_push_back(struct multipath *m, unsigned long flags) +static bool __must_push_back(struct multipath *m) { - return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) != - test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &flags)) && - dm_noflush_suspending(m->ti)); + return dm_noflush_suspending(m->ti); } -/* - * Following functions use READ_ONCE to get atomic access to - * all m->flags to avoid taking spinlock - */ static bool must_push_back_rq(struct multipath *m) { - unsigned long flags = READ_ONCE(m->flags); - return test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) || __must_push_back(m, flags); -} - -static bool must_push_back_bio(struct multipath *m) -{ - unsigned long flags = READ_ONCE(m->flags); - return __must_push_back(m, flags); + return test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m); } /* @@ -620,7 +602,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, return DM_MAPIO_SUBMITTED; if (!pgpath) { - if (must_push_back_bio(m)) + if (__must_push_back(m)) return DM_MAPIO_REQUEUE; dm_report_EIO(m); return DM_MAPIO_KILL; @@ -1642,7 +1624,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (must_push_back_bio(m)) { + if (__must_push_back(m)) { r = 
DM_ENDIO_REQUEUE; } else { dm_report_EIO(m); From 553ec94cb4b4937b48f81e27de33f71325d1a227 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 27 May 2020 16:32:51 -0400 Subject: [PATCH 391/427] dm mpath: restrict queue_if_no_path state machine Do not allow saving disabled queue_if_no_path if already saved as enabled; implies multiple suspends (which shouldn't ever happen). Log if this unlikely scenario is ever triggered. Also, only write MPATHF_SAVED_QUEUE_IF_NO_PATH during presuspend or if "fail_if_no_path" message. MPATHF_SAVED_QUEUE_IF_NO_PATH is no longer always modified, e.g.: even if queue_if_no_path()'s save_old_value argument wasn't set. This just implies a bit tighter control over the management of MPATHF_SAVED_QUEUE_IF_NO_PATH. Side-effect is multipath_resume() doesn't reset MPATHF_QUEUE_IF_NO_PATH unless MPATHF_SAVED_QUEUE_IF_NO_PATH was set (during presuspend); and at that time the MPATHF_SAVED_QUEUE_IF_NO_PATH bit gets cleared. So MPATHF_SAVED_QUEUE_IF_NO_PATH's use is much more narrow in scope. Last, but not least, do _not_ disable queue_if_no_path during noflush suspend. There is no need/benefit to saving off queue_if_no_path via MPATHF_SAVED_QUEUE_IF_NO_PATH and clearing MPATHF_QUEUE_IF_NO_PATH for noflush suspend -- by avoiding this needless queue_if_no_path flag churn there is less potential for MPATHF_QUEUE_IF_NO_PATH to get lost. Which avoids potential for IOs to be errored back up to userspace during DM multipath's handling of path failures. That said, this last change papers over a reported issue concerning request-based dm-multipath's interaction with blk-mq, relative to suspend and resume: multipath_endio is being called _before_ multipath_resume. This should never happen if DM suspend's blk_mq_quiesce_queue() + dm_wait_for_completion() is genuinely waiting for all inflight blk-mq requests to complete. 
Similarly: drivers/md/dm.c:__dm_resume() clearly calls dm_table_resume_targets() _before_ dm_start_queue()'s blk_mq_unquiesce_queue() is called. If the queue isn't even restarted until after multipath_resume(); the BIG question that still needs answering is: how can multipath_end_io beat multipath_resume in a race!? Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 4c34d037aa35..bc846cf7b0d8 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -695,12 +695,25 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, bool save_old_value) { unsigned long flags; + bool queue_if_no_path_bit, saved_queue_if_no_path_bit; spin_lock_irqsave(&m->lock, flags); - assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, - (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || - (!save_old_value && queue_if_no_path)); + + queue_if_no_path_bit = test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + saved_queue_if_no_path_bit = test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + + if (save_old_value) { + if (unlikely(!queue_if_no_path_bit && saved_queue_if_no_path_bit)) { + DMERR("%s: QIFNP disabled but saved as enabled, saving again loses state, not saving!", + dm_device_name(dm_table_get_md(m->ti->table))); + } else + assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path_bit); + } else if (!queue_if_no_path && saved_queue_if_no_path_bit) { + /* due to "fail_if_no_path" message, need to honor it. */ + clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + } assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path); + spin_unlock_irqrestore(&m->lock, flags); if (!queue_if_no_path) { @@ -1653,16 +1666,19 @@ done: } /* - * Suspend can't complete until all the I/O is processed so if - * the last path fails we must error any remaining I/O. 
- * Note that if the freeze_bdev fails while suspending, the - * queue_if_no_path state is lost - userspace should reset it. + * Suspend with flush can't complete until all the I/O is processed + * so if the last path fails we must error any remaining I/O. + * - Note that if the freeze_bdev fails while suspending, the + * queue_if_no_path state is lost - userspace should reset it. + * Otherwise, during noflush suspend, queue_if_no_path will not change. */ static void multipath_presuspend(struct dm_target *ti) { struct multipath *m = ti->private; - queue_if_no_path(m, false, true); + /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ + if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) + queue_if_no_path(m, false, true); } static void multipath_postsuspend(struct dm_target *ti) @@ -1683,8 +1699,10 @@ static void multipath_resume(struct dm_target *ti) unsigned long flags; spin_lock_irqsave(&m->lock, flags); - assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, - test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); + if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { + set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + } spin_unlock_irqrestore(&m->lock, flags); } From 4c3f48380fedbd714fc95958f503c1b5adf3ee6b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 29 May 2020 15:59:13 -0400 Subject: [PATCH 392/427] dm mpath: enhance queue_if_no_path debugging Add more DMDEBUG that shows arguments passed and caller, and another that shows state of related flags at end of queue_if_no_path(). Also add queue_if_no_path DMDEBUG to multipath_resume(). 
Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index bc846cf7b0d8..b17da3046611 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -692,10 +692,14 @@ static void process_queued_bios(struct work_struct *work) * If we run out of usable paths, should we queue I/O or error it? */ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, - bool save_old_value) + bool save_old_value, const char *caller) { unsigned long flags; bool queue_if_no_path_bit, saved_queue_if_no_path_bit; + const char *dm_dev_name = dm_device_name(dm_table_get_md(m->ti->table)); + + DMDEBUG("%s: %s caller=%s queue_if_no_path=%d save_old_value=%d", + dm_dev_name, __func__, caller, queue_if_no_path, save_old_value); spin_lock_irqsave(&m->lock, flags); @@ -705,7 +709,7 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, if (save_old_value) { if (unlikely(!queue_if_no_path_bit && saved_queue_if_no_path_bit)) { DMERR("%s: QIFNP disabled but saved as enabled, saving again loses state, not saving!", - dm_device_name(dm_table_get_md(m->ti->table))); + dm_dev_name); } else assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path_bit); } else if (!queue_if_no_path && saved_queue_if_no_path_bit) { @@ -714,6 +718,12 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, } assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path); + DMDEBUG("%s: after %s changes; QIFNP = %d; SQIFNP = %d; DNFS = %d", + dm_dev_name, __func__, + test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags), + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags), + dm_noflush_suspending(m->ti)); + spin_unlock_irqrestore(&m->lock, flags); if (!queue_if_no_path) { @@ -734,7 +744,7 @@ static void queue_if_no_path_timeout_work(struct timer_list *t) struct mapped_device *md = dm_table_get_md(m->ti->table); 
DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md)); - queue_if_no_path(m, false, false); + queue_if_no_path(m, false, false, __func__); } /* @@ -1074,7 +1084,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) argc--; if (!strcasecmp(arg_name, "queue_if_no_path")) { - r = queue_if_no_path(m, true, false); + r = queue_if_no_path(m, true, false, __func__); continue; } @@ -1678,7 +1688,7 @@ static void multipath_presuspend(struct dm_target *ti) /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) - queue_if_no_path(m, false, true); + queue_if_no_path(m, false, true, __func__); } static void multipath_postsuspend(struct dm_target *ti) @@ -1703,6 +1713,12 @@ static void multipath_resume(struct dm_target *ti) set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); } + + DMDEBUG("%s: %s finished; QIFNP = %d; SQIFNP = %d", + dm_device_name(dm_table_get_md(m->ti->table)), __func__, + test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags), + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); + spin_unlock_irqrestore(&m->lock, flags); } @@ -1862,13 +1878,13 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, if (argc == 1) { if (!strcasecmp(argv[0], "queue_if_no_path")) { - r = queue_if_no_path(m, true, false); + r = queue_if_no_path(m, true, false, __func__); spin_lock_irqsave(&m->lock, flags); enable_nopath_timeout(m); spin_unlock_irqrestore(&m->lock, flags); goto out; } else if (!strcasecmp(argv[0], "fail_if_no_path")) { - r = queue_if_no_path(m, false, false); + r = queue_if_no_path(m, false, false, __func__); disable_nopath_timeout(m); goto out; } From 04867370ec40b708bd335df641182ddab0c59685 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 4 Jun 2020 10:44:34 -0400 Subject: [PATCH 393/427] dm mpath: add DM device name to Failing/Reinstating path log messages 
When there are many DM multipath devices it really helps to have additional context for which DM device a failed or reinstated path is part of. Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index b17da3046611..78cff42d987e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1285,7 +1285,9 @@ static int fail_path(struct pgpath *pgpath) if (!pgpath->is_active) goto out; - DMWARN("Failing path %s.", pgpath->path.dev->name); + DMWARN("%s: Failing path %s.", + dm_device_name(dm_table_get_md(m->ti->table)), + pgpath->path.dev->name); pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); pgpath->is_active = false; @@ -1324,7 +1326,9 @@ static int reinstate_path(struct pgpath *pgpath) if (pgpath->is_active) goto out; - DMWARN("Reinstating path %s.", pgpath->path.dev->name); + DMWARN("%s: Reinstating path %s.", + dm_device_name(dm_table_get_md(m->ti->table)), + pgpath->path.dev->name); r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); if (r) From 64611a15ca9da91ff532982429c44686f4593b5f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 4 Jun 2020 12:01:26 -0700 Subject: [PATCH 394/427] dm crypt: avoid truncating the logical block size queue_limits::logical_block_size got changed from unsigned short to unsigned int, but it was forgotten to update crypt_io_hints() to use the new type. Fix it. 
Fixes: ad6bf88a6c19 ("block: fix an integer overflow in logical block size") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 71c651465bdd..000ddfab5ba0 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3312,7 +3312,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->max_segment_size = PAGE_SIZE; limits->logical_block_size = - max_t(unsigned short, limits->logical_block_size, cc->sector_size); + max_t(unsigned, limits->logical_block_size, cc->sector_size); limits->physical_block_size = max_t(unsigned, limits->physical_block_size, cc->sector_size); limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size); From f5152f4ded3ce6d332d5e4f9d7e325c3b81cae1b Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Sat, 6 Jun 2020 11:35:50 +0200 Subject: [PATCH 395/427] firmware/dmi: Report DMI Bios & EC firmware release Some vendors like HPe or Dell, encode the release version of their BIOS in the "System BIOS {Major|Minor} Release" fields of Type 0. This information is used to know which bios release actually runs. It could be used for some quirks, debugging sessions or inventory tasks. A typical output for a Dell system running the 65.27 bios is : [root@t1700 ~]# cat /sys/devices/virtual/dmi/id/bios_release 65.27 [root@t1700 ~]# Servers that have a BMC encode the release version of their firmware in the "Embedded Controller Firmware {Major|Minor} Release" fields of Type 0. This information is used to know which BMC release actually runs. It could be used for some quirks, debugging sessions or inventory tasks. 
A typical output for a Dell system running the 3.75 bmc release is : [root@t1700 ~]# cat /sys/devices/virtual/dmi/id/ec_firmware_release 3.75 [root@t1700 ~]# Signed-off-by: Erwan Velu Signed-off-by: Jean Delvare --- drivers/firmware/dmi-id.c | 6 ++++++ drivers/firmware/dmi_scan.c | 30 ++++++++++++++++++++++++++++++ include/linux/mod_devicetable.h | 2 ++ scripts/mod/file2alias.c | 2 ++ 4 files changed, 40 insertions(+) diff --git a/drivers/firmware/dmi-id.c b/drivers/firmware/dmi-id.c index ff39f64f2aae..86d71b0212b1 100644 --- a/drivers/firmware/dmi-id.c +++ b/drivers/firmware/dmi-id.c @@ -42,6 +42,8 @@ DEFINE_DMI_ATTR_WITH_SHOW(bios_vendor, 0444, DMI_BIOS_VENDOR); DEFINE_DMI_ATTR_WITH_SHOW(bios_version, 0444, DMI_BIOS_VERSION); DEFINE_DMI_ATTR_WITH_SHOW(bios_date, 0444, DMI_BIOS_DATE); DEFINE_DMI_ATTR_WITH_SHOW(sys_vendor, 0444, DMI_SYS_VENDOR); +DEFINE_DMI_ATTR_WITH_SHOW(bios_release, 0444, DMI_BIOS_RELEASE); +DEFINE_DMI_ATTR_WITH_SHOW(ec_firmware_release, 0444, DMI_EC_FIRMWARE_RELEASE); DEFINE_DMI_ATTR_WITH_SHOW(product_name, 0444, DMI_PRODUCT_NAME); DEFINE_DMI_ATTR_WITH_SHOW(product_version, 0444, DMI_PRODUCT_VERSION); DEFINE_DMI_ATTR_WITH_SHOW(product_serial, 0400, DMI_PRODUCT_SERIAL); @@ -78,6 +80,8 @@ static ssize_t get_modalias(char *buffer, size_t buffer_size) { "bvn", DMI_BIOS_VENDOR }, { "bvr", DMI_BIOS_VERSION }, { "bd", DMI_BIOS_DATE }, + { "br", DMI_BIOS_RELEASE }, + { "efr", DMI_EC_FIRMWARE_RELEASE }, { "svn", DMI_SYS_VENDOR }, { "pn", DMI_PRODUCT_NAME }, { "pvr", DMI_PRODUCT_VERSION }, @@ -187,6 +191,8 @@ static void __init dmi_id_init_attr_table(void) ADD_DMI_ATTR(bios_vendor, DMI_BIOS_VENDOR); ADD_DMI_ATTR(bios_version, DMI_BIOS_VERSION); ADD_DMI_ATTR(bios_date, DMI_BIOS_DATE); + ADD_DMI_ATTR(bios_release, DMI_BIOS_RELEASE); + ADD_DMI_ATTR(ec_firmware_release, DMI_EC_FIRMWARE_RELEASE); ADD_DMI_ATTR(sys_vendor, DMI_SYS_VENDOR); ADD_DMI_ATTR(product_name, DMI_PRODUCT_NAME); ADD_DMI_ATTR(product_version, DMI_PRODUCT_VERSION); diff --git 
a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index f59163cb7cba..5066d1f1d687 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -186,6 +186,34 @@ static void __init dmi_save_ident(const struct dmi_header *dm, int slot, dmi_ident[slot] = p; } +static void __init dmi_save_release(const struct dmi_header *dm, int slot, + int index) +{ + const u8 *minor, *major; + char *s; + + /* If the table doesn't have the field, let's return */ + if (dmi_ident[slot] || dm->length < index) + return; + + minor = (u8 *) dm + index; + major = (u8 *) dm + index - 1; + + /* As per the spec, if the system doesn't support this field, + * the value is FF + */ + if (*major == 0xFF && *minor == 0xFF) + return; + + s = dmi_alloc(8); + if (!s) + return; + + sprintf(s, "%u.%u", *major, *minor); + + dmi_ident[slot] = s; +} + static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int index) { @@ -444,6 +472,8 @@ static void __init dmi_decode(const struct dmi_header *dm, void *dummy) dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); dmi_save_ident(dm, DMI_BIOS_VERSION, 5); dmi_save_ident(dm, DMI_BIOS_DATE, 8); + dmi_save_release(dm, DMI_BIOS_RELEASE, 21); + dmi_save_release(dm, DMI_EC_FIRMWARE_RELEASE, 23); break; case 1: /* System Information */ dmi_save_ident(dm, DMI_SYS_VENDOR, 4); diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4c2ddd0941a7..4b3d0a4945df 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -532,6 +532,8 @@ enum dmi_field { DMI_BIOS_VENDOR, DMI_BIOS_VERSION, DMI_BIOS_DATE, + DMI_BIOS_RELEASE, + DMI_EC_FIRMWARE_RELEASE, DMI_SYS_VENDOR, DMI_PRODUCT_NAME, DMI_PRODUCT_VERSION, diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 02d5d79da284..9599e2a3f1e6 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -954,6 +954,8 @@ static const struct dmifield { { "bvn", DMI_BIOS_VENDOR }, { "bvr", DMI_BIOS_VERSION }, { "bd", 
DMI_BIOS_DATE }, + { "br", DMI_BIOS_RELEASE }, + { "efr", DMI_EC_FIRMWARE_RELEASE }, { "svn", DMI_SYS_VENDOR }, { "pn", DMI_PRODUCT_NAME }, { "pvr", DMI_PRODUCT_VERSION }, From 52c3416db00d970c91a6992ab6e5ff48e077ad29 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:05 +0900 Subject: [PATCH 396/427] modpost: track if the symbol origin is a dump file or ELF object The meaning of sym->kernel is obscure; it is set for in-kernel symbols loaded from Module.symvers. This happens only when we are building external modules, and it is used to determine whether to dump symbols to $(KBUILD_EXTMOD)/Module.symvers. It is clearer to remember whether the symbol or module came from a dump file or ELF object. This changes the KBUILD_EXTRA_SYMBOLS behavior. Previously, symbols loaded from KBUILD_EXTRA_SYMBOLS were accumulated into the current $(KBUILD_EXTMOD)/Module.symvers. Going forward, they will be only used to check symbol references, but not dumped into the current $(KBUILD_EXTMOD)/Module.symvers. I believe this makes more sense. sym->vmlinux will have no user. Remove it too.
Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 15 +++++---------- scripts/mod/modpost.h | 1 + 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 5224a02edbf2..60f35b89cea2 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -161,9 +161,6 @@ struct symbol { int crc_valid; char *namespace; unsigned int weak:1; - unsigned int vmlinux:1; /* 1 if symbol is defined in vmlinux */ - unsigned int kernel:1; /* 1 if symbol is from kernel - * (only for external modules) **/ unsigned int is_static:1; /* 1 if symbol is not global */ enum export export; /* Type of export */ char name[]; @@ -398,8 +395,6 @@ static struct symbol *sym_add_exported(const char *name, struct module *mod, } s->module = mod; - s->vmlinux = is_vmlinux(mod->name); - s->kernel = 0; s->export = export; return s; } @@ -2427,7 +2422,7 @@ static void write_if_changed(struct buffer *b, const char *fname) /* parse Module.symvers file. line format: * 0x12345678symbolmoduleexportnamespace **/ -static void read_dump(const char *fname, unsigned int kernel) +static void read_dump(const char *fname) { unsigned long size, pos = 0; void *file = grab_file(fname, &size); @@ -2465,9 +2460,9 @@ static void read_dump(const char *fname, unsigned int kernel) have_vmlinux = 1; mod = new_module(modname); mod->skip = 1; + mod->from_dump = 1; } s = sym_add_exported(symname, mod, export_no(export)); - s->kernel = kernel; s->is_static = 0; sym_set_crc(symname, crc); sym_update_namespace(symname, namespace); @@ -2487,7 +2482,7 @@ static int dump_sym(struct symbol *sym) { if (!external_module) return 1; - if (sym->vmlinux || sym->kernel) + if (sym->module->from_dump) return 0; return 1; } @@ -2606,11 +2601,11 @@ int main(int argc, char **argv) } if (kernel_read) - read_dump(kernel_read, 1); + read_dump(kernel_read); while (extsym_start) { struct ext_sym_list *tmp; - read_dump(extsym_start->file, 0); + read_dump(extsym_start->file); tmp = 
extsym_start->next; free(extsym_start); extsym_start = tmp; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 39f6c29fb568..933a88c733bc 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -119,6 +119,7 @@ struct module { const char *name; int gpl_compatible; struct symbol *unres; + int from_dump; /* 1 if module was loaded from *.symvers */ int seen; int skip; int has_init; From ce2ddd6d6ab3b343837d5c8e17538a5f67fa400e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:06 +0900 Subject: [PATCH 397/427] modpost: allow to pass -i option multiple times to remove -e option Now that there is no difference between -i and -e, they can be unified. Make modpost accept the -i option multiple times, then remove -e. I will reuse -e for a different purpose. Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 2 +- scripts/mod/modpost.c | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 3334f100a490..7e07adab929c 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -50,7 +50,7 @@ MODPOST = scripts/mod/modpost \ $(if $(CONFIG_MODVERSIONS),-m) \ $(if $(CONFIG_MODULE_SRCVERSION_ALL),-a) \ $(if $(KBUILD_EXTMOD),-i,-o) $(kernelsymfile) \ - $(if $(KBUILD_EXTMOD),$(addprefix -e ,$(KBUILD_EXTRA_SYMBOLS))) \ + $(if $(KBUILD_EXTMOD),$(addprefix -i ,$(KBUILD_EXTRA_SYMBOLS))) \ $(if $(KBUILD_EXTMOD),-o $(modulesymfile)) \ $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ $(if $(KBUILD_MODPOST_WARN),-w) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 60f35b89cea2..28d8f5377c62 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2544,7 +2544,6 @@ int main(int argc, char **argv) { struct module *mod; struct buffer buf = { }; - char *kernel_read = NULL; char *missing_namespace_deps = NULL; char *dump_write = NULL, *files_source = NULL; int opt; @@ -2553,13 +2552,9 @@ int main(int argc, char **argv) 
struct ext_sym_list *extsym_start = NULL; struct ext_sym_list **extsym_iter = &extsym_start; - while ((opt = getopt(argc, argv, "i:e:mnsT:o:awENd:")) != -1) { + while ((opt = getopt(argc, argv, "i:mnsT:o:awENd:")) != -1) { switch (opt) { case 'i': - kernel_read = optarg; - external_module = 1; - break; - case 'e': external_module = 1; *extsym_iter = NOFAIL(calloc(1, sizeof(**extsym_iter))); (*extsym_iter)->file = optarg; @@ -2600,8 +2595,6 @@ int main(int argc, char **argv) } } - if (kernel_read) - read_dump(kernel_read); while (extsym_start) { struct ext_sym_list *tmp; From 7924799ed2ddaa393c2ef0c4cd13a81d122bffde Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:07 +0900 Subject: [PATCH 398/427] modpost: rename ext_sym_list to dump_list The -i option is used to include Module.symvers as well as files from $(KBUILD_EXTRA_SYMBOLS). Make the struct and variable names more generic. Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 28d8f5377c62..b8e521f50b2d 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2535,8 +2535,8 @@ static void write_namespace_deps_files(const char *fname) free(ns_deps_buf.p); } -struct ext_sym_list { - struct ext_sym_list *next; +struct dump_list { + struct dump_list *next; const char *file; }; @@ -2549,16 +2549,17 @@ int main(int argc, char **argv) int opt; int err; int n; - struct ext_sym_list *extsym_start = NULL; - struct ext_sym_list **extsym_iter = &extsym_start; + struct dump_list *dump_read_start = NULL; + struct dump_list **dump_read_iter = &dump_read_start; while ((opt = getopt(argc, argv, "i:mnsT:o:awENd:")) != -1) { switch (opt) { case 'i': external_module = 1; - *extsym_iter = NOFAIL(calloc(1, sizeof(**extsym_iter))); - (*extsym_iter)->file = optarg; - extsym_iter = &(*extsym_iter)->next; + *dump_read_iter = + NOFAIL(calloc(1, 
sizeof(**dump_read_iter))); + (*dump_read_iter)->file = optarg; + dump_read_iter = &(*dump_read_iter)->next; break; case 'm': modversions = 1; @@ -2595,13 +2596,13 @@ int main(int argc, char **argv) } } - while (extsym_start) { - struct ext_sym_list *tmp; + while (dump_read_start) { + struct dump_list *tmp; - read_dump(extsym_start->file); - tmp = extsym_start->next; - free(extsym_start); - extsym_start = tmp; + read_dump(dump_read_start->file); + tmp = dump_read_start->next; + free(dump_read_start); + dump_read_start = tmp; } while (optind < argc) From e3fb4df7fe4e8636de32a1e4ff5ebc75257f5570 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:08 +0900 Subject: [PATCH 399/427] modpost: re-add -e to set external_module flag Previously, the -i option had two functions; load a symbol dump file, and set the external_module flag. I want to assign a dedicated option for each of them. Going forward, the -i is used to load a symbol dump file, and the -e to set the external_module flag. With this, we will be able to use -i for loading in-kernel symbols. 
Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 4 ++++ scripts/mod/modpost.c | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 7e07adab929c..4d79afe997ad 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -79,6 +79,10 @@ src := $(obj) # Include the module's Makefile to find KBUILD_EXTRA_SYMBOLS include $(if $(wildcard $(KBUILD_EXTMOD)/Kbuild), \ $(KBUILD_EXTMOD)/Kbuild, $(KBUILD_EXTMOD)/Makefile) + +# modpost option for external modules +MODPOST += -e + endif # modpost options for modules (both in-kernel and external) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index b8e521f50b2d..4a2f27d97bf1 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2552,10 +2552,12 @@ int main(int argc, char **argv) struct dump_list *dump_read_start = NULL; struct dump_list **dump_read_iter = &dump_read_start; - while ((opt = getopt(argc, argv, "i:mnsT:o:awENd:")) != -1) { + while ((opt = getopt(argc, argv, "ei:mnsT:o:awENd:")) != -1) { switch (opt) { - case 'i': + case 'e': external_module = 1; + break; + case 'i': *dump_read_iter = NOFAIL(calloc(1, sizeof(**dump_read_iter))); (*dump_read_iter)->file = optarg; From bcfedae7d92886597e581ec32dfbf698cbccb4a1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:09 +0900 Subject: [PATCH 400/427] modpost: print symbol dump file as the build target in short log The symbol dump *.symvers is the output of modpost. Print it in the short log. 
Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 4d79afe997ad..e766e134b0f3 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -44,25 +44,26 @@ include include/config/auto.conf include scripts/Kbuild.include kernelsymfile := $(objtree)/Module.symvers -modulesymfile := $(KBUILD_EXTMOD)/Module.symvers MODPOST = scripts/mod/modpost \ $(if $(CONFIG_MODVERSIONS),-m) \ $(if $(CONFIG_MODULE_SRCVERSION_ALL),-a) \ - $(if $(KBUILD_EXTMOD),-i,-o) $(kernelsymfile) \ + $(if $(KBUILD_EXTMOD),-i $(kernelsymfile)) \ $(if $(KBUILD_EXTMOD),$(addprefix -i ,$(KBUILD_EXTRA_SYMBOLS))) \ - $(if $(KBUILD_EXTMOD),-o $(modulesymfile)) \ $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ - $(if $(KBUILD_MODPOST_WARN),-w) + $(if $(KBUILD_MODPOST_WARN),-w) \ + -o $@ ifdef MODPOST_VMLINUX -quiet_cmd_modpost = MODPOST vmlinux.o - cmd_modpost = $(MODPOST) vmlinux.o +quiet_cmd_modpost = MODPOST $@ + cmd_modpost = $(MODPOST) $< -__modpost: +Module.symvers: vmlinux.o $(call cmd,modpost) +__modpost: Module.symvers + else MODPOST += -s \ @@ -70,6 +71,8 @@ MODPOST += -s \ ifeq ($(KBUILD_EXTMOD),) MODPOST += $(wildcard vmlinux) +output-symdump := Module.symvers + else # set src + obj - they may be used in the modules's Makefile @@ -83,6 +86,8 @@ include $(if $(wildcard $(KBUILD_EXTMOD)/Kbuild), \ # modpost option for external modules MODPOST += -e +output-symdump := $(KBUILD_EXTMOD)/Module.symvers + endif # modpost options for modules (both in-kernel and external) @@ -94,20 +99,22 @@ ifneq ($(findstring i,$(filter-out --%,$(MAKEFLAGS))),) MODPOST += -n endif -# find all modules listed in modules.order -modules := $(sort $(shell cat $(MODORDER))) - -# Read out modules.order instead of expanding $(modules) to pass in modpost. +# Read out modules.order to pass in modpost. 
# Otherwise, allmodconfig would fail with "Argument list too long". -quiet_cmd_modpost = MODPOST $(words $(modules)) modules +quiet_cmd_modpost = MODPOST $@ cmd_modpost = sed 's/ko$$/o/' $(MODORDER) | $(MODPOST) -T - -__modpost: +$(output-symdump): FORCE $(call cmd,modpost) + +__modpost: $(output-symdump) ifneq ($(KBUILD_MODPOST_NOFINAL),1) $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modfinal endif +PHONY += FORCE +FORCE: + endif .PHONY: $(PHONY) From f1005b30ade716eb9286613aeb1d33b5c7852a91 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:10 +0900 Subject: [PATCH 401/427] modpost: refactor -i option calculation Prepare to use -i for in-tree modpost as well. Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index e766e134b0f3..79e850c8ce01 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -43,13 +43,9 @@ __modpost: include include/config/auto.conf include scripts/Kbuild.include -kernelsymfile := $(objtree)/Module.symvers - MODPOST = scripts/mod/modpost \ $(if $(CONFIG_MODVERSIONS),-m) \ $(if $(CONFIG_MODULE_SRCVERSION_ALL),-a) \ - $(if $(KBUILD_EXTMOD),-i $(kernelsymfile)) \ - $(if $(KBUILD_EXTMOD),$(addprefix -i ,$(KBUILD_EXTRA_SYMBOLS))) \ $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ $(if $(KBUILD_MODPOST_WARN),-w) \ -o $@ @@ -86,12 +82,14 @@ include $(if $(wildcard $(KBUILD_EXTMOD)/Kbuild), \ # modpost option for external modules MODPOST += -e +input-symdump := Module.symvers $(KBUILD_EXTRA_SYMBOLS) output-symdump := $(KBUILD_EXTMOD)/Module.symvers endif # modpost options for modules (both in-kernel and external) MODPOST += \ + $(addprefix -i ,$(input-symdump)) \ $(if $(CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS)$(KBUILD_NSDEPS),-N) # 'make -i -k' ignores compile errors, and builds as many modules as possible. 
From 269a535ca931b754a40dda3ab60514e68773c759 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:11 +0900 Subject: [PATCH 402/427] modpost: generate vmlinux.symvers and reuse it for the second modpost The full build runs modpost twice, first for vmlinux.o and second for modules. The first pass dumps all the vmlinux symbols into Module.symvers, but the second pass parses vmlinux again instead of reusing the dump file, presumably because it needs to avoid accumulating stale symbols. Loading symbol info from a dump file is faster than parsing an ELF object. Besides, modpost deals with various issues to parse vmlinux in the second pass. A solution is to make the first pass dump symbols into a separate file, vmlinux.symvers. The second pass reads it, and parses module .o files. The merged symbol information is dumped into Module.symvers in the same way as before. This makes further modpost cleanups possible. Also, it fixes the problem of 'make vmlinux', which previously overwrote Module.symvers, throwing away module symbols. I slightly touched scripts/link-vmlinux.sh so that vmlinux is re-linked when you cross this commit. Otherwise, vmlinux.symvers would not be generated. 
Signed-off-by: Masahiro Yamada --- .gitignore | 1 + Documentation/dontdiff | 1 + Makefile | 2 +- scripts/Makefile.modpost | 7 ++++--- scripts/link-vmlinux.sh | 2 -- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 2258e906f01c..87b9dd8a163b 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,7 @@ modules.order /linux /vmlinux /vmlinux.32 +/vmlinux.symvers /vmlinux-gdb.py /vmlinuz /System.map diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 72fc2e9e2b63..ef9519c32c55 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -251,6 +251,7 @@ vmlinux-* vmlinux.aout vmlinux.bin.all vmlinux.lds +vmlinux.symvers vmlinuz voffset.h vsyscall.lds diff --git a/Makefile b/Makefile index b0bbf8453b66..9768140f8d5a 100644 --- a/Makefile +++ b/Makefile @@ -1416,7 +1416,7 @@ endif # CONFIG_MODULES # make distclean Remove editor backup files, patch leftover files and the like # Directories & files removed with 'make clean' -CLEAN_FILES += include/ksym \ +CLEAN_FILES += include/ksym vmlinux.symvers \ modules.builtin modules.builtin.modinfo modules.nsdeps # Directories & files removed with 'make mrproper' diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 79e850c8ce01..896c799911c5 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -55,10 +55,10 @@ ifdef MODPOST_VMLINUX quiet_cmd_modpost = MODPOST $@ cmd_modpost = $(MODPOST) $< -Module.symvers: vmlinux.o +vmlinux.symvers: vmlinux.o $(call cmd,modpost) -__modpost: Module.symvers +__modpost: vmlinux.symvers else @@ -66,7 +66,8 @@ MODPOST += -s \ $(if $(KBUILD_NSDEPS),-d $(MODULES_NSDEPS)) ifeq ($(KBUILD_EXTMOD),) -MODPOST += $(wildcard vmlinux) + +input-symdump := vmlinux.symvers output-symdump := Module.symvers else diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index d09ab4afbda4..d5af6be50b50 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -218,8 +218,6 @@ on_signals() } trap 
on_signals HUP INT QUIT TERM -# -# # Use "make V=1" to debug this script case "${KBUILD_VERBOSE}" in *1*) From 436b2ac603d58504f38041a0cd8adb5aeace992b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:12 +0900 Subject: [PATCH 403/427] modpost: invoke modpost only when input files are updated Currently, the second pass of modpost is always invoked when you run 'make' or 'make modules' even if none of modules is changed. Use if_changed to invoke it only when it is necessary. Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 20 ++++++++++++++++---- scripts/mod/modpost.c | 32 +++++++++++++++++++++----------- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 896c799911c5..f29a02196b72 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -90,7 +90,7 @@ endif # modpost options for modules (both in-kernel and external) MODPOST += \ - $(addprefix -i ,$(input-symdump)) \ + $(addprefix -i ,$(wildcard $(input-symdump))) \ $(if $(CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS)$(KBUILD_NSDEPS),-N) # 'make -i -k' ignores compile errors, and builds as many modules as possible. @@ -98,13 +98,18 @@ ifneq ($(findstring i,$(filter-out --%,$(MAKEFLAGS))),) MODPOST += -n endif +$(input-symdump): + @: + # Read out modules.order to pass in modpost. # Otherwise, allmodconfig would fail with "Argument list too long". 
quiet_cmd_modpost = MODPOST $@ - cmd_modpost = sed 's/ko$$/o/' $(MODORDER) | $(MODPOST) -T - + cmd_modpost = sed 's/ko$$/o/' $< | $(MODPOST) -T - -$(output-symdump): FORCE - $(call cmd,modpost) +$(output-symdump): $(MODORDER) $(input-symdump) FORCE + $(call if_changed,modpost) + +targets += $(output-symdump) __modpost: $(output-symdump) ifneq ($(KBUILD_MODPOST_NOFINAL),1) @@ -114,6 +119,13 @@ endif PHONY += FORCE FORCE: +existing-targets := $(wildcard $(sort $(targets))) + +-include $(foreach f,$(existing-targets),$(dir $(f)).$(notdir $(f)).cmd) + +PHONY += FORCE +FORCE: + endif .PHONY: $(PHONY) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 4a2f27d97bf1..b839c48689df 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2375,6 +2375,25 @@ static void add_srcversion(struct buffer *b, struct module *mod) } } +static void write_buf(struct buffer *b, const char *fname) +{ + FILE *file; + + file = fopen(fname, "w"); + if (!file) { + perror(fname); + exit(1); + } + if (fwrite(b->p, 1, b->pos, file) != b->pos) { + perror(fname); + exit(1); + } + if (fclose(file) != 0) { + perror(fname); + exit(1); + } +} + static void write_if_changed(struct buffer *b, const char *fname) { char *tmp; @@ -2407,16 +2426,7 @@ static void write_if_changed(struct buffer *b, const char *fname) close_write: fclose(file); write: - file = fopen(fname, "w"); - if (!file) { - perror(fname); - exit(1); - } - if (fwrite(b->p, 1, b->pos, file) != b->pos) { - perror(fname); - exit(1); - } - fclose(file); + write_buf(b, fname); } /* parse Module.symvers file. 
line format: @@ -2508,7 +2518,7 @@ static void write_dump(const char *fname) symbol = symbol->next; } } - write_if_changed(&buf, fname); + write_buf(&buf, fname); free(buf.p); } From 7e8a3235823bcb779acf92de630edd5ddffaf886 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:13 +0900 Subject: [PATCH 404/427] modpost: show warning if vmlinux is not found when processing modules check_exports() does not print warnings about unresolved symbols if vmlinux is missing because there would be too many. This situation happens when you do 'make modules' from the clean tree, or compile external modules against a kernel tree that has not been completely built. It is dangerous to not check unresolved symbols because you might be building useless modules. At least it should be warned. Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index b839c48689df..3df26789c2e6 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2001,8 +2001,6 @@ static void read_symbols(const char *modname) mod = new_module(modname); - /* When there's no vmlinux, don't print warnings about - * unresolved symbols (since there'll be too many ;) */ if (is_vmlinux(modname)) { have_vmlinux = 1; mod->skip = 1; @@ -2623,6 +2621,13 @@ int main(int argc, char **argv) if (files_source) read_symbols_from_files(files_source); + /* + * When there's no vmlinux, don't print warnings about + * unresolved symbols (since there'll be too many ;) + */ + if (!have_vmlinux) + warn("Symbol info of vmlinux is missing. 
Unresolved symbol check will be entirely skipped.\n"); + err = 0; for (mod = modules; mod; mod = mod->next) { From 48a0f72797bdc6b428f951aff265f5aecc2bda49 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:14 +0900 Subject: [PATCH 405/427] modpost: show warning if any of symbol dump files is missing If modpost fails to load a symbol dump file, it cannot check unresolved symbols, hence module dependency will not be added. Nor can CRCs be added. Currently, external module builds check only $(objtree)/Module.symvers, but it should check files specified by KBUILD_EXTRA_SYMBOLS as well. Move the warning message from the top Makefile to scripts/Makefile.modpost and print the warning if any dump file is missing. Signed-off-by: Masahiro Yamada --- Makefile | 10 +--------- scripts/Makefile.modpost | 5 ++++- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 9768140f8d5a..ee3ed9dfca2c 100644 --- a/Makefile +++ b/Makefile @@ -1649,17 +1649,9 @@ else # KBUILD_EXTMOD # We are always building modules KBUILD_MODULES := 1 -PHONY += $(objtree)/Module.symvers -$(objtree)/Module.symvers: - @test -e $(objtree)/Module.symvers || ( \ - echo; \ - echo " WARNING: Symbol version dump $(objtree)/Module.symvers"; \ - echo " is missing; modules will have no dependencies and modversions."; \ - echo ) - build-dirs := $(KBUILD_EXTMOD) PHONY += modules -modules: descend $(objtree)/Module.symvers +modules: descend $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost PHONY += modules_install diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index f29a02196b72..e47f87557f09 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -98,8 +98,11 @@ ifneq ($(findstring i,$(filter-out --%,$(MAKEFLAGS))),) MODPOST += -n endif +# Clear VPATH to not search for *.symvers in $(srctree). Check only $(objtree). +VPATH := $(input-symdump): - @: + @echo >&2 'WARNING: Symbol version dump "$@" is missing.' 
+ @echo >&2 ' Modules may not have dependencies or modversions.' # Read out modules.order to pass in modpost. # Otherwise, allmodconfig would fail with "Argument list too long". From f693153519607449d3e270d9e6af20b032543c05 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:15 +0900 Subject: [PATCH 406/427] modpost: drop RCS/CVS $Revision handling in MODULE_VERSION() As far as I understood, this code gets rid of '$Revision$' or '$Revision:' of CVS, RCS or whatever in MODULE_VERSION() tags. Remove the primeval code. Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 3 -- scripts/mod/modpost.h | 4 --- scripts/mod/sumversion.c | 66 ---------------------------------------- 3 files changed, 73 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 3df26789c2e6..fbb3d3391e52 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2066,9 +2066,6 @@ static void read_symbols(const char *modname) check_sec_ref(mod, modname, &info); version = get_modinfo(&info, "version"); - if (version) - maybe_frob_rcs_version(modname, version, info.modinfo, - version - (char *)info.hdr); if (version || (all_versions && !is_vmlinux(modname))) get_src_version(modname, mod->srcversion, sizeof(mod->srcversion)-1); diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 933a88c733bc..e5eace03a2b3 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -188,10 +188,6 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, void add_moddevtable(struct buffer *buf, struct module *mod); /* sumversion.c */ -void maybe_frob_rcs_version(const char *modfilename, - char *version, - void *modinfo, - unsigned long modinfo_offset); void get_src_version(const char *modname, char sum[], unsigned sumlen); /* from modpost.c */ diff --git a/scripts/mod/sumversion.c b/scripts/mod/sumversion.c index 63062024ce0e..f27f22420cbc 100644 --- a/scripts/mod/sumversion.c +++ b/scripts/mod/sumversion.c @@ -429,69 +429,3 @@ 
void get_src_version(const char *modname, char sum[], unsigned sumlen) release: release_file(file, len); } - -static void write_version(const char *filename, const char *sum, - unsigned long offset) -{ - int fd; - - fd = open(filename, O_RDWR); - if (fd < 0) { - warn("changing sum in %s failed: %s\n", - filename, strerror(errno)); - return; - } - - if (lseek(fd, offset, SEEK_SET) == (off_t)-1) { - warn("changing sum in %s:%lu failed: %s\n", - filename, offset, strerror(errno)); - goto out; - } - - if (write(fd, sum, strlen(sum)+1) != strlen(sum)+1) { - warn("writing sum in %s failed: %s\n", - filename, strerror(errno)); - goto out; - } -out: - close(fd); -} - -static int strip_rcs_crap(char *version) -{ - unsigned int len, full_len; - - if (strncmp(version, "$Revision", strlen("$Revision")) != 0) - return 0; - - /* Space for version string follows. */ - full_len = strlen(version) + strlen(version + strlen(version) + 1) + 2; - - /* Move string to start with version number: prefix will be - * $Revision$ or $Revision: */ - len = strlen("$Revision"); - if (version[len] == ':' || version[len] == '$') - len++; - while (isspace(version[len])) - len++; - memmove(version, version+len, full_len-len); - full_len -= len; - - /* Preserve up to next whitespace. */ - len = 0; - while (version[len] && !isspace(version[len])) - len++; - memmove(version + len, version + strlen(version), - full_len - strlen(version)); - return 1; -} - -/* Clean up RCS-style version numbers. 
*/ -void maybe_frob_rcs_version(const char *modfilename, - char *version, - void *modinfo, - unsigned long version_offset) -{ - if (strip_rcs_crap(version)) - write_version(modfilename, version, version_offset); -} From 4ddea2f8e825a86e94011ebc32eb1dce220b2316 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:16 +0900 Subject: [PATCH 407/427] modpost: do not call get_modinfo() for vmlinux(.o) The three calls of get_modinfo() ("license", "import_ns", "version") always return NULL for vmlinux(.o) because the built-in module info is prefixed with __MODULE_INFO_PREFIX. It is harmless to call get_modinfo(), but there is no point to search for what apparently does not exist. Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 45 +++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index fbb3d3391e52..a5da633af700 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2006,25 +2006,26 @@ static void read_symbols(const char *modname) mod->skip = 1; } - license = get_modinfo(&info, "license"); - if (!license && !is_vmlinux(modname)) - warn("missing MODULE_LICENSE() in %s\n" - "see include/linux/module.h for " - "more information\n", modname); - while (license) { - if (license_is_gpl_compatible(license)) - mod->gpl_compatible = 1; - else { - mod->gpl_compatible = 0; - break; + if (!is_vmlinux(modname)) { + license = get_modinfo(&info, "license"); + if (!license) + warn("missing MODULE_LICENSE() in %s\n", modname); + while (license) { + if (license_is_gpl_compatible(license)) + mod->gpl_compatible = 1; + else { + mod->gpl_compatible = 0; + break; + } + license = get_next_modinfo(&info, "license", license); } - license = get_next_modinfo(&info, "license", license); - } - namespace = get_modinfo(&info, "import_ns"); - while (namespace) { - add_namespace(&mod->imported_namespaces, namespace); - namespace = get_next_modinfo(&info, 
"import_ns", namespace); + namespace = get_modinfo(&info, "import_ns"); + while (namespace) { + add_namespace(&mod->imported_namespaces, namespace); + namespace = get_next_modinfo(&info, "import_ns", + namespace); + } } for (sym = info.symtab_start; sym < info.symtab_stop; sym++) { @@ -2065,10 +2066,12 @@ static void read_symbols(const char *modname) if (!is_vmlinux(modname) || vmlinux_section_warnings) check_sec_ref(mod, modname, &info); - version = get_modinfo(&info, "version"); - if (version || (all_versions && !is_vmlinux(modname))) - get_src_version(modname, mod->srcversion, - sizeof(mod->srcversion)-1); + if (!is_vmlinux(modname)) { + version = get_modinfo(&info, "version"); + if (version || all_versions) + get_src_version(modname, mod->srcversion, + sizeof(mod->srcversion) - 1); + } parse_elf_finish(&info); From ac5100f54329676469688d1b5415cd8d6428c909 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:17 +0900 Subject: [PATCH 408/427] modpost: add read_text_file() and get_line() helpers modpost uses grab_file() to open a file, but it is not suitable for a text file because the mmap'ed file is not terminated by null byte. Actually, I see some issues for the use of grab_file(). The new helper, read_text_file() loads the whole file content into a malloc'ed buffer, and appends a null byte. Then, get_line() reads each line. 
To handle text files, I intend to replace as follows: grab_file() -> read_text_file() get_next_line() -> get_line() Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 49 +++++++++++++++++++++++++++++++++++++++++++ scripts/mod/modpost.h | 2 ++ 2 files changed, 51 insertions(+) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index a5da633af700..0a844902998e 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -112,6 +112,55 @@ void *do_nofail(void *ptr, const char *expr) return ptr; } +char *read_text_file(const char *filename) +{ + struct stat st; + size_t nbytes; + int fd; + char *buf; + + fd = open(filename, O_RDONLY); + if (fd < 0) { + perror(filename); + exit(1); + } + + if (fstat(fd, &st) < 0) { + perror(filename); + exit(1); + } + + buf = NOFAIL(malloc(st.st_size + 1)); + + nbytes = st.st_size; + + while (nbytes) { + ssize_t bytes_read; + + bytes_read = read(fd, buf, nbytes); + if (bytes_read < 0) { + perror(filename); + exit(1); + } + + nbytes -= bytes_read; + } + buf[st.st_size] = '\0'; + + close(fd); + + return buf; +} + +char *get_line(char **stringp) +{ + /* do not return the unwanted extra line at EOF */ + if (*stringp && **stringp == '\0') + return NULL; + + return strsep(stringp, "\n"); +} + /* A list of all modules we processed */ static struct module *modules; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index e5eace03a2b3..f4412febcd13 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -191,6 +191,8 @@ void add_moddevtable(struct buffer *buf, struct module *mod); void get_src_version(const char *modname, char sum[], unsigned sumlen); /* from modpost.c */ +char *read_text_file(const char *filename); +char *get_line(char **stringp); void *grab_file(const char *filename, unsigned long *size); char* get_next_line(unsigned long *pos, void *file, unsigned long size); void release_file(void *file, unsigned long size); From f531c1b5de65bc687bdcca69e7649fe2db5b6d87 Mon Sep 17 00:00:00 2001 From: 
Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:18 +0900 Subject: [PATCH 409/427] modpost: fix potential mmap'ed file overrun in get_src_version() I do not know how reliably this function works, but it looks dangerous to me. strchr(sources, '\n'); ... continues searching until it finds '\n' or it reaches the '\0' terminator. In other words, 'sources' should be a null-terminated string. However, grab_file() just mmaps a file, so 'sources' is not terminated with null byte. If the file does not contain '\n' at all, strchr() will go beyond the mmap'ed memory. Use read_text_file(), which loads the file content into a malloc'ed buffer, appending null byte. Here we are interested only in the first line of *.mod files. Use get_line() helper to get the first line. This also makes missing *.mod file a fatal error. Commit 4be40e22233c ("kbuild: do not emit src version warning for non-modules") ignored missing *.mod files. I do not fully understand what that commit addressed, but commit 91341d4b2c19 ("kbuild: introduce new option to enhance section mismatch analysis") introduced partial section checks by using modpost. built-in.o was parsed by modpost. Even modules had a problem because *.mod files were created after the modpost check. Commit b7dca6dd1e59 ("kbuild: create *.mod with full directory path and remove MODVERDIR") stopped doing that. Now that modpost is only invoked after the directory descend, *.mod files should always exist at the modpost stage. Signed-off-by: Masahiro Yamada --- scripts/mod/sumversion.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/scripts/mod/sumversion.c b/scripts/mod/sumversion.c index f27f22420cbc..5fb142db6195 100644 --- a/scripts/mod/sumversion.c +++ b/scripts/mod/sumversion.c @@ -392,40 +392,34 @@ out: /* Calc and record src checksum. 
*/ void get_src_version(const char *modname, char sum[], unsigned sumlen) { - void *file; - unsigned long len; + char *buf, *pos, *firstline; struct md4_ctx md; - char *sources, *end, *fname; + char *fname; char filelist[PATH_MAX + 1]; /* objects for a module are listed in the first line of *.mod file. */ snprintf(filelist, sizeof(filelist), "%.*smod", (int)strlen(modname) - 1, modname); - file = grab_file(filelist, &len); - if (!file) - /* not a module or .mod file missing - ignore */ - return; + buf = read_text_file(filelist); - sources = file; - - end = strchr(sources, '\n'); - if (!end) { + pos = buf; + firstline = get_line(&pos); + if (!firstline) { warn("bad ending versions file for %s\n", modname); - goto release; + goto free; } - *end = '\0'; md4_init(&md); - while ((fname = strsep(&sources, " ")) != NULL) { + while ((fname = strsep(&firstline, " "))) { if (!*fname) continue; if (!(is_static_library(fname)) && !parse_source_files(fname, &md)) - goto release; + goto free; } md4_final_ascii(&md, sum, sumlen); -release: - release_file(file, len); +free: + free(buf); } From 7c8f5662c502b7b967399fef8a64532ec43b063d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:19 +0900 Subject: [PATCH 410/427] modpost: avoid false-positive file open error One problem of grab_file() is that it cannot distinguish the following two cases: - It cannot read the file (the file does not exist, or read permission is not set) - It can read the file, but the file size is zero This is because grab_file() calls mmap(), which requires the mapped length is greater than 0. Hence, grab_file() fails for both cases. If an empty header file were included for checksum calculation, the following warning would be printed: WARNING: modpost: could not open ...: Invalid argument An empty file is a valid source file, so it should not fail. Use read_text_file() instead. It can read a zero-length file. Then, parse_file() will succeed with doing nothing. 
Going forward, the first case (it cannot read the file) is a fatal error. If the source file from which an object was compiled is missing, something went wrong. Signed-off-by: Masahiro Yamada --- scripts/mod/sumversion.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/mod/sumversion.c b/scripts/mod/sumversion.c index 5fb142db6195..9f77c9dfce20 100644 --- a/scripts/mod/sumversion.c +++ b/scripts/mod/sumversion.c @@ -258,9 +258,8 @@ static int parse_file(const char *fname, struct md4_ctx *md) char *file; unsigned long i, len; - file = grab_file(fname, &len); - if (!file) - return 0; + file = read_text_file(fname); + len = strlen(file); for (i = 0; i < len; i++) { /* Collapse and ignore \ and CR. */ @@ -287,7 +286,7 @@ static int parse_file(const char *fname, struct md4_ctx *md) add_char(file[i], md); } - release_file(file, len); + free(file); return 1; } /* Check whether the file is a static library or not */ From 70f30cfe5b892fcb7f98e7df72ed6ccfe3225628 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:20 +0900 Subject: [PATCH 411/427] modpost: use read_text_file() and get_line() for reading text files grab_file() mmaps a file, but it is not so efficient here because get_next_line() copies every line to the temporary buffer anyway. read_text_file() and get_line() are simpler. get_line() exploits the library function strchr(). Going forward, the missing *.symvers or *.cmd is a fatal error. This should not happen because scripts/Makefile.modpost guards the -i option files with $(wildcard $(input-symdump)). 
Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 15 ++++++++------- scripts/mod/sumversion.c | 16 ++++++---------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 0a844902998e..4fdf992e9729 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2481,15 +2481,16 @@ static void write_if_changed(struct buffer *b, const char *fname) **/ static void read_dump(const char *fname) { - unsigned long size, pos = 0; - void *file = grab_file(fname, &size); - char *line; + char *buf, *pos, *line; - if (!file) + buf = read_text_file(fname); + if (!buf) /* No symbol versions, silently ignore */ return; - while ((line = get_next_line(&pos, file, size))) { + pos = buf; + + while ((line = get_line(&pos))) { char *symname, *namespace, *modname, *d, *export; unsigned int crc; struct module *mod; @@ -2524,10 +2525,10 @@ static void read_dump(const char *fname) sym_set_crc(symname, crc); sym_update_namespace(symname, namespace); } - release_file(file, size); + free(buf); return; fail: - release_file(file, size); + free(buf); fatal("parse error in symbol dump file\n"); } diff --git a/scripts/mod/sumversion.c b/scripts/mod/sumversion.c index 9f77c9dfce20..d587f40f1117 100644 --- a/scripts/mod/sumversion.c +++ b/scripts/mod/sumversion.c @@ -303,9 +303,8 @@ static int is_static_library(const char *objfile) * to figure out source files. 
*/ static int parse_source_files(const char *objfile, struct md4_ctx *md) { - char *cmd, *file, *line, *dir; + char *cmd, *file, *line, *dir, *pos; const char *base; - unsigned long flen, pos = 0; int dirlen, ret = 0, check_files = 0; cmd = NOFAIL(malloc(strlen(objfile) + sizeof("..cmd"))); @@ -323,14 +322,12 @@ static int parse_source_files(const char *objfile, struct md4_ctx *md) strncpy(dir, objfile, dirlen); dir[dirlen] = '\0'; - file = grab_file(cmd, &flen); - if (!file) { - warn("could not find %s for %s\n", cmd, objfile); - goto out; - } + file = read_text_file(cmd); + + pos = file; /* Sum all files in the same dir or subdirs. */ - while ((line = get_next_line(&pos, file, flen)) != NULL) { + while ((line = get_line(&pos))) { char* p = line; if (strncmp(line, "source_", sizeof("source_")-1) == 0) { @@ -381,8 +378,7 @@ static int parse_source_files(const char *objfile, struct md4_ctx *md) /* Everyone parsed OK */ ret = 1; out_file: - release_file(file, flen); -out: + free(file); free(dir); free(cmd); return ret; From 75893572d45399cefbb88443d0878adae9cb0b41 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:21 +0900 Subject: [PATCH 412/427] modpost: remove get_next_text() and make {grab,release_}file static get_next_line() is no longer used. Remove. grab_file() and release_file() are only used in modpost.c. Make them static. 
Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 38 ++------------------------------------ scripts/mod/modpost.h | 3 --- 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 4fdf992e9729..93019349f022 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -463,7 +463,7 @@ static void sym_set_crc(const char *name, unsigned int crc) s->crc_valid = 1; } -void *grab_file(const char *filename, unsigned long *size) +static void *grab_file(const char *filename, unsigned long *size) { struct stat st; void *map = MAP_FAILED; @@ -485,41 +485,7 @@ failed: return map; } -/** - * Return a copy of the next line in a mmap'ed file. - * spaces in the beginning of the line is trimmed away. - * Return a pointer to a static buffer. - **/ -char *get_next_line(unsigned long *pos, void *file, unsigned long size) -{ - static char line[4096]; - int skip = 1; - size_t len = 0; - signed char *p = (signed char *)file + *pos; - char *s = line; - - for (; *pos < size ; (*pos)++) { - if (skip && isspace(*p)) { - p++; - continue; - } - skip = 0; - if (*p != '\n' && (*pos < size)) { - len++; - *s++ = *p++; - if (len > 4095) - break; /* Too long, stop */ - } else { - /* End of string */ - *s = '\0'; - return line; - } - } - /* End of buffer */ - return NULL; -} - -void release_file(void *file, unsigned long size) +static void release_file(void *file, unsigned long size) { munmap(file, size); } diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index f4412febcd13..bc524506d2f9 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -193,9 +193,6 @@ void get_src_version(const char *modname, char sum[], unsigned sumlen); /* from modpost.c */ char *read_text_file(const char *filename); char *get_line(char **stringp); -void *grab_file(const char *filename, unsigned long *size); -char* get_next_line(unsigned long *pos, void *file, unsigned long size); -void release_file(void *file, unsigned long size); 
enum loglevel { LOG_WARN, From 467b82d7cee4373aa7bc47fd3043e2fa0a3440f5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:22 +0900 Subject: [PATCH 413/427] modpost: remove -s option The -s option was added by commit 8d8d8289df65 ("kbuild: do not do section mismatch checks on vmlinux in 2nd pass"). Now that the second pass does not parse vmlinux, this option is unneeded. Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 2 +- scripts/mod/modpost.c | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index e47f87557f09..4938a6f368c0 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -62,7 +62,7 @@ __modpost: vmlinux.symvers else -MODPOST += -s \ +MODPOST += \ $(if $(KBUILD_NSDEPS),-d $(MODULES_NSDEPS)) ifeq ($(KBUILD_EXTMOD),) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 93019349f022..b667f531a645 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -30,8 +30,6 @@ static int have_vmlinux = 0; static int all_versions = 0; /* If we are modposting external module set to 1 */ static int external_module = 0; -/* Warn about section mismatch in vmlinux if set to 1 */ -static int vmlinux_section_warnings = 1; /* Only warn about unresolved symbols */ static int warn_unresolved = 0; /* How a symbol is exported */ @@ -2078,8 +2076,7 @@ static void read_symbols(const char *modname) } } - if (!is_vmlinux(modname) || vmlinux_section_warnings) - check_sec_ref(mod, modname, &info); + check_sec_ref(mod, modname, &info); if (!is_vmlinux(modname)) { version = get_modinfo(&info, "version"); @@ -2576,7 +2573,7 @@ int main(int argc, char **argv) struct dump_list *dump_read_start = NULL; struct dump_list **dump_read_iter = &dump_read_start; - while ((opt = getopt(argc, argv, "ei:mnsT:o:awENd:")) != -1) { + while ((opt = getopt(argc, argv, "ei:mnT:o:awENd:")) != -1) { switch (opt) { case 'e': external_module = 1; @@ -2599,9 
+2596,6 @@ int main(int argc, char **argv) case 'a': all_versions = 1; break; - case 's': - vmlinux_section_warnings = 0; - break; case 'T': files_source = optarg; break; From 859c926aea29353bced3a456c2f73753040b437e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:23 +0900 Subject: [PATCH 414/427] modpost: move -d option in scripts/Makefile.modpost Collect options for modules into a single place. Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 4938a6f368c0..3651cbf6ad49 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -62,9 +62,6 @@ __modpost: vmlinux.symvers else -MODPOST += \ - $(if $(KBUILD_NSDEPS),-d $(MODULES_NSDEPS)) - ifeq ($(KBUILD_EXTMOD),) input-symdump := vmlinux.symvers @@ -91,6 +88,7 @@ endif # modpost options for modules (both in-kernel and external) MODPOST += \ $(addprefix -i ,$(wildcard $(input-symdump))) \ + $(if $(KBUILD_NSDEPS),-d $(MODULES_NSDEPS)) \ $(if $(CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS)$(KBUILD_NSDEPS),-N) # 'make -i -k' ignores compile errors, and builds as many modules as possible. From 3379576dd6e708f66498d49b4cec5f9b198791a0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:24 +0900 Subject: [PATCH 415/427] modpost: remove mod->is_dot_o struct member Previously, there were two cases where mod->is_dot_o is unset: [1] the executable 'vmlinux' in the second pass of modpost [2] modules loaded by read_dump() I think [1] was intended usage to distinguish 'vmlinux.o' and 'vmlinux'. Now that modpost does not parse the executable 'vmlinux', this case does not happen. [2] is obscure, maybe a bug. Module.symver stores module paths without extension. So, none of modules loaded by read_dump() has the .o suffix, and new_module() unsets ->is_dot_o. Anyway, it is not a big deal because handle_symbol() is not called for the case. 
To sum up, all the parsed ELF files are .o files. mod->is_dot_o is unneeded. Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 14 ++------------ scripts/mod/modpost.h | 1 - 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index b667f531a645..bc00bbac50bb 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -182,10 +182,8 @@ static struct module *new_module(const char *modname) p = NOFAIL(strdup(modname)); /* strip trailing .o */ - if (strends(p, ".o")) { + if (strends(p, ".o")) p[strlen(p) - 2] = '\0'; - mod->is_dot_o = 1; - } /* add to list */ mod->name = p; @@ -716,8 +714,7 @@ static void handle_symbol(struct module *mod, struct elf_info *info, enum export export; const char *name; - if ((!is_vmlinux(mod->name) || mod->is_dot_o) && - strstarts(symname, "__ksymtab")) + if (strstarts(symname, "__ksymtab")) export = export_from_secname(info, get_secindex(info, sym)); else export = export_from_sec(info, get_secindex(info, sym)); @@ -2676,13 +2673,6 @@ int main(int argc, char **argv) struct symbol *s; for (s = symbolhash[n]; s; s = s->next) { - /* - * Do not check "vmlinux". This avoids the same warnings - * shown twice, and false-positives for ARCH=um. 
- */ - if (is_vmlinux(s->module->name) && !s->module->is_dot_o) - continue; - if (s->is_static) warn("\"%s\" [%s] is a static %s\n", s->name, s->module->name, diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index bc524506d2f9..68d813abf33d 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -126,7 +126,6 @@ struct module { int has_cleanup; struct buffer dev_table_buf; char srcversion[25]; - int is_dot_o; // Missing namespace dependencies struct namespace_list *missing_namespaces; // Actual imported namespaces From 1be5fa6c948533bb95ac783010ef686261be5384 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:25 +0900 Subject: [PATCH 416/427] modpost: remove is_vmlinux() call in check_for_{gpl_usage,unused}() check_exports() is never called for vmlinux because mod->skip is set for vmlinux. Hence, check_for_gpl_usage() and check_for_unused() are not called for vmlinux, either. is_vmlinux() is always false here. Remove the is_vmlinux() calls, and hard-code the ".ko" suffix. 
Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index bc00bbac50bb..84a642c14775 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2144,20 +2144,18 @@ void buf_write(struct buffer *buf, const char *s, int len) static void check_for_gpl_usage(enum export exp, const char *m, const char *s) { - const char *e = is_vmlinux(m) ?"":".ko"; - switch (exp) { case export_gpl: - fatal("GPL-incompatible module %s%s " - "uses GPL-only symbol '%s'\n", m, e, s); + fatal("GPL-incompatible module %s.ko uses GPL-only symbol '%s'\n", + m, s); break; case export_unused_gpl: - fatal("GPL-incompatible module %s%s " - "uses GPL-only symbol marked UNUSED '%s'\n", m, e, s); + fatal("GPL-incompatible module %s.ko uses GPL-only symbol marked UNUSED '%s'\n", + m, s); break; case export_gpl_future: - warn("GPL-incompatible module %s%s " - "uses future GPL-only symbol '%s'\n", m, e, s); + warn("GPL-incompatible module %s.ko uses future GPL-only symbol '%s'\n", + m, s); break; case export_plain: case export_unused: @@ -2169,13 +2167,11 @@ static void check_for_gpl_usage(enum export exp, const char *m, const char *s) static void check_for_unused(enum export exp, const char *m, const char *s) { - const char *e = is_vmlinux(m) ?"":".ko"; - switch (exp) { case export_unused: case export_unused_gpl: - warn("module %s%s " - "uses symbol '%s' marked UNUSED\n", m, e, s); + warn("module %s.ko uses symbol '%s' marked UNUSED\n", + m, s); break; default: /* ignore */ From 5a438af9db2c4a0b80d51d8c1c9c623b0c0de967 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:26 +0900 Subject: [PATCH 417/427] modpost: add mod->is_vmlinux struct member is_vmlinux() is called in several places to check whether the current module is vmlinux or not. It is faster and clearer to check mod->is_vmlinux flag. 
Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 19 ++++++++++--------- scripts/mod/modpost.h | 1 + 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 84a642c14775..167700a7b80f 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -187,6 +187,7 @@ static struct module *new_module(const char *modname) /* add to list */ mod->name = p; + mod->is_vmlinux = is_vmlinux(modname); mod->gpl_compatible = -1; mod->next = modules; modules = mod; @@ -431,11 +432,11 @@ static struct symbol *sym_add_exported(const char *name, struct module *mod, if (!s) { s = new_symbol(name, mod, export); - } else if (!external_module || is_vmlinux(s->module->name) || + } else if (!external_module || s->module->is_vmlinux || s->module == mod) { warn("%s: '%s' exported twice. Previous export was in %s%s\n", mod->name, name, s->module->name, - is_vmlinux(s->module->name) ? "" : ".ko"); + s->module->is_vmlinux ? "" : ".ko"); return s; } @@ -692,7 +693,7 @@ static void handle_modversion(const struct module *mod, if (sym->st_shndx == SHN_UNDEF) { warn("EXPORT symbol \"%s\" [%s%s] version generation failed, symbol will not be versioned.\n", - symname, mod->name, is_vmlinux(mod->name) ? "":".ko"); + symname, mod->name, mod->is_vmlinux ? 
"" : ".ko"); return; } @@ -2011,12 +2012,12 @@ static void read_symbols(const char *modname) mod = new_module(modname); - if (is_vmlinux(modname)) { + if (mod->is_vmlinux) { have_vmlinux = 1; mod->skip = 1; } - if (!is_vmlinux(modname)) { + if (!mod->is_vmlinux) { license = get_modinfo(&info, "license"); if (!license) warn("missing MODULE_LICENSE() in %s\n", modname); @@ -2075,7 +2076,7 @@ static void read_symbols(const char *modname) check_sec_ref(mod, modname, &info); - if (!is_vmlinux(modname)) { + if (!mod->is_vmlinux) { version = get_modinfo(&info, "version"); if (version || all_versions) get_src_version(modname, mod->srcversion, @@ -2345,7 +2346,7 @@ static void add_depends(struct buffer *b, struct module *mod) /* Clear ->seen flag of modules that own symbols needed by this. */ for (s = mod->unres; s; s = s->next) if (s->module) - s->module->seen = is_vmlinux(s->module->name); + s->module->seen = s->module->is_vmlinux; buf_printf(b, "\n"); buf_printf(b, "MODULE_INFO(depends, \""); @@ -2470,9 +2471,9 @@ static void read_dump(const char *fname) goto fail; mod = find_module(modname); if (!mod) { - if (is_vmlinux(modname)) - have_vmlinux = 1; mod = new_module(modname); + if (mod->is_vmlinux) + have_vmlinux = 1; mod->skip = 1; mod->from_dump = 1; } diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 68d813abf33d..87251729539e 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -120,6 +120,7 @@ struct module { int gpl_compatible; struct symbol *unres; int from_dump; /* 1 if module was loaded from *.symvers */ + int is_vmlinux; int seen; int skip; int has_init; From 0b19d54cae11bd5b9e208f52e42d88ad33a3b1d9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:27 +0900 Subject: [PATCH 418/427] modpost: remove mod->skip struct member The meaning of 'skip' is obscure since it does not explain "what to skip". mod->skip is set when it is vmlinux or the module info came from a dump file. 
So, mod->skip is equivalent to (mod->is_vmlinux || mod->from_dump). For the check in write_namespace_deps_files(), mod->is_vmlinux is unneeded because the -d option is not passed in the first pass of modpost. Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 9 +++------ scripts/mod/modpost.h | 1 - 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 167700a7b80f..925c1a1856aa 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2012,10 +2012,8 @@ static void read_symbols(const char *modname) mod = new_module(modname); - if (mod->is_vmlinux) { + if (mod->is_vmlinux) have_vmlinux = 1; - mod->skip = 1; - } if (!mod->is_vmlinux) { license = get_modinfo(&info, "license"); @@ -2474,7 +2472,6 @@ static void read_dump(const char *fname) mod = new_module(modname); if (mod->is_vmlinux) have_vmlinux = 1; - mod->skip = 1; mod->from_dump = 1; } s = sym_add_exported(symname, mod, export_no(export)); @@ -2535,7 +2532,7 @@ static void write_namespace_deps_files(const char *fname) for (mod = modules; mod; mod = mod->next) { - if (mod->skip || !mod->missing_namespaces) + if (mod->from_dump || !mod->missing_namespaces) continue; buf_printf(&ns_deps_buf, "%s.ko:", mod->name); @@ -2637,7 +2634,7 @@ int main(int argc, char **argv) for (mod = modules; mod; mod = mod->next) { char fname[PATH_MAX]; - if (mod->skip) + if (mod->is_vmlinux || mod->from_dump) continue; buf.pos = 0; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 87251729539e..3dc9e8fa5d1f 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -122,7 +122,6 @@ struct module { int from_dump; /* 1 if module was loaded from *.symvers */ int is_vmlinux; int seen; - int skip; int has_init; int has_cleanup; struct buffer dev_table_buf; From 858b937d289bbf7551d496100c1fa9efcad5796e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:28 +0900 Subject: [PATCH 419/427] modpost: set have_vmlinux in 
new_module() Set have_vmlinux flag in a single place. Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 925c1a1856aa..b317328ae21b 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -192,6 +192,9 @@ static struct module *new_module(const char *modname) mod->next = modules; modules = mod; + if (mod->is_vmlinux) + have_vmlinux = 1; + return mod; } @@ -2012,9 +2015,6 @@ static void read_symbols(const char *modname) mod = new_module(modname); - if (mod->is_vmlinux) - have_vmlinux = 1; - if (!mod->is_vmlinux) { license = get_modinfo(&info, "license"); if (!license) @@ -2470,8 +2470,6 @@ static void read_dump(const char *fname) mod = find_module(modname); if (!mod) { mod = new_module(modname); - if (mod->is_vmlinux) - have_vmlinux = 1; mod->from_dump = 1; } s = sym_add_exported(symname, mod, export_no(export)); From a82f794c41ab51f088af325f5d9acba30a6facdb Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:29 +0900 Subject: [PATCH 420/427] modpost: strip .o from modname before calling new_module() new_module() conditionally strips the .o because the modname has .o suffix when it is called from read_symbols(), but no .o when it is called from read_dump(). It is clearer to strip .o in read_symbols(). I also used flexible-array for mod->name. 
Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 20 +++++++++++--------- scripts/mod/modpost.h | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index b317328ae21b..ebfa9b76ba92 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -175,18 +175,12 @@ static struct module *find_module(const char *modname) static struct module *new_module(const char *modname) { struct module *mod; - char *p; - mod = NOFAIL(malloc(sizeof(*mod))); + mod = NOFAIL(malloc(sizeof(*mod) + strlen(modname) + 1)); memset(mod, 0, sizeof(*mod)); - p = NOFAIL(strdup(modname)); - - /* strip trailing .o */ - if (strends(p, ".o")) - p[strlen(p) - 2] = '\0'; /* add to list */ - mod->name = p; + strcpy(mod->name, modname); mod->is_vmlinux = is_vmlinux(modname); mod->gpl_compatible = -1; mod->next = modules; @@ -2013,7 +2007,15 @@ static void read_symbols(const char *modname) if (!parse_elf(&info, modname)) return; - mod = new_module(modname); + { + char *tmp; + + /* strip trailing .o */ + tmp = NOFAIL(strdup(modname)); + tmp[strlen(tmp) - 2] = '\0'; + mod = new_module(tmp); + free(tmp); + } if (!mod->is_vmlinux) { license = get_modinfo(&info, "license"); diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 3dc9e8fa5d1f..254c75378583 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -116,7 +116,6 @@ struct namespace_list { struct module { struct module *next; - const char *name; int gpl_compatible; struct symbol *unres; int from_dump; /* 1 if module was loaded from *.symvers */ @@ -130,6 +129,7 @@ struct module { struct namespace_list *missing_namespaces; // Actual imported namespaces struct namespace_list *imported_namespaces; + char name[]; }; struct elf_info { From 4de7b62936122570408357417f21072e78292926 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:30 +0900 Subject: [PATCH 421/427] modpost: remove is_vmlinux() helper Now that is_vmlinux() is called only 
in new_module(), we can inline the function call. modname is the basename with '.o' stripped. No need to compare it with 'vmlinux.o'. vmlinux is always located in the current working directory. No need to strip the directory path. Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index ebfa9b76ba92..a3ffabf4eca5 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -88,20 +88,6 @@ static inline bool strends(const char *str, const char *postfix) return strcmp(str + strlen(str) - strlen(postfix), postfix) == 0; } -static int is_vmlinux(const char *modname) -{ - const char *myname; - - myname = strrchr(modname, '/'); - if (myname) - myname++; - else - myname = modname; - - return (strcmp(myname, "vmlinux") == 0) || - (strcmp(myname, "vmlinux.o") == 0); -} - void *do_nofail(void *ptr, const char *expr) { if (!ptr) @@ -181,7 +167,7 @@ static struct module *new_module(const char *modname) /* add to list */ strcpy(mod->name, modname); - mod->is_vmlinux = is_vmlinux(modname); + mod->is_vmlinux = (strcmp(modname, "vmlinux") == 0); mod->gpl_compatible = -1; mod->next = modules; modules = mod; From 3b09efc4f0c94669a928c0453d2dcb54c59543f2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jun 2020 14:57:31 +0900 Subject: [PATCH 422/427] modpost: change elf_info->size to size_t Align with the mmap / munmap APIs. 
Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 9 ++++----- scripts/mod/modpost.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index a3ffabf4eca5..e5cee2367d5e 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -443,7 +443,7 @@ static void sym_set_crc(const char *name, unsigned int crc) s->crc_valid = 1; } -static void *grab_file(const char *filename, unsigned long *size) +static void *grab_file(const char *filename, size_t *size) { struct stat st; void *map = MAP_FAILED; @@ -465,7 +465,7 @@ failed: return map; } -static void release_file(void *file, unsigned long size) +static void release_file(void *file, size_t size) { munmap(file, size); } @@ -521,9 +521,8 @@ static int parse_elf(struct elf_info *info, const char *filename) /* Check if file offset is correct */ if (hdr->e_shoff > info->size) { - fatal("section header offset=%lu in file '%s' is bigger than " - "filesize=%lu\n", (unsigned long)hdr->e_shoff, - filename, info->size); + fatal("section header offset=%lu in file '%s' is bigger than filesize=%zu\n", + (unsigned long)hdr->e_shoff, filename, info->size); return 0; } diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 254c75378583..3aa052722233 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -133,7 +133,7 @@ struct module { }; struct elf_info { - unsigned long size; + size_t size; Elf_Ehdr *hdr; Elf_Shdr *sechdrs; Elf_Sym *symtab_start; From c0901577e1dcc8d1c0fd1a11c8d571f650df845f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 2 Jun 2020 02:03:28 +0900 Subject: [PATCH 423/427] kbuild: doc: rename LDFLAGS to KBUILD_LDFLAGS Commit d503ac531a52 ("kbuild: rename LDFLAGS to KBUILD_LDFLAGS") failed to update the documentation. 
Signed-off-by: Masahiro Yamada --- Documentation/kbuild/makefiles.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index 2a18aea7c043..6515ebc12b6f 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -910,7 +910,7 @@ When kbuild executes, the following steps are followed (roughly): 7.1 Set variables to tweak the build to the architecture -------------------------------------------------------- - LDFLAGS + KBUILD_LDFLAGS Generic $(LD) options Flags used for all invocations of the linker. @@ -919,7 +919,7 @@ When kbuild executes, the following steps are followed (roughly): Example:: #arch/s390/Makefile - LDFLAGS := -m elf_s390 + KBUILD_LDFLAGS := -m elf_s390 Note: ldflags-y can be used to further customise the flags used. See chapter 3.7. From 72d24accf02add25e08733f0ecc93cf10fcbd88c Mon Sep 17 00:00:00 2001 From: ashimida Date: Tue, 2 Jun 2020 15:45:17 +0800 Subject: [PATCH 424/427] mksysmap: Fix the mismatch of '.L' symbols in System.map When System.map was generated, the kernel used mksysmap to filter the kernel symbols, but all the symbols with the second letter 'L' in the kernel were filtered out, not just the symbols starting with 'dot + L'. For example: ashimida@ubuntu:~/linux$ cat System.map |grep ' .L' ashimida@ubuntu:~/linux$ nm -n vmlinux |grep ' .L' ffff0000088028e0 t bLength_show ...... ffff0000092e0408 b PLLP_OUTC_lock ffff0000092e0410 b PLLP_OUTA_lock The original intent should be to filter out all local symbols starting with '.L', so the dot should be escaped. 
Fixes: 00902e984732 ("mksysmap: Add h8300 local symbol pattern") Signed-off-by: ashimida Signed-off-by: Masahiro Yamada --- scripts/mksysmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/mksysmap b/scripts/mksysmap index a35acc0d0b82..9aa23d15862a 100755 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -41,4 +41,4 @@ # so we just ignore them to let readprofile continue to work. # (At least sparc64 has __crc_ in the middle). -$NM -n $1 | grep -v '\( [aNUw] \)\|\(__crc_\)\|\( \$[adt]\)\|\( .L\)' > $2 +$NM -n $1 | grep -v '\( [aNUw] \)\|\(__crc_\)\|\( \$[adt]\)\|\( \.L\)' > $2 From e0b250b57dcf403529081e5898a9de717f96b76b Mon Sep 17 00:00:00 2001 From: Jonas Zeiger Date: Wed, 3 Jun 2020 15:34:05 +0200 Subject: [PATCH 425/427] Makefile: install modules.builtin even if CONFIG_MODULES=n Many applications check for available kernel features via: - /proc/modules (loaded modules, present if CONFIG_MODULES=y) - $(MODLIB)/modules.builtin (builtin modules) They fail to detect features if the kernel was built with CONFIG_MODULES=n and modules.builtin isn't installed. Therefore, add the target "_builtin_inst_" and make "install" and "modules_install" depend on it. 
Test results: - make install: kernel image is copied as before, modules.builtin copied - make modules_install: (CONFIG_MODULES=n) nothing is copied, exit 1 Signed-off-by: Jonas Zeiger Signed-off-by: Masahiro Yamada --- Makefile | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index ee3ed9dfca2c..7b750dc0b2da 100644 --- a/Makefile +++ b/Makefile @@ -1322,6 +1322,16 @@ dt_binding_check: scripts_dtc # --------------------------------------------------------------------------- # Modules +# install modules.builtin regardless of CONFIG_MODULES +PHONY += _builtin_inst_ +_builtin_inst_: + @mkdir -p $(MODLIB)/ + @cp -f modules.builtin $(MODLIB)/ + @cp -f $(objtree)/modules.builtin.modinfo $(MODLIB)/ + +PHONY += install +install: _builtin_inst_ + ifdef CONFIG_MODULES # By default, build modules as well @@ -1365,7 +1375,7 @@ PHONY += modules_install modules_install: _modinst_ _modinst_post PHONY += _modinst_ -_modinst_: +_modinst_: _builtin_inst_ @rm -rf $(MODLIB)/kernel @rm -f $(MODLIB)/source @mkdir -p $(MODLIB)/kernel @@ -1375,8 +1385,6 @@ _modinst_: ln -s $(CURDIR) $(MODLIB)/build ; \ fi @sed 's:^:kernel/:' modules.order > $(MODLIB)/modules.order - @cp -f modules.builtin $(MODLIB)/ - @cp -f $(objtree)/modules.builtin.modinfo $(MODLIB)/ $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst # This depmod is only for convenience to give the initial From 8dfb61dcbaceb19a5ded5e9c9dcf8d05acc32294 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 5 Jun 2020 10:39:55 +0300 Subject: [PATCH 426/427] kbuild: add variables for compression tools Allow the user to use alternative implementations of compression tools, such as pigz, pbzip2, pxz. For example, multi-threaded tools to speed up the build: $ make GZIP=pigz BZIP2=pbzip2 Variables _GZIP, _BZIP2, _LZOP are used internally because original env vars are reserved by the tools. The use of GZIP in the gzip tool has been obsolete since 2015. 
However, alternative implementations (e.g., pigz) still rely on it. BZIP2, BZIP, LZOP vars are not obsolescent. The credit goes to @grsecurity. As a sidenote, for multi-threaded lzma, xz compression one can use: $ export XZ_OPT="--threads=0" Signed-off-by: Denis Efremov Signed-off-by: Masahiro Yamada --- Makefile | 25 +++++++++++++++++++++++-- arch/arm/boot/deflate_xip_data.sh | 2 +- arch/ia64/Makefile | 2 +- arch/m68k/Makefile | 8 ++++---- arch/parisc/Makefile | 2 +- kernel/gen_kheaders.sh | 2 +- scripts/Makefile.lib | 12 ++++++------ scripts/Makefile.package | 8 ++++---- scripts/package/buildtar | 6 +++--- scripts/xz_wrap.sh | 2 +- 10 files changed, 45 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 7b750dc0b2da..64ebbc1dfbef 100644 --- a/Makefile +++ b/Makefile @@ -458,6 +458,26 @@ PYTHON = python PYTHON3 = python3 CHECK = sparse BASH = bash +GZIP = gzip +BZIP2 = bzip2 +LZOP = lzop +LZMA = lzma +LZ4 = lz4c +XZ = xz + +# GZIP, BZIP2, LZOP env vars are used by the tools. Support them as the command +# line interface, but use _GZIP, _BZIP2, _LZOP internally. 
+_GZIP := $(GZIP) +_BZIP2 := $(BZIP2) +_LZOP := $(LZOP) + +# Reset GZIP, BZIP2, LZOP in this Makefile +override GZIP= +override BZIP2= +override LZOP= + +# Reset GZIP, BZIP2, LZOP in recursive invocations +MAKEOVERRIDES += GZIP= BZIP2= LZOP= CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \ -Wbitwise -Wno-return-void -Wno-unknown-attribute $(CF) @@ -506,6 +526,7 @@ CLANG_FLAGS := export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC export CPP AR NM STRIP OBJCOPY OBJDUMP OBJSIZE READELF PAHOLE LEX YACC AWK INSTALLKERNEL export PERL PYTHON PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX +export _GZIP _BZIP2 _LZOP LZMA LZ4 XZ export KBUILD_HOSTCXXFLAGS KBUILD_HOSTLDFLAGS KBUILD_HOSTLDLIBS LDFLAGS_MODULE export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS KBUILD_LDFLAGS @@ -1020,10 +1041,10 @@ export mod_strip_cmd mod_compress_cmd = true ifdef CONFIG_MODULE_COMPRESS ifdef CONFIG_MODULE_COMPRESS_GZIP - mod_compress_cmd = gzip -n -f + mod_compress_cmd = $(_GZIP) -n -f endif # CONFIG_MODULE_COMPRESS_GZIP ifdef CONFIG_MODULE_COMPRESS_XZ - mod_compress_cmd = xz -f + mod_compress_cmd = $(XZ) -f endif # CONFIG_MODULE_COMPRESS_XZ endif # CONFIG_MODULE_COMPRESS export mod_compress_cmd diff --git a/arch/arm/boot/deflate_xip_data.sh b/arch/arm/boot/deflate_xip_data.sh index 40937248cebe..739f0464321e 100755 --- a/arch/arm/boot/deflate_xip_data.sh +++ b/arch/arm/boot/deflate_xip_data.sh @@ -56,7 +56,7 @@ trap 'rm -f "$XIPIMAGE.tmp"; exit 1' 1 2 3 # substitute the data section by a compressed version $DD if="$XIPIMAGE" count=$data_start iflag=count_bytes of="$XIPIMAGE.tmp" $DD if="$XIPIMAGE" skip=$data_start iflag=skip_bytes | -gzip -9 >> "$XIPIMAGE.tmp" +$_GZIP -9 >> "$XIPIMAGE.tmp" # replace kernel binary mv -f "$XIPIMAGE.tmp" "$XIPIMAGE" diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 32240000dc0c..f817f3d5e758 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -40,7 +40,7 @@ $(error Sorry, you need a 
newer version of the assember, one that is built from endif quiet_cmd_gzip = GZIP $@ -cmd_gzip = cat $(real-prereqs) | gzip -n -f -9 > $@ +cmd_gzip = cat $(real-prereqs) | $(_GZIP) -n -f -9 > $@ quiet_cmd_objcopy = OBJCOPY $@ cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@ diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile index 5d9288384096..ce6db5e5a5a3 100644 --- a/arch/m68k/Makefile +++ b/arch/m68k/Makefile @@ -135,10 +135,10 @@ vmlinux.gz: vmlinux ifndef CONFIG_KGDB cp vmlinux vmlinux.tmp $(STRIP) vmlinux.tmp - gzip -9c vmlinux.tmp >vmlinux.gz + $(_GZIP) -9c vmlinux.tmp >vmlinux.gz rm vmlinux.tmp else - gzip -9c vmlinux >vmlinux.gz + $(_GZIP) -9c vmlinux >vmlinux.gz endif bzImage: vmlinux.bz2 @@ -148,10 +148,10 @@ vmlinux.bz2: vmlinux ifndef CONFIG_KGDB cp vmlinux vmlinux.tmp $(STRIP) vmlinux.tmp - bzip2 -1c vmlinux.tmp >vmlinux.bz2 + $(_BZIP2) -1c vmlinux.tmp >vmlinux.bz2 rm vmlinux.tmp else - bzip2 -1c vmlinux >vmlinux.bz2 + $(_BZIP2) -1c vmlinux >vmlinux.bz2 endif archclean: diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index 628cd8bb7ad8..e1aa514aeb36 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -162,7 +162,7 @@ vmlinuz: bzImage $(OBJCOPY) $(boot)/bzImage $@ else vmlinuz: vmlinux - @gzip -cf -9 $< > $@ + @$(_GZIP) -cf -9 $< > $@ endif install: diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index e13ca842eb7e..c1510f0ab3ea 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -88,7 +88,7 @@ find $cpio_dir -type f -print0 | find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ --owner=0 --group=0 --numeric-owner --no-recursion \ - -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null + -I $XZ -cf $tarfile -C $cpio_dir/ -T - > /dev/null echo $headers_md5 > kernel/kheaders.md5 echo "$this_file_md5" >> kernel/kheaders.md5 diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index e598b07e6de4..127f2a7e3ced 100644 --- 
a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -244,7 +244,7 @@ cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@ # --------------------------------------------------------------------------- quiet_cmd_gzip = GZIP $@ - cmd_gzip = cat $(real-prereqs) | gzip -n -f -9 > $@ + cmd_gzip = cat $(real-prereqs) | $(_GZIP) -n -f -9 > $@ # DTC # --------------------------------------------------------------------------- @@ -337,19 +337,19 @@ printf "%08x\n" $$dec_size | \ ) quiet_cmd_bzip2 = BZIP2 $@ - cmd_bzip2 = { cat $(real-prereqs) | bzip2 -9; $(size_append); } > $@ + cmd_bzip2 = { cat $(real-prereqs) | $(_BZIP2) -9; $(size_append); } > $@ # Lzma # --------------------------------------------------------------------------- quiet_cmd_lzma = LZMA $@ - cmd_lzma = { cat $(real-prereqs) | lzma -9; $(size_append); } > $@ + cmd_lzma = { cat $(real-prereqs) | $(LZMA) -9; $(size_append); } > $@ quiet_cmd_lzo = LZO $@ - cmd_lzo = { cat $(real-prereqs) | lzop -9; $(size_append); } > $@ + cmd_lzo = { cat $(real-prereqs) | $(_LZOP) -9; $(size_append); } > $@ quiet_cmd_lz4 = LZ4 $@ - cmd_lz4 = { cat $(real-prereqs) | lz4c -l -c1 stdin stdout; \ + cmd_lz4 = { cat $(real-prereqs) | $(LZ4) -l -c1 stdin stdout; \ $(size_append); } > $@ # U-Boot mkimage @@ -396,7 +396,7 @@ quiet_cmd_xzkern = XZKERN $@ $(size_append); } > $@ quiet_cmd_xzmisc = XZMISC $@ - cmd_xzmisc = cat $(real-prereqs) | xz --check=crc32 --lzma2=dict=1MiB > $@ + cmd_xzmisc = cat $(real-prereqs) | $(XZ) --check=crc32 --lzma2=dict=1MiB > $@ # ASM offsets # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.package b/scripts/Makefile.package index 02135d2671a6..b2b6153af63a 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -45,7 +45,7 @@ if test "$(objtree)" != "$(srctree)"; then \ false; \ fi ; \ $(srctree)/scripts/setlocalversion --save-scmversion; \ -tar -cz $(RCS_TAR_IGNORE) -f $(2).tar.gz \ +tar -I $(_GZIP) -c 
$(RCS_TAR_IGNORE) -f $(2).tar.gz \ --transform 's:^:$(2)/:S' $(TAR_CONTENT) $(3); \ rm -f $(objtree)/.scmversion @@ -127,9 +127,9 @@ util/PERF-VERSION-GEN $(CURDIR)/$(perf-tar)/); \ tar rf $(perf-tar).tar $(perf-tar)/HEAD $(perf-tar)/PERF-VERSION-FILE; \ rm -r $(perf-tar); \ $(if $(findstring tar-src,$@),, \ -$(if $(findstring bz2,$@),bzip2, \ -$(if $(findstring gz,$@),gzip, \ -$(if $(findstring xz,$@),xz, \ +$(if $(findstring bz2,$@),$(_BZIP2), \ +$(if $(findstring gz,$@),$(_GZIP), \ +$(if $(findstring xz,$@),$(XZ), \ $(error unknown target $@)))) \ -f -9 $(perf-tar).tar) diff --git a/scripts/package/buildtar b/scripts/package/buildtar index 77c7caefede1..ad62c6879622 100755 --- a/scripts/package/buildtar +++ b/scripts/package/buildtar @@ -28,15 +28,15 @@ case "${1}" in opts= ;; targz-pkg) - opts=--gzip + opts="-I ${_GZIP}" tarball=${tarball}.gz ;; tarbz2-pkg) - opts=--bzip2 + opts="-I ${_BZIP2}" tarball=${tarball}.bz2 ;; tarxz-pkg) - opts=--xz + opts="-I ${XZ}" tarball=${tarball}.xz ;; *) diff --git a/scripts/xz_wrap.sh b/scripts/xz_wrap.sh index 7a2d372f4885..76e9cbcfbeab 100755 --- a/scripts/xz_wrap.sh +++ b/scripts/xz_wrap.sh @@ -20,4 +20,4 @@ case $SRCARCH in sparc) BCJ=--sparc ;; esac -exec xz --check=crc32 $BCJ --lzma2=$LZMA2OPTS,dict=32MiB +exec $XZ --check=crc32 $BCJ --lzma2=$LZMA2OPTS,dict=32MiB From 9fa88c5d3f5eae3e68ef20d226c3f13e21490668 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Sat, 6 Jun 2020 18:57:41 +0200 Subject: [PATCH 427/427] hpfs: fix warning due to superfluous semicolon Fixes coccicheck warning: fs/hpfs/buffer.c:56:2-3: Unneeded semicolon Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index e285d6b3bba4..d39246865c51 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -53,7 +53,7 @@ void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n) 
return; } brelse(bh); - }; + } blk_start_plug(&plug); while (n > 0) {