diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 8408670d0328..f93333524a44 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -1495,7 +1495,7 @@ Extended error information # { stacktrace: - _do_fork+0x18e/0x330 + kernel_clone+0x18e/0x330 kernel_thread+0x29/0x30 kthreadd+0x154/0x1b0 ret_from_fork+0x3f/0x70 @@ -1588,7 +1588,7 @@ Extended error information SYSC_sendto+0xef/0x170 } hitcount: 88 { stacktrace: - _do_fork+0x18e/0x330 + kernel_clone+0x18e/0x330 SyS_clone+0x19/0x20 entry_SYSCALL_64_fastpath+0x12/0x6a } hitcount: 244 diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 83ce3caf7313..aea0a40b77a9 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -172,5 +172,5 @@ asmlinkage int sys_clone(unsigned long __user *args) kargs.exit_signal = (lower_32_bits(clone_flags) & CSIGNAL); kargs.stack = newsp; - return _do_fork(&kargs); + return kernel_clone(&kargs); } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index e74e10f19fff..f25f2f723196 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -271,7 +271,7 @@ ia64_load_extra (struct task_struct *task) * * * sys_clone : - * _do_fork _do_fork + * kernel_clone kernel_clone * copy_thread copy_thread * * This means that the stack layout is as follows: @@ -411,7 +411,7 @@ asmlinkage long ia64_clone(unsigned long clone_flags, unsigned long stack_start, .tls = tls, }; - return _do_fork(&args); + return kernel_clone(&args); } static void diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 6492a2c54dbc..08359a6e058f 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -107,10 +107,10 @@ void flush_thread(void) * on top of pt_regs, which means that sys_clone() arguments would be * buried. We could, of course, copy them, but it's too costly for no * good reason - generic clone() would have to copy them *again* for - * _do_fork() anyway. So in this case it's actually better to pass pt_regs * - * and extract arguments for _do_fork() from there. Eventually we might - * go for calling _do_fork() directly from the wrapper, but only after we - * are finished with _do_fork() prototype conversion. + * kernel_clone() anyway. So in this case it's actually better to pass pt_regs * + * and extract arguments for kernel_clone() from there. Eventually we might + * go for calling kernel_clone() directly from the wrapper, but only after we + * are finished with kernel_clone() prototype conversion. */ asmlinkage int m68k_clone(struct pt_regs *regs) { @@ -125,7 +125,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) .tls = regs->d5, }; - return _do_fork(&args); + return kernel_clone(&args); } /* diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 88a4ec03edab..4ffe857e6ada 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -266,5 +266,5 @@ asmlinkage int nios2_clone(unsigned long clone_flags, unsigned long newsp, .tls = tls, }; - return _do_fork(&args); + return kernel_clone(&args); } diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c index 5234b5ccc0b9..0442ab00518d 100644 --- a/arch/sparc/kernel/process.c +++ b/arch/sparc/kernel/process.c @@ -25,7 +25,7 @@ asmlinkage long sparc_fork(struct pt_regs *regs) .stack = regs->u_regs[UREG_FP], }; - ret = _do_fork(&args); + ret = kernel_clone(&args); /* If we get an error and potentially restart the system * call, we're screwed because copy_thread() clobbered @@ -50,7 +50,7 @@ asmlinkage long sparc_vfork(struct pt_regs *regs) .stack = regs->u_regs[UREG_FP], }; - ret = _do_fork(&args); + ret = kernel_clone(&args); /* If we get an error and potentially restart the system * call, we're screwed because copy_thread() clobbered @@ -96,7 +96,7 @@ asmlinkage long sparc_clone(struct pt_regs *regs) else args.stack = regs->u_regs[UREG_FP]; - ret = _do_fork(&args); + ret = kernel_clone(&args); /* If we get an error and potentially restart the system * call, we're screwed because copy_thread() clobbered diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c index 720cde885042..6cf65397d225 100644 --- a/arch/x86/kernel/sys_ia32.c +++ b/arch/x86/kernel/sys_ia32.c @@ -251,6 +251,6 @@ COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags, .tls = tls_val, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif /* CONFIG_IA32_EMULATION */ diff --git a/drivers/char/ipmi/ipmi_bt_sm.c b/drivers/char/ipmi/ipmi_bt_sm.c index f3f216cdf686..f41f78972b9c 100644 --- a/drivers/char/ipmi/ipmi_bt_sm.c +++ b/drivers/char/ipmi/ipmi_bt_sm.c @@ -213,8 +213,10 @@ static int bt_start_transaction(struct si_sm_data *bt, if (bt->state == BT_STATE_LONG_BUSY) return IPMI_NODE_BUSY_ERR; - if (bt->state != BT_STATE_IDLE) + if (bt->state != BT_STATE_IDLE) { + dev_warn(bt->io->dev, "BT in invalid state %d\n", bt->state); return IPMI_NOT_IN_MY_STATE_ERR; + } if (bt_debug & BT_DEBUG_MSG) { dev_dbg(bt->io->dev, "+++++++++++++++++ New command\n"); diff --git a/drivers/char/ipmi/ipmi_kcs_sm.c b/drivers/char/ipmi/ipmi_kcs_sm.c index 2e7cda08b079..efda90dcf5b3 100644 --- a/drivers/char/ipmi/ipmi_kcs_sm.c +++ b/drivers/char/ipmi/ipmi_kcs_sm.c @@ -17,6 +17,8 @@ * that document. */ +#define DEBUG /* So dev_dbg() is always available. */ + #include /* For printk. */ #include #include @@ -187,8 +189,8 @@ static inline void start_error_recovery(struct si_sm_data *kcs, char *reason) (kcs->error_retries)++; if (kcs->error_retries > MAX_ERROR_RETRIES) { if (kcs_debug & KCS_DEBUG_ENABLE) - printk(KERN_DEBUG "ipmi_kcs_sm: kcs hosed: %s\n", - reason); + dev_dbg(kcs->io->dev, "ipmi_kcs_sm: kcs hosed: %s\n", + reason); kcs->state = KCS_HOSED; } else { kcs->error0_timeout = jiffies + ERROR0_OBF_WAIT_JIFFIES; @@ -268,11 +270,13 @@ static int start_kcs_transaction(struct si_sm_data *kcs, unsigned char *data, if (size > MAX_KCS_WRITE_SIZE) return IPMI_REQ_LEN_EXCEEDED_ERR; - if ((kcs->state != KCS_IDLE) && (kcs->state != KCS_HOSED)) + if ((kcs->state != KCS_IDLE) && (kcs->state != KCS_HOSED)) { + dev_warn(kcs->io->dev, "KCS in invalid state %d\n", kcs->state); return IPMI_NOT_IN_MY_STATE_ERR; + } if (kcs_debug & KCS_DEBUG_MSG) { - printk(KERN_DEBUG "start_kcs_transaction -"); + dev_dbg(kcs->io->dev, "%s -", __func__); for (i = 0; i < size; i++) pr_cont(" %02x", data[i]); pr_cont("\n"); @@ -331,7 +335,8 @@ static enum si_sm_result kcs_event(struct si_sm_data *kcs, long time) status = read_status(kcs); if (kcs_debug & KCS_DEBUG_STATES) - printk(KERN_DEBUG "KCS: State = %d, %x\n", kcs->state, status); + dev_dbg(kcs->io->dev, + "KCS: State = %d, %x\n", kcs->state, status); /* All states wait for ibf, so just do it here. */ if (!check_ibf(kcs, status, time)) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 737c0b6b24ea..8774a3b8ff95 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -34,12 +34,13 @@ #include #include #include +#include #define IPMI_DRIVER_VERSION "39.2" static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void); static int ipmi_init_msghandler(void); -static void smi_recv_tasklet(unsigned long); +static void smi_recv_tasklet(struct tasklet_struct *t); static void handle_new_recv_msgs(struct ipmi_smi *intf); static void need_waiter(struct ipmi_smi *intf); static int handle_one_recv_msg(struct ipmi_smi *intf, @@ -60,6 +61,7 @@ enum ipmi_panic_event_op { #else #define IPMI_PANIC_DEFAULT IPMI_SEND_PANIC_EVENT_NONE #endif + static enum ipmi_panic_event_op ipmi_send_panic_event = IPMI_PANIC_DEFAULT; static int panic_op_write_handler(const char *val, @@ -89,19 +91,19 @@ static int panic_op_read_handler(char *buffer, const struct kernel_param *kp) { switch (ipmi_send_panic_event) { case IPMI_SEND_PANIC_EVENT_NONE: - strcpy(buffer, "none"); + strcpy(buffer, "none\n"); break; case IPMI_SEND_PANIC_EVENT: - strcpy(buffer, "event"); + strcpy(buffer, "event\n"); break; case IPMI_SEND_PANIC_EVENT_STRING: - strcpy(buffer, "string"); + strcpy(buffer, "string\n"); break; default: - strcpy(buffer, "???"); + strcpy(buffer, "???\n"); break; } @@ -317,6 +319,7 @@ struct bmc_device { int dyn_guid_set; struct kref usecount; struct work_struct remove_work; + unsigned char cc; /* completion code */ }; #define to_bmc_device(x) container_of((x), struct bmc_device, pdev.dev) @@ -2381,6 +2384,8 @@ static void bmc_device_id_handler(struct ipmi_smi *intf, msg->msg.data, msg->msg.data_len, &intf->bmc->fetch_id); if (rv) { dev_warn(intf->si_dev, "device id demangle failed: %d\n", rv); + /* record completion code when error */ + intf->bmc->cc = msg->msg.data[0]; intf->bmc->dyn_id_set = 0; } else { /* @@ -2426,23 +2431,39 @@ send_get_device_id_cmd(struct ipmi_smi *intf) static int __get_device_id(struct ipmi_smi *intf, struct bmc_device *bmc) { int rv; - - bmc->dyn_id_set = 2; + unsigned int retry_count = 0; intf->null_user_handler = bmc_device_id_handler; +retry: + bmc->cc = 0; + bmc->dyn_id_set = 2; + rv = send_get_device_id_cmd(intf); if (rv) - return rv; + goto out_reset_handler; wait_event(intf->waitq, bmc->dyn_id_set != 2); - if (!bmc->dyn_id_set) + if (!bmc->dyn_id_set) { + if ((bmc->cc == IPMI_DEVICE_IN_FW_UPDATE_ERR + || bmc->cc == IPMI_DEVICE_IN_INIT_ERR + || bmc->cc == IPMI_NOT_IN_MY_STATE_ERR) + && ++retry_count <= GET_DEVICE_ID_MAX_RETRY) { + msleep(500); + dev_warn(intf->si_dev, + "BMC returned 0x%2.2x, retry get bmc device id\n", + bmc->cc); + goto retry; + } + rv = -EIO; /* Something went wrong in the fetch. */ + } /* dyn_id_set makes the id data available. */ smp_rmb(); +out_reset_handler: intf->null_user_handler = NULL; return rv; @@ -3245,7 +3266,6 @@ channel_handler(struct ipmi_smi *intf, struct ipmi_recv_msg *msg) /* It's the one we want */ if (msg->msg.data[0] != 0) { /* Got an error from the channel, just go on. */ - if (msg->msg.data[0] == IPMI_INVALID_COMMAND_ERR) { /* * If the MC does not support this @@ -3329,6 +3349,7 @@ static int __scan_channels(struct ipmi_smi *intf, struct ipmi_device_id *id) dev_warn(intf->si_dev, "Error sending channel information for channel 0, %d\n", rv); + intf->null_user_handler = NULL; return -EIO; } @@ -3430,9 +3451,8 @@ int ipmi_add_smi(struct module *owner, intf->curr_seq = 0; spin_lock_init(&intf->waiting_rcv_msgs_lock); INIT_LIST_HEAD(&intf->waiting_rcv_msgs); - tasklet_init(&intf->recv_tasklet, - smi_recv_tasklet, - (unsigned long) intf); + tasklet_setup(&intf->recv_tasklet, + smi_recv_tasklet); atomic_set(&intf->watchdog_pretimeouts_to_deliver, 0); spin_lock_init(&intf->xmit_msgs_lock); INIT_LIST_HEAD(&intf->xmit_msgs); @@ -4467,10 +4487,10 @@ static void handle_new_recv_msgs(struct ipmi_smi *intf) } } -static void smi_recv_tasklet(unsigned long val) +static void smi_recv_tasklet(struct tasklet_struct *t) { unsigned long flags = 0; /* keep us warning-free. */ - struct ipmi_smi *intf = (struct ipmi_smi *) val; + struct ipmi_smi *intf = from_tasklet(intf, t, recv_tasklet); int run_to_completion = intf->run_to_completion; struct ipmi_smi_msg *newmsg = NULL; @@ -4542,7 +4562,7 @@ void ipmi_smi_msg_received(struct ipmi_smi *intf, spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags); if (run_to_completion) - smi_recv_tasklet((unsigned long) intf); + smi_recv_tasklet(&intf->recv_tasklet); else tasklet_schedule(&intf->recv_tasklet); } diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 77b8d551ae7f..5eac94cf4ff8 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -1316,6 +1316,7 @@ static int try_get_dev_id(struct smi_info *smi_info) unsigned char *resp; unsigned long resp_len; int rv = 0; + unsigned int retry_count = 0; resp = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL); if (!resp) @@ -1327,6 +1328,8 @@ static int try_get_dev_id(struct smi_info *smi_info) */ msg[0] = IPMI_NETFN_APP_REQUEST << 2; msg[1] = IPMI_GET_DEVICE_ID_CMD; + +retry: smi_info->handlers->start_transaction(smi_info->si_sm, msg, 2); rv = wait_for_msg_done(smi_info); @@ -1339,6 +1342,20 @@ static int try_get_dev_id(struct smi_info *smi_info) /* Check and record info from the get device id, in case we need it. */ rv = ipmi_demangle_device_id(resp[0] >> 2, resp[1], resp + 2, resp_len - 2, &smi_info->device_id); + if (rv) { + /* record completion code */ + unsigned char cc = *(resp + 2); + + if ((cc == IPMI_DEVICE_IN_FW_UPDATE_ERR + || cc == IPMI_DEVICE_IN_INIT_ERR + || cc == IPMI_NOT_IN_MY_STATE_ERR) + && ++retry_count <= GET_DEVICE_ID_MAX_RETRY) { + dev_warn(smi_info->io.dev, + "BMC returned 0x%2.2x, retry get bmc device id\n", + cc); + goto retry; + } + } out: kfree(resp); @@ -1963,7 +1980,7 @@ static int try_smi_init(struct smi_info *new_smi) /* Do this early so it's available for logs. */ if (!new_smi->io.dev) { pr_err("IPMI interface added with no device\n"); - rv = EIO; + rv = -EIO; goto out_err; } diff --git a/drivers/char/ipmi/ipmi_smic_sm.c b/drivers/char/ipmi/ipmi_smic_sm.c index b6225bba2532..bfea500d6f5f 100644 --- a/drivers/char/ipmi/ipmi_smic_sm.c +++ b/drivers/char/ipmi/ipmi_smic_sm.c @@ -21,6 +21,8 @@ * 2001 Hewlett-Packard Company */ +#define DEBUG /* So dev_dbg() is always available. */ + #include /* For printk. */ #include #include @@ -126,11 +128,14 @@ static int start_smic_transaction(struct si_sm_data *smic, if (size > MAX_SMIC_WRITE_SIZE) return IPMI_REQ_LEN_EXCEEDED_ERR; - if ((smic->state != SMIC_IDLE) && (smic->state != SMIC_HOSED)) + if ((smic->state != SMIC_IDLE) && (smic->state != SMIC_HOSED)) { + dev_warn(smic->io->dev, + "SMIC in invalid state %d\n", smic->state); return IPMI_NOT_IN_MY_STATE_ERR; + } if (smic_debug & SMIC_DEBUG_MSG) { - printk(KERN_DEBUG "start_smic_transaction -"); + dev_dbg(smic->io->dev, "%s -", __func__); for (i = 0; i < size; i++) pr_cont(" %02x", data[i]); pr_cont("\n"); @@ -152,7 +157,7 @@ static int smic_get_result(struct si_sm_data *smic, int i; if (smic_debug & SMIC_DEBUG_MSG) { - printk(KERN_DEBUG "smic_get result -"); + dev_dbg(smic->io->dev, "smic_get result -"); for (i = 0; i < smic->read_pos; i++) pr_cont(" %02x", smic->read_data[i]); pr_cont("\n"); @@ -324,9 +329,9 @@ static enum si_sm_result smic_event(struct si_sm_data *smic, long time) } if (smic->state != SMIC_IDLE) { if (smic_debug & SMIC_DEBUG_STATES) - printk(KERN_DEBUG - "smic_event - smic->smic_timeout = %ld, time = %ld\n", - smic->smic_timeout, time); + dev_dbg(smic->io->dev, + "%s - smic->smic_timeout = %ld, time = %ld\n", + __func__, smic->smic_timeout, time); /* * FIXME: smic_event is sometimes called with time > * SMIC_RETRY_TIMEOUT @@ -345,8 +350,9 @@ static enum si_sm_result smic_event(struct si_sm_data *smic, long time) status = read_smic_status(smic); if (smic_debug & SMIC_DEBUG_STATES) - printk(KERN_DEBUG "smic_event - state = %d, flags = 0x%02x, status = 0x%02x\n", - smic->state, flags, status); + dev_dbg(smic->io->dev, + "%s - state = %d, flags = 0x%02x, status = 0x%02x\n", + __func__, smic->state, flags, status); switch (smic->state) { case SMIC_IDLE: @@ -436,8 +442,9 @@ static enum si_sm_result smic_event(struct si_sm_data *smic, long time) data = read_smic_data(smic); if (data != 0) { if (smic_debug & SMIC_DEBUG_ENABLE) - printk(KERN_DEBUG "SMIC_WRITE_END: data = %02x\n", - data); + dev_dbg(smic->io->dev, + "SMIC_WRITE_END: data = %02x\n", + data); start_error_recovery(smic, "state = SMIC_WRITE_END, " "data != SUCCESS"); @@ -516,8 +523,9 @@ static enum si_sm_result smic_event(struct si_sm_data *smic, long time) /* data register holds an error code */ if (data != 0) { if (smic_debug & SMIC_DEBUG_ENABLE) - printk(KERN_DEBUG "SMIC_READ_END: data = %02x\n", - data); + dev_dbg(smic->io->dev, + "SMIC_READ_END: data = %02x\n", + data); start_error_recovery(smic, "state = SMIC_READ_END, " "data != SUCCESS"); @@ -533,7 +541,8 @@ static enum si_sm_result smic_event(struct si_sm_data *smic, long time) default: if (smic_debug & SMIC_DEBUG_ENABLE) { - printk(KERN_DEBUG "smic->state = %d\n", smic->state); + dev_dbg(smic->io->dev, + "smic->state = %d\n", smic->state); start_error_recovery(smic, "state = UNKNOWN"); return SI_SM_CALL_WITH_DELAY; } diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c index d5d2af4d10e6..945701bce553 100644 --- a/drivers/misc/kgdbts.c +++ b/drivers/misc/kgdbts.c @@ -33,16 +33,16 @@ * You can also specify optional tests: * N## = Go to sleep with interrupts of for ## seconds * to test the HW NMI watchdog - * F## = Break at do_fork for ## iterations + * F## = Break at kernel_clone for ## iterations * S## = Break at sys_open for ## iterations * I## = Run the single step test ## iterations * - * NOTE: that the do_fork and sys_open tests are mutually exclusive. + * NOTE: that the kernel_clone and sys_open tests are mutually exclusive. * * To invoke the kgdb test suite from boot you use a kernel start * argument as follows: * kgdbts=V1 kgdbwait - * Or if you wanted to perform the NMI test for 6 seconds and do_fork + * Or if you wanted to perform the NMI test for 6 seconds and kernel_clone * test for 100 forks, you could use: * kgdbts=V1N6F100 kgdbwait * @@ -74,7 +74,7 @@ * echo kgdbts=V1S10000 > /sys/module/kgdbts/parameters/kgdbts * fg # and hit control-c * fg # and hit control-c - * ## This tests break points on do_fork + * ## This tests break points on kernel_clone * while [ 1 ] ; do date > /dev/null ; done & * while [ 1 ] ; do date > /dev/null ; done & * echo kgdbts=V1F1000 > /sys/module/kgdbts/parameters/kgdbts @@ -209,8 +209,8 @@ static unsigned long lookup_addr(char *arg) addr = (unsigned long)kgdbts_break_test; else if (!strcmp(arg, "sys_open")) addr = (unsigned long)do_sys_open; - else if (!strcmp(arg, "do_fork")) - addr = (unsigned long)_do_fork; + else if (!strcmp(arg, "kernel_clone")) + addr = (unsigned long)kernel_clone; else if (!strcmp(arg, "hw_break_val")) addr = (unsigned long)&hw_break_val; addr = (unsigned long) dereference_function_descriptor((void *)addr); @@ -310,7 +310,7 @@ static int check_and_rewind_pc(char *put_str, char *arg) if (arch_needs_sstep_emulation && sstep_addr && ip + offset == sstep_addr && - ((!strcmp(arg, "sys_open") || !strcmp(arg, "do_fork")))) { + ((!strcmp(arg, "sys_open") || !strcmp(arg, "kernel_clone")))) { /* This is special case for emulated single step */ v2printk("Emul: rewind hit single step bp\n"); restart_from_top_after_write = 1; @@ -596,19 +596,19 @@ static struct test_struct singlestep_break_test[] = { }; /* - * Test for hitting a breakpoint at do_fork for what ever the number + * Test for hitting a breakpoint at kernel_clone for what ever the number * of iterations required by the variable repeat_test. */ -static struct test_struct do_fork_test[] = { +static struct test_struct do_kernel_clone_test[] = { { "?", "S0*" }, /* Clear break points */ - { "do_fork", "OK", sw_break, }, /* set sw breakpoint */ + { "kernel_clone", "OK", sw_break, }, /* set sw breakpoint */ { "c", "T0*", NULL, get_thread_id_continue }, /* Continue */ - { "do_fork", "OK", sw_rem_break }, /*remove breakpoint */ - { "g", "do_fork", NULL, check_and_rewind_pc }, /* check location */ + { "kernel_clone", "OK", sw_rem_break }, /*remove breakpoint */ + { "g", "kernel_clone", NULL, check_and_rewind_pc }, /* check location */ { "write", "OK", write_regs, emul_reset }, /* Write registers */ { "s", "T0*", emul_sstep_get, emul_sstep_put }, /* Single step */ - { "g", "do_fork", NULL, check_single_step }, - { "do_fork", "OK", sw_break, }, /* set sw breakpoint */ + { "g", "kernel_clone", NULL, check_single_step }, + { "kernel_clone", "OK", sw_break, }, /* set sw breakpoint */ { "7", "T0*", skip_back_repeat_test }, /* Loop based on repeat_test */ { "D", "OK", NULL, final_ack_set }, /* detach and unregister I/O */ { "", "", get_cont_catch, put_cont_catch }, @@ -935,11 +935,11 @@ static void run_bad_read_test(void) kgdb_breakpoint(); } -static void run_do_fork_test(void) +static void run_kernel_clone_test(void) { init_simple_test(); - ts.tst = do_fork_test; - ts.name = "do_fork_test"; + ts.tst = do_kernel_clone_test; + ts.name = "do_kernel_clone_test"; /* Activate test with initial breakpoint */ kgdb_breakpoint(); } @@ -967,7 +967,7 @@ static void run_singlestep_break_test(void) static void kgdbts_run_tests(void) { char *ptr; - int fork_test = 0; + int clone_test = 0; int do_sys_open_test = 0; int sstep_test = 1000; int nmi_sleep = 0; @@ -981,7 +981,7 @@ static void kgdbts_run_tests(void) ptr = strchr(config, 'F'); if (ptr) - fork_test = simple_strtol(ptr + 1, NULL, 10); + clone_test = simple_strtol(ptr + 1, NULL, 10); ptr = strchr(config, 'S'); if (ptr) do_sys_open_test = simple_strtol(ptr + 1, NULL, 10); @@ -1025,16 +1025,16 @@ static void kgdbts_run_tests(void) run_nmi_sleep_test(nmi_sleep); } - /* If the do_fork test is run it will be the last test that is + /* If the kernel_clone test is run it will be the last test that is * executed because a kernel thread will be spawned at the very * end to unregister the debug hooks. */ - if (fork_test) { - repeat_test = fork_test; - printk(KERN_INFO "kgdbts:RUN do_fork for %i breakpoints\n", + if (clone_test) { + repeat_test = clone_test; + printk(KERN_INFO "kgdbts:RUN kernel_clone for %i breakpoints\n", repeat_test); kthread_run(kgdbts_unreg_thread, NULL, "kgdbts_unreg"); - run_do_fork_test(); + run_kernel_clone_test(); return; } diff --git a/fs/d_path.c b/fs/d_path.c index 0f1fc1743302..a69e2cd36e6e 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -102,6 +102,8 @@ restart: if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { struct mount *parent = READ_ONCE(mnt->mnt_parent); + struct mnt_namespace *mnt_ns; + /* Escaped? */ if (dentry != vfsmnt->mnt_root) { bptr = *buffer; @@ -116,7 +118,9 @@ restart: vfsmnt = &mnt->mnt; continue; } - if (is_mounted(vfsmnt) && !is_anon_ns(mnt->mnt_ns)) + mnt_ns = READ_ONCE(mnt->mnt_ns); + /* open-coded is_mounted() to use local mnt_ns */ + if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) error = 1; // absolute root else error = 2; // detached or not attached yet diff --git a/fs/dax.c b/fs/dax.c index 994ab66a9907..6ad346352a8c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1037,18 +1037,18 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, return ret; } -int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, - struct iomap *iomap) +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) { sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); pgoff_t pgoff; long rc, id; void *kaddr; bool page_aligned = false; - + unsigned offset = offset_in_page(pos); + unsigned size = min_t(u64, PAGE_SIZE - offset, length); if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && - IS_ALIGNED(size, PAGE_SIZE)) + (size == PAGE_SIZE)) page_aligned = true; rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); @@ -1058,8 +1058,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, id = dax_read_lock(); if (page_aligned) - rc = dax_zero_page_range(iomap->dax_dev, pgoff, - size >> PAGE_SHIFT); + rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); if (rc < 0) { @@ -1072,7 +1071,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, dax_flush(iomap->dax_dev, kaddr + offset, size); } dax_read_unlock(id); - return 0; + return size; } static loff_t diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index bcfc288dba3f..8180061b9e16 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -22,18 +22,25 @@ #include "../internal.h" /* - * Structure allocated for each page when block size < PAGE_SIZE to track - * sub-page uptodate status and I/O completions. + * Structure allocated for each page or THP when block size < page size + * to track sub-page uptodate status and I/O completions. */ struct iomap_page { - atomic_t read_count; - atomic_t write_count; + atomic_t read_bytes_pending; + atomic_t write_bytes_pending; spinlock_t uptodate_lock; - DECLARE_BITMAP(uptodate, PAGE_SIZE / 512); + unsigned long uptodate[]; }; static inline struct iomap_page *to_iomap_page(struct page *page) { + /* + * per-block data is stored in the head page. Callers should + * not be dealing with tail pages (and if they are, they can + * call thp_head() first. + */ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + if (page_has_private(page)) return (struct iomap_page *)page_private(page); return NULL; @@ -45,20 +52,16 @@ static struct iomap_page * iomap_page_create(struct inode *inode, struct page *page) { struct iomap_page *iop = to_iomap_page(page); + unsigned int nr_blocks = i_blocks_per_page(inode, page); - if (iop || i_blocksize(inode) == PAGE_SIZE) + if (iop || nr_blocks <= 1) return iop; - iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); - atomic_set(&iop->read_count, 0); - atomic_set(&iop->write_count, 0); + iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), + GFP_NOFS | __GFP_NOFAIL); spin_lock_init(&iop->uptodate_lock); - bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); - - /* - * migrate_page_move_mapping() assumes that pages with private data have - * their count elevated by 1. - */ + if (PageUptodate(page)) + bitmap_fill(iop->uptodate, nr_blocks); attach_page_private(page, iop); return iop; } @@ -67,11 +70,14 @@ static void iomap_page_release(struct page *page) { struct iomap_page *iop = detach_page_private(page); + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); if (!iop) return; - WARN_ON_ONCE(atomic_read(&iop->read_count)); - WARN_ON_ONCE(atomic_read(&iop->write_count)); + WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending)); + WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending)); + WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != + PageUptodate(page)); kfree(iop); } @@ -142,19 +148,11 @@ iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len) struct inode *inode = page->mapping->host; unsigned first = off >> inode->i_blkbits; unsigned last = (off + len - 1) >> inode->i_blkbits; - bool uptodate = true; unsigned long flags; - unsigned int i; spin_lock_irqsave(&iop->uptodate_lock, flags); - for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) { - if (i >= first && i <= last) - set_bit(i, iop->uptodate); - else if (!test_bit(i, iop->uptodate)) - uptodate = false; - } - - if (uptodate) + bitmap_set(iop->uptodate, first, last - first + 1); + if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page))) SetPageUptodate(page); spin_unlock_irqrestore(&iop->uptodate_lock, flags); } @@ -171,13 +169,6 @@ iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) SetPageUptodate(page); } -static void -iomap_read_finish(struct iomap_page *iop, struct page *page) -{ - if (!iop || atomic_dec_and_test(&iop->read_count)) - unlock_page(page); -} - static void iomap_read_page_end_io(struct bio_vec *bvec, int error) { @@ -191,7 +182,8 @@ iomap_read_page_end_io(struct bio_vec *bvec, int error) iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len); } - iomap_read_finish(iop, page); + if (!iop || atomic_sub_and_test(bvec->bv_len, &iop->read_bytes_pending)) + unlock_page(page); } static void @@ -271,30 +263,19 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } ctx->cur_page_in_bio = true; + if (iop) + atomic_add(plen, &iop->read_bytes_pending); - /* - * Try to merge into a previous segment if we can. - */ + /* Try to merge into a previous segment if we can */ sector = iomap_sector(iomap, pos); - if (ctx->bio && bio_end_sector(ctx->bio) == sector) + if (ctx->bio && bio_end_sector(ctx->bio) == sector) { + if (__bio_try_merge_page(ctx->bio, page, plen, poff, + &same_page)) + goto done; is_contig = true; - - if (is_contig && - __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) { - if (!same_page && iop) - atomic_inc(&iop->read_count); - goto done; } - /* - * If we start a new segment we need to increase the read count, and we - * need to do so before submitting any previous full bio to make sure - * that we don't prematurely unlock the page. - */ - if (iop) - atomic_inc(&iop->read_count); - - if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { + if (!is_contig || bio_full(ctx->bio, plen)) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -571,13 +552,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, { struct iomap_page *iop = iomap_page_create(inode, page); loff_t block_size = i_blocksize(inode); - loff_t block_start = pos & ~(block_size - 1); - loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1); + loff_t block_start = round_down(pos, block_size); + loff_t block_end = round_up(pos + len, block_size); unsigned from = offset_in_page(pos), to = from + len, poff, plen; - int status; if (PageUptodate(page)) return 0; + ClearPageError(page); do { iomap_adjust_read_range(inode, iop, &block_start, @@ -594,14 +575,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE)) return -EIO; zero_user_segments(page, poff, from, to, poff + plen); - iomap_set_range_uptodate(page, poff, plen); - continue; + } else { + int status = iomap_read_page_sync(block_start, page, + poff, plen, srcmap); + if (status) + return status; } - - status = iomap_read_page_sync(block_start, page, poff, plen, - srcmap); - if (status) - return status; + iomap_set_range_uptodate(page, poff, plen); } while ((block_start += plen) < block_end); return 0; @@ -685,9 +665,8 @@ iomap_set_page_dirty(struct page *page) } EXPORT_SYMBOL_GPL(iomap_set_page_dirty); -static int -__iomap_write_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page) +static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, + size_t copied, struct page *page) { flush_dcache_page(page); @@ -709,15 +688,15 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len, return copied; } -static int -iomap_write_end_inline(struct inode *inode, struct page *page, - struct iomap *iomap, loff_t pos, unsigned copied) +static size_t iomap_write_end_inline(struct inode *inode, struct page *page, + struct iomap *iomap, loff_t pos, size_t copied) { void *addr; WARN_ON_ONCE(!PageUptodate(page)); BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); + flush_dcache_page(page); addr = kmap_atomic(page); memcpy(iomap->inline_data + pos, addr + pos, copied); kunmap_atomic(addr); @@ -726,13 +705,14 @@ iomap_write_end_inline(struct inode *inode, struct page *page, return copied; } -static int -iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, - struct page *page, struct iomap *iomap, struct iomap *srcmap) +/* Returns the number of bytes copied. May be 0. Cannot be an errno. */ +static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, + size_t copied, struct page *page, struct iomap *iomap, + struct iomap *srcmap) { const struct iomap_page_ops *page_ops = iomap->page_ops; loff_t old_size = inode->i_size; - int ret; + size_t ret; if (srcmap->type == IOMAP_INLINE) { ret = iomap_write_end_inline(inode, page, iomap, pos, copied); @@ -811,13 +791,8 @@ again: copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - flush_dcache_page(page); - - status = iomap_write_end(inode, pos, bytes, copied, page, iomap, + copied = iomap_write_end(inode, pos, bytes, copied, page, iomap, srcmap); - if (unlikely(status < 0)) - break; - copied = status; cond_resched(); @@ -891,11 +866,8 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, status = iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); - if (unlikely(status <= 0)) { - if (WARN_ON_ONCE(status == 0)) - return -EIO; - return status; - } + if (WARN_ON_ONCE(status == 0)) + return -EIO; cond_resched(); @@ -928,11 +900,13 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, } EXPORT_SYMBOL_GPL(iomap_file_unshare); -static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, - unsigned bytes, struct iomap *iomap, struct iomap *srcmap) +static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length, + struct iomap *iomap, struct iomap *srcmap) { struct page *page; int status; + unsigned offset = offset_in_page(pos); + unsigned bytes = min_t(u64, PAGE_SIZE - offset, length); status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap); if (status) @@ -944,38 +918,33 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); } -static loff_t -iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, + loff_t length, void *data, struct iomap *iomap, + struct iomap *srcmap) { bool *did_zero = data; loff_t written = 0; - int status; /* already zeroed? we're done. */ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return count; + return length; do { - unsigned offset, bytes; - - offset = offset_in_page(pos); - bytes = min_t(loff_t, PAGE_SIZE - offset, count); + s64 bytes; if (IS_DAX(inode)) - status = dax_iomap_zero(pos, offset, bytes, iomap); + bytes = dax_iomap_zero(pos, length, iomap); else - status = iomap_zero(inode, pos, offset, bytes, iomap, - srcmap); - if (status < 0) - return status; + bytes = iomap_zero(inode, pos, length, iomap, srcmap); + if (bytes < 0) + return bytes; pos += bytes; - count -= bytes; + length -= bytes; written += bytes; if (did_zero) *did_zero = true; - } while (count > 0); + } while (length > 0); return written; } @@ -1070,7 +1039,7 @@ EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_page_writeback(struct inode *inode, struct page *page, - int error) + int error, unsigned int len) { struct iomap_page *iop = to_iomap_page(page); @@ -1079,10 +1048,10 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page, mapping_set_error(inode->i_mapping, -EIO); } - WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); - WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0); + WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0); - if (!iop || atomic_dec_and_test(&iop->write_count)) + if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending)) end_page_writeback(page); } @@ -1116,7 +1085,8 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) /* walk each page on bio, ending page IO on them */ bio_for_each_segment_all(bv, bio, iter_all) - iomap_finish_page_writeback(inode, bv->bv_page, error); + iomap_finish_page_writeback(inode, bv->bv_page, error, + bv->bv_len); bio_put(bio); } /* The ioend has been freed by bio_put() */ @@ -1332,8 +1302,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, &same_page); - if (iop && !same_page) - atomic_inc(&iop->write_count); + if (iop) + atomic_add(len, &iop->write_bytes_pending); if (!merged) { if (bio_full(wpc->ioend->io_bio, len)) { @@ -1375,8 +1345,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, int error = 0, count = 0, i; LIST_HEAD(submit_list); - WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); - WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0); + WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0); /* * Walk through the page to find areas to write back. If we run off the diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index c9ec3e9d59a3..70488904a778 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -77,7 +77,7 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, dio->submit.cookie = submit_bio(bio); } -static ssize_t iomap_dio_complete(struct iomap_dio *dio) +ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; @@ -109,7 +109,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { int err; err = invalidate_inode_pages2_range(inode->i_mapping, @@ -119,6 +119,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) dio_warn_stale_pagecache(iocb->ki_filp); } + inode_dio_end(file_inode(iocb->ki_filp)); /* * If this is a DSYNC write, make sure we push it to stable storage now * that we've written data. @@ -126,11 +127,11 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) ret = generic_write_sync(iocb, ret); - inode_dio_end(file_inode(iocb->ki_filp)); kfree(dio); return ret; } +EXPORT_SYMBOL_GPL(iomap_dio_complete); static void iomap_dio_complete_work(struct work_struct *work) { @@ -396,6 +397,16 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return iomap_dio_bio_actor(inode, pos, length, dio, iomap); case IOMAP_INLINE: return iomap_dio_inline_actor(inode, pos, length, dio, iomap); + case IOMAP_DELALLOC: + /* + * DIO is not serialised against mmap() access at all, and so + * if the page_mkwrite occurs between the writeback and the + * iomap_apply() call in the DIO path, then it will see the + * DELALLOC block that the page-mkwrite allocated. + */ + pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n", + dio->iocb->ki_filp, current->comm); + return -EIO; default: WARN_ON_ONCE(1); return -EIO; @@ -414,8 +425,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, * Returns -ENOTBLK In case of a page invalidation invalidation failure for * writes. The callers needs to fall back to buffered I/O in this case. */ -ssize_t -iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, +struct iomap_dio * +__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, bool wait_for_completion) { @@ -429,14 +440,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_dio *dio; if (!count) - return 0; + return NULL; if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion)) - return -EIO; + return ERR_PTR(-EIO); dio = kmalloc(sizeof(*dio), GFP_KERNEL); if (!dio) - return -ENOMEM; + return ERR_PTR(-ENOMEM); dio->iocb = iocb; atomic_set(&dio->ref, 1); @@ -566,7 +577,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { if (!wait_for_completion) - return -EIOCBQUEUED; + return ERR_PTR(-EIOCBQUEUED); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -582,10 +593,26 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, __set_current_state(TASK_RUNNING); } - return iomap_dio_complete(dio); + return dio; out_free_dio: kfree(dio); - return ret; + if (ret) + return ERR_PTR(ret); + return NULL; +} +EXPORT_SYMBOL_GPL(__iomap_dio_rw); + +ssize_t +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, const struct iomap_dio_ops *dops, + bool wait_for_completion) +{ + struct iomap_dio *dio; + + dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion); + if (IS_ERR_OR_NULL(dio)) + return PTR_ERR_OR_ZERO(dio); + return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index a2f5338a5ea1..176580f54af9 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -473,7 +473,7 @@ static int metapage_readpage(struct file *fp, struct page *page) struct inode *inode = page->mapping->host; struct bio *bio = NULL; int block_offset; - int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; + int blocks_per_page = i_blocks_per_page(inode, page); sector_t page_start; /* address of page in fs blocks */ sector_t pblock; int xlen; diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index e841ed781a25..e986b95d94c9 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -93,25 +93,3 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) return ptr; return __kmem_vmalloc(size, flags); } - -void * -kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) -{ - int retries = 0; - gfp_t lflags = kmem_flags_convert(flags); - void *ptr; - - trace_kmem_realloc(newsize, flags, _RET_IP_); - - do { - ptr = krealloc(old, newsize, lflags); - if (ptr || (flags & KM_MAYFAIL)) - return ptr; - if (!(++retries % 100)) - xfs_err(NULL, - "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)", - current->comm, current->pid, - newsize, __func__, lflags); - congestion_wait(BLK_RW_ASYNC, HZ/50); - } while (1); -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 8e8555817e6d..38007117697e 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -59,7 +59,6 @@ kmem_flags_convert(xfs_km_flags_t flags) extern void *kmem_alloc(size_t, xfs_km_flags_t); extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); -extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); @@ -72,12 +71,6 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) return kmem_alloc(size, flags | KM_ZERO); } -static inline void * -kmem_zalloc_large(size_t size, xfs_km_flags_t flags) -{ - return kmem_alloc_large(size, flags | KM_ZERO); -} - /* * Zone interfaces */ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 8cf73fe4338e..9331f3516afa 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -333,6 +333,11 @@ xfs_agiblock_init( } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + agi->agi_iblocks = cpu_to_be32(1); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + agi->agi_fblocks = cpu_to_be32(1); + } } typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 2e055c079f39..fd8e6418a0d3 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -428,7 +428,7 @@ xfs_attr_set( */ if (XFS_IFORK_Q(dp) == 0) { int sf_size = sizeof(struct xfs_attr_sf_hdr) + - XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, + xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); @@ -523,6 +523,14 @@ out_trans_cancel: * External routines when attribute list is inside the inode *========================================================================*/ +static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) +{ + struct xfs_attr_shortform *sf; + + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + return be16_to_cpu(sf->hdr.totsize); +} + /* * Add a name to the shortform attribute list structure * This is the external routine. @@ -555,8 +563,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) return -ENOSPC; - newsize = XFS_ATTR_SF_TOTSIZE(args->dp); - newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + newsize = xfs_attr_sf_totsize(args->dp); + newsize += xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); if (!forkoff) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 305d4bc07337..bb128db220ac 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -684,9 +684,9 @@ xfs_attr_sf_findname( sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; end = sf->hdr.count; - for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), + for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), base += size, i++) { - size = XFS_ATTR_SF_ENTSIZE(sfe); + size = xfs_attr_sf_entsize(sfe); if (!xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) continue; @@ -728,15 +728,15 @@ xfs_attr_shortform_add( ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) ASSERT(0); offset = (char *)sfe - (char *)sf; - size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; - sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset); + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; + sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; @@ -787,12 +787,12 @@ xfs_attr_shortform_remove( dp = args->dp; mp = dp->i_mount; - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; error = xfs_attr_sf_findname(args, &sfe, &base); if (error != -EEXIST) return error; - size = XFS_ATTR_SF_ENTSIZE(sfe); + size = xfs_attr_sf_entsize(sfe); /* * Fix up the attribute fork data, covering the hole @@ -837,8 +837,8 @@ xfs_attr_shortform_remove( int xfs_attr_shortform_lookup(xfs_da_args_t *args) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; int i; struct xfs_ifork *ifp; @@ -846,10 +846,10 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) ifp = args->dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + sfe = xfs_attr_sf_nextentry(sfe), i++) { if (xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) return -EEXIST; @@ -873,10 +873,10 @@ xfs_attr_shortform_getvalue( int i; ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + sfe = xfs_attr_sf_nextentry(sfe), i++) { if (xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) return xfs_attr_copy_value(args, @@ -908,12 +908,12 @@ xfs_attr_shortform_to_leaf( dp = args->dp; ifp = dp->i_afp; - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, ifp->if_u1.if_data, size); - sf = (xfs_attr_shortform_t *)tmpbuffer; + sf = (struct xfs_attr_shortform *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); @@ -951,7 +951,7 @@ xfs_attr_shortform_to_leaf( ASSERT(error != -ENOSPC); if (error) goto out; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); } error = 0; *leaf_bp = bp; @@ -992,9 +992,8 @@ xfs_attr_shortform_allfit( return 0; if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return 0; - bytes += sizeof(struct xfs_attr_sf_entry) - 1 - + name_loc->namelen - + be16_to_cpu(name_loc->valuelen); + bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, + be16_to_cpu(name_loc->valuelen)); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && @@ -1039,7 +1038,7 @@ xfs_attr_shortform_verify( * xfs_attr_sf_entry is defined with a 1-byte variable * array at the end, so we must subtract that off. */ - if (((char *)sfep + sizeof(*sfep) - 1) >= endp) + if (((char *)sfep + sizeof(*sfep)) >= endp) return __this_address; /* Don't allow names with known bad length. */ @@ -1051,7 +1050,7 @@ xfs_attr_shortform_verify( * within the data buffer. The next entry starts after the * name component, so nextentry is an acceptable test. */ - next_sfep = XFS_ATTR_SF_NEXTENTRY(sfep); + next_sfep = xfs_attr_sf_nextentry(sfep); if ((char *)next_sfep > endp) return __this_address; diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index bb004fb7944a..37578b369d9b 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -13,7 +13,6 @@ * to fit into the literal area of the inode. */ typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; -typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; /* * We generate this then sort it, attr_list() must return things in hash-order. @@ -27,16 +26,26 @@ typedef struct xfs_attr_sf_sort { unsigned char *name; /* name value, pointer into buffer */ } xfs_attr_sf_sort_t; -#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ - (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))) #define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ ((1 << (NBBY*(int)sizeof(uint8_t))) - 1) -#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \ - ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen) -#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ - ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep))) -#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \ - (be16_to_cpu(((xfs_attr_shortform_t *) \ - ((dp)->i_afp->if_u1.if_data))->hdr.totsize)) + +/* space name/value uses */ +static inline int xfs_attr_sf_entsize_byname(uint8_t nlen, uint8_t vlen) +{ + return sizeof(struct xfs_attr_sf_entry) + nlen + vlen; +} + +/* space an entry uses */ +static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep) +{ + return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen); +} + +/* next entry in struct */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep) +{ + return (void *)sfep + xfs_attr_sf_entsize(sfep); +} #endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 059ac108b1b3..b40a4e80f5ee 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -579,7 +579,7 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) /* * Entries are packed toward the top as tight as possible. */ -typedef struct xfs_attr_shortform { +struct xfs_attr_shortform { struct xfs_attr_sf_hdr { /* constant-structure header block */ __be16 totsize; /* total bytes in shortform list */ __u8 count; /* count of active entries */ @@ -589,9 +589,9 @@ typedef struct xfs_attr_shortform { uint8_t namelen; /* actual length of name (no NULL) */ uint8_t valuelen; /* actual length of value (no NULL) */ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - uint8_t nameval[1]; /* name & value bytes concatenated */ + uint8_t nameval[]; /* name & value bytes concatenated */ } list[1]; /* variable sized array */ -} xfs_attr_shortform_t; +}; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ __be16 base; /* base of free region */ diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 5a2db00b9d5f..6766417d5ba4 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -69,6 +69,13 @@ xfs_dquot_verify( ddq_type != XFS_DQTYPE_GROUP) return __this_address; + if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + return __this_address; + + if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id) + return __this_address; + if (id != -1 && id != be32_to_cpu(ddq->d_id)) return __this_address; @@ -288,3 +295,31 @@ const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; + +/* Convert an on-disk timer value into an incore timer value. */ +time64_t +xfs_dquot_from_disk_ts( + struct xfs_disk_dquot *ddq, + __be32 dtimer) +{ + uint32_t t = be32_to_cpu(dtimer); + + if (t != 0 && (ddq->d_type & XFS_DQTYPE_BIGTIME)) + return xfs_dq_bigtime_to_unix(t); + + return t; +} + +/* Convert an incore timer value into an on-disk timer value. */ +__be32 +xfs_dquot_to_disk_ts( + struct xfs_dquot *dqp, + time64_t timer) +{ + uint32_t t = timer; + + if (timer != 0 && (dqp->q_type & XFS_DQTYPE_BIGTIME)) + t = xfs_dq_unix_to_bigtime(timer); + + return cpu_to_be32(t); +} diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 31b7ece985bb..dd764da08f6f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -449,10 +449,12 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ +#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ - XFS_SB_FEAT_RO_COMPAT_REFLINK) + XFS_SB_FEAT_RO_COMPAT_REFLINK| \ + XFS_SB_FEAT_RO_COMPAT_INOBTCNT) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -465,10 +467,12 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ #define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ +#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ XFS_SB_FEAT_INCOMPAT_SPINODES| \ - XFS_SB_FEAT_INCOMPAT_META_UUID) + XFS_SB_FEAT_INCOMPAT_META_UUID| \ + XFS_SB_FEAT_INCOMPAT_BIGTIME) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -563,6 +567,23 @@ static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); } +static inline bool xfs_sb_version_hasbigtime(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME); +} + +/* + * Inode btree block counter. We record the number of inobt and finobt blocks + * in the AGI header so that we can skip the finobt walk at mount time when + * setting up per-AG reservations. + */ +static inline bool xfs_sb_version_hasinobtcounts(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT); +} + /* * end of superblock version macros */ @@ -765,6 +786,9 @@ typedef struct xfs_agi { __be32 agi_free_root; /* root of the free inode btree */ __be32 agi_free_level;/* levels in free inode btree */ + __be32 agi_iblocks; /* inobt blocks used */ + __be32 agi_fblocks; /* finobt blocks used */ + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; @@ -785,7 +809,8 @@ typedef struct xfs_agi { #define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) #define XFS_AGI_FREE_ROOT (1 << 11) #define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_NUM_BITS_R2 13 +#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */ +#define XFS_AGI_NUM_BITS_R2 14 /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) @@ -831,10 +856,87 @@ struct xfs_agfl { ASSERT(xfs_daddr_to_agno(mp, d) == \ xfs_daddr_to_agno(mp, (d) + (len) - 1))) -typedef struct xfs_timestamp { +/* + * XFS Timestamps + * ============== + * + * Traditional ondisk inode timestamps consist of signed 32-bit counters for + * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC + * 1970, which means that the timestamp epoch is the same as the Unix epoch. + * Therefore, the ondisk min and max defined here can be used directly to + * constrain the incore timestamps on a Unix system. Note that we actually + * encode a __be64 value on disk. + * + * When the bigtime feature is enabled, ondisk inode timestamps become an + * unsigned 64-bit nanoseconds counter. This means that the bigtime inode + * timestamp epoch is the start of the classic timestamp range, which is + * Dec 31 20:45:52 UTC 1901. Because the epochs are not the same, callers + * /must/ use the bigtime conversion functions when encoding and decoding raw + * timestamps. + */ +typedef __be64 xfs_timestamp_t; + +/* Legacy timestamp encoding format. */ +struct xfs_legacy_timestamp { __be32 t_sec; /* timestamp seconds */ __be32 t_nsec; /* timestamp nanoseconds */ -} xfs_timestamp_t; +}; + +/* + * Smallest possible ondisk seconds value with traditional timestamps. This + * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901. + */ +#define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN) + +/* + * Largest possible ondisk seconds value with traditional timestamps. This + * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038. + */ +#define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX) + +/* + * Smallest possible ondisk seconds value with bigtime timestamps. This + * corresponds (after conversion to a Unix timestamp) with the traditional + * minimum timestamp of Dec 13 20:45:52 UTC 1901. + */ +#define XFS_BIGTIME_TIME_MIN ((int64_t)0) + +/* + * Largest supported ondisk seconds value with bigtime timestamps. This + * corresponds (after conversion to a Unix timestamp) with an incore timestamp + * of Jul 2 20:20:24 UTC 2486. + * + * We round down the ondisk limit so that the bigtime quota and inode max + * timestamps will be the same. + */ +#define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL)) + +/* + * Bigtime epoch is set exactly to the minimum time value that a traditional + * 32-bit timestamp can represent when using the Unix epoch as a reference. + * Hence the Unix epoch is at a fixed offset into the supported bigtime + * timestamp range. + * + * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS + * timestamp can represent so we will not lose any fidelity in converting + * to/from unix and bigtime timestamps. + * + * The following conversion factor converts a seconds counter from the Unix + * epoch to the bigtime epoch. + */ +#define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN) + +/* Convert a timestamp from the Unix epoch to the bigtime epoch. */ +static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds) +{ + return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET; +} + +/* Convert a timestamp from the bigtime epoch to the Unix epoch. */ +static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) +{ + return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET; +} /* * On-disk inode structure. @@ -1061,12 +1163,22 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ +#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ + #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) +#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) #define XFS_DIFLAG2_ANY \ - (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE) + (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ + XFS_DIFLAG2_BIGTIME) + +static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME)); +} /* * Inode number format: @@ -1152,13 +1264,98 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DQTYPE_USER 0x01 /* user dquot record */ #define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ #define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ +#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */ /* bitmask to determine if this is a user/group/project dquot */ #define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ XFS_DQTYPE_PROJ | \ XFS_DQTYPE_GROUP) -#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK) +#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \ + XFS_DQTYPE_BIGTIME) + +/* + * XFS Quota Timers + * ================ + * + * Traditional quota grace period expiration timers are an unsigned 32-bit + * seconds counter; time zero is the Unix epoch, Jan 1 00:00:01 UTC 1970. + * Note that an expiration value of zero means that the quota limit has not + * been reached, and therefore no expiration has been set. Therefore, the + * ondisk min and max defined here can be used directly to constrain the incore + * quota expiration timestamps on a Unix system. + * + * When bigtime is enabled, we trade two bits of precision to expand the + * expiration timeout range to match that of big inode timestamps. The min and + * max recorded here are the on-disk limits, not a Unix timestamp. + * + * The grace period for each quota type is stored in the root dquot (id = 0) + * and is applied to a non-root dquot when it exceeds the soft or hard limits. + * The length of quota grace periods are unsigned 32-bit quantities measured in + * units of seconds. A value of zero means to use the default period. + */ + +/* + * Smallest possible ondisk quota expiration value with traditional timestamps. + * This corresponds exactly with the incore expiration Jan 1 00:00:01 UTC 1970. + */ +#define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1) + +/* + * Largest possible ondisk quota expiration value with traditional timestamps. + * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106. + */ +#define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX) + +/* + * Smallest possible ondisk quota expiration value with bigtime timestamps. + * This corresponds (after conversion to a Unix timestamp) with the incore + * expiration of Jan 1 00:00:04 UTC 1970. + */ +#define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN) + +/* + * Largest supported ondisk quota expiration value with bigtime timestamps. + * This corresponds (after conversion to a Unix timestamp) with an incore + * expiration of Jul 2 20:20:24 UTC 2486. + * + * The ondisk field supports values up to -1U, which corresponds to an incore + * expiration in 2514. This is beyond the maximum the bigtime inode timestamp, + * so we cap the maximum bigtime quota expiration to the max inode timestamp. + */ +#define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U) + +/* + * The following conversion factors assist in converting a quota expiration + * timestamp between the incore and ondisk formats. + */ +#define XFS_DQ_BIGTIME_SHIFT (2) +#define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1) + +/* Convert an incore quota expiration timestamp to an ondisk bigtime value. */ +static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds) +{ + /* + * Round the expiration timestamp up to the nearest bigtime timestamp + * that we can store, to give users the most time to fix problems. + */ + return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >> + XFS_DQ_BIGTIME_SHIFT; +} + +/* Convert an ondisk bigtime quota expiration value to an incore timestamp. */ +static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) +{ + return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT; +} + +/* + * Default quota grace periods, ranging from zero (use the compiled defaults) + * to ~136 years. These are applied to a non-root dquot that has exceeded + * either limit. + */ +#define XFS_DQ_GRACE_MIN ((int64_t)0) +#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) /* * This is the main portion of the on-disk representation of quota information diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 84bcffa87753..2a2e3cfd94f0 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -249,6 +249,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */ #define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */ #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ +#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ /* * Minimum and maximum sizes need for growth checks. diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index a6b37db55169..974e71bc4a3a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2473,6 +2473,7 @@ xfs_ialloc_log_agi( offsetof(xfs_agi_t, agi_unlinked), offsetof(xfs_agi_t, agi_free_root), offsetof(xfs_agi_t, agi_free_level), + offsetof(xfs_agi_t, agi_iblocks), sizeof(xfs_agi_t) }; #ifdef DEBUG @@ -2806,6 +2807,10 @@ xfs_ialloc_setup_geometry( uint64_t icount; uint inodes; + igeo->new_diflags2 = 0; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; + /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 3c8aebc36e64..cc919a2ee870 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -67,6 +67,25 @@ xfs_finobt_set_root( XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); } +/* Update the inode btree block counter for this btree. */ +static inline void +xfs_inobt_mod_blockcount( + struct xfs_btree_cur *cur, + int howmuch) +{ + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; + + if (!xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) + return; + + if (cur->bc_btnum == XFS_BTNUM_FINO) + be32_add_cpu(&agi->agi_fblocks, howmuch); + else if (cur->bc_btnum == XFS_BTNUM_INO) + be32_add_cpu(&agi->agi_iblocks, howmuch); + xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS); +} + STATIC int __xfs_inobt_alloc_block( struct xfs_btree_cur *cur, @@ -102,6 +121,7 @@ __xfs_inobt_alloc_block( new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); *stat = 1; + xfs_inobt_mod_blockcount(cur, 1); return 0; } @@ -134,6 +154,7 @@ __xfs_inobt_free_block( struct xfs_buf *bp, enum xfs_ag_resv_type resv) { + xfs_inobt_mod_blockcount(cur, -1); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, &XFS_RMAP_OINFO_INOBT, resv); @@ -480,19 +501,29 @@ xfs_inobt_commit_staged_btree( { struct xfs_agi *agi = agbp->b_addr; struct xbtree_afakeroot *afake = cur->bc_ag.afake; + int fields; ASSERT(cur->bc_flags & XFS_BTREE_STAGING); if (cur->bc_btnum == XFS_BTNUM_INO) { + fields = XFS_AGI_ROOT | XFS_AGI_LEVEL; agi->agi_root = cpu_to_be32(afake->af_root); agi->agi_level = cpu_to_be32(afake->af_levels); - xfs_ialloc_log_agi(tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); + if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + agi->agi_iblocks = cpu_to_be32(afake->af_blocks); + fields |= XFS_AGI_IBLOCKS; + } + xfs_ialloc_log_agi(tp, agbp, fields); xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); } else { + fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL; agi->agi_free_root = cpu_to_be32(afake->af_root); agi->agi_free_level = cpu_to_be32(afake->af_levels); - xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREE_ROOT | - XFS_AGI_FREE_LEVEL); + if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + agi->agi_fblocks = cpu_to_be32(afake->af_blocks); + fields |= XFS_AGI_IBLOCKS; + } + xfs_ialloc_log_agi(tp, agbp, fields); xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); } } @@ -673,6 +704,28 @@ xfs_inobt_count_blocks( return error; } +/* Read finobt block count from AGI header. */ +static int +xfs_finobt_read_blocks( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_extlen_t *tree_blocks) +{ + struct xfs_buf *agbp; + struct xfs_agi *agi; + int error; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) + return error; + + agi = agbp->b_addr; + *tree_blocks = be32_to_cpu(agi->agi_fblocks); + xfs_trans_brelse(tp, agbp); + return 0; +} + /* * Figure out how many blocks to reserve and how many are used by this btree. */ @@ -690,7 +743,11 @@ xfs_finobt_calc_reserves( if (!xfs_sb_version_hasfinobt(&mp->m_sb)) return 0; - error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, &tree_len); + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) + error = xfs_finobt_read_blocks(mp, tp, agno, &tree_len); + else + error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, + &tree_len); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 52451809c478..b4164256993d 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -603,7 +603,7 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS); + new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); ifp->if_u1.if_root = new; cur->leaf = new; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 8d5dd08eab75..c667c63f2cb0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -157,6 +157,36 @@ xfs_imap_to_bp( return 0; } +static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts) +{ + struct timespec64 tv; + uint32_t n; + + tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n)); + tv.tv_nsec = n; + + return tv; +} + +/* Convert an ondisk timestamp to an incore timestamp. */ +struct timespec64 +xfs_inode_from_disk_ts( + struct xfs_dinode *dip, + const xfs_timestamp_t ts) +{ + struct timespec64 tv; + struct xfs_legacy_timestamp *lts; + + if (xfs_dinode_has_bigtime(dip)) + return xfs_inode_decode_bigtime(be64_to_cpu(ts)); + + lts = (struct xfs_legacy_timestamp *)&ts; + tv.tv_sec = (int)be32_to_cpu(lts->t_sec); + tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec); + + return tv; +} + int xfs_inode_from_disk( struct xfs_inode *ip, @@ -211,12 +241,9 @@ xfs_inode_from_disk( * a time before epoch is converted to a time long after epoch * on 64 bit systems. */ - inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec); - inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec); - inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec); - inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); - inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); - inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); + inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime); + inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); + inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime); to->di_size = be64_to_cpu(from->di_size); to->di_nblocks = be64_to_cpu(from->di_nblocks); @@ -229,8 +256,7 @@ xfs_inode_from_disk( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); - to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec); - to->di_crtime.tv_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); to->di_flags2 = be64_to_cpu(from->di_flags2); to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } @@ -252,6 +278,25 @@ out_destroy_data_fork: return error; } +/* Convert an incore timestamp to an ondisk timestamp. */ +static inline xfs_timestamp_t +xfs_inode_to_disk_ts( + struct xfs_inode *ip, + const struct timespec64 tv) +{ + struct xfs_legacy_timestamp *lts; + xfs_timestamp_t ts; + + if (xfs_inode_has_bigtime(ip)) + return cpu_to_be64(xfs_inode_encode_bigtime(tv)); + + lts = (struct xfs_legacy_timestamp *)&ts; + lts->t_sec = cpu_to_be32(tv.tv_sec); + lts->t_nsec = cpu_to_be32(tv.tv_nsec); + + return ts; +} + void xfs_inode_to_disk( struct xfs_inode *ip, @@ -271,12 +316,9 @@ xfs_inode_to_disk( to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); memset(to->di_pad, 0, sizeof(to->di_pad)); - to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); - to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec); - to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec); - to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); - to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec); - to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); + to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); + to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); + to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); to->di_mode = cpu_to_be16(inode->i_mode); @@ -295,8 +337,7 @@ xfs_inode_to_disk( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec); + to->di_crtime = xfs_inode_to_disk_ts(ip, from->di_crtime); to->di_flags2 = cpu_to_be64(from->di_flags2); to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); @@ -310,58 +351,6 @@ xfs_inode_to_disk( } } -void -xfs_log_dinode_to_disk( - struct xfs_log_dinode *from, - struct xfs_dinode *to) -{ - to->di_magic = cpu_to_be16(from->di_magic); - to->di_mode = cpu_to_be16(from->di_mode); - to->di_version = from->di_version; - to->di_format = from->di_format; - to->di_onlink = 0; - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); - to->di_nlink = cpu_to_be32(from->di_nlink); - to->di_projid_lo = cpu_to_be16(from->di_projid_lo); - to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - - to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); - to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); - to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); - to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); - to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); - to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); - - to->di_size = cpu_to_be64(from->di_size); - to->di_nblocks = cpu_to_be64(from->di_nblocks); - to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = cpu_to_be32(from->di_dmevmask); - to->di_dmstate = cpu_to_be16(from->di_dmstate); - to->di_flags = cpu_to_be16(from->di_flags); - to->di_gen = cpu_to_be32(from->di_gen); - - if (from->di_version == 3) { - to->di_changecount = cpu_to_be64(from->di_changecount); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); - to->di_flags2 = cpu_to_be64(from->di_flags2); - to->di_cowextsize = cpu_to_be32(from->di_cowextsize); - to->di_ino = cpu_to_be64(from->di_ino); - to->di_lsn = cpu_to_be64(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); - to->di_flushiter = 0; - } else { - to->di_flushiter = cpu_to_be16(from->di_flushiter); - } -} - static xfs_failaddr_t xfs_dinode_verify_fork( struct xfs_dinode *dip, @@ -568,6 +557,11 @@ xfs_dinode_verify( if (fa) return fa; + /* bigtime iflag can only happen on bigtime filesystems */ + if (xfs_dinode_has_bigtime(dip) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + return __this_address; + return NULL; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 6b08b9d060c2..536666143fe7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -32,6 +32,11 @@ struct xfs_icdinode { struct timespec64 di_crtime; /* time created */ }; +static inline bool xfs_icdinode_has_bigtime(const struct xfs_icdinode *icd) +{ + return icd->di_flags2 & XFS_DIFLAG2_BIGTIME; +} + /* * Inode location information. Stored in the inode and passed to * xfs_imap_to_bp() to get a buffer and dinode for a given inode. @@ -49,8 +54,6 @@ void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn); int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); -void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, - struct xfs_dinode *to); xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); @@ -60,4 +63,12 @@ xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, uint32_t cowextsize, uint16_t mode, uint16_t flags, uint64_t flags2); +static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) +{ + return xfs_unix_to_bigtime(tv.tv_sec) * NSEC_PER_SEC + tv.tv_nsec; +} + +struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip, + const xfs_timestamp_t ts); + #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0cf853d42d62..7575de5cecb1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -386,8 +386,8 @@ xfs_iroot_realloc( cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - KM_NOFS); + ifp->if_broot = krealloc(ifp->if_broot, new_size, + GFP_NOFS | __GFP_NOFAIL); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, @@ -496,8 +496,8 @@ xfs_idata_realloc( * in size so that it can be logged and stay on word boundaries. * We enforce that here. */ - ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, - roundup(new_size, 4), KM_NOFS); + ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4), + GFP_NOFS | __GFP_NOFAIL); ifp->if_bytes = new_size; } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index e3400c9c71cd..8bd00da6d2a4 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -368,10 +368,13 @@ static inline int xfs_ilog_fdata(int w) * directly mirrors the xfs_dinode structure as it must contain all the same * information. */ -typedef struct xfs_ictimestamp { +typedef uint64_t xfs_ictimestamp_t; + +/* Legacy timestamp encoding format. */ +struct xfs_legacy_ictimestamp { int32_t t_sec; /* timestamp seconds */ int32_t t_nsec; /* timestamp nanoseconds */ -} xfs_ictimestamp_t; +}; /* * Define the format of the inode core that is logged. This structure must be diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 641132d0e39d..3cca2bfe714c 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -121,7 +121,6 @@ struct xlog_recover { void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len, const struct xfs_buf_ops *ops); bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); -void xlog_recover_iodone(struct xfs_buf *bp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 076bdc7037ee..0f0af4e35032 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -23,7 +23,8 @@ typedef uint8_t xfs_dqtype_t; #define XFS_DQTYPE_STRINGS \ { XFS_DQTYPE_USER, "USER" }, \ { XFS_DQTYPE_PROJ, "PROJ" }, \ - { XFS_DQTYPE_GROUP, "GROUP" } + { XFS_DQTYPE_GROUP, "GROUP" }, \ + { XFS_DQTYPE_BIGTIME, "BIGTIME" } /* * flags for q_flags field in the dquot. @@ -143,4 +144,9 @@ extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, xfs_dqtype_t type); +struct xfs_dquot; +time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq, + __be32 dtimer); +__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer); + #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index ae9aaf1f34bf..5aeafa59ed27 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -954,7 +954,7 @@ xfs_log_sb( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_buf *bp = xfs_trans_getsb(tp, mp); + struct xfs_buf *bp = xfs_trans_getsb(tp); mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); @@ -1084,7 +1084,7 @@ xfs_sync_sb_buf( if (error) return error; - bp = xfs_trans_getsb(tp, mp); + bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); xfs_trans_set_sync(tp); @@ -1166,6 +1166,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; if (xfs_sb_version_hasreflink(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; + if (xfs_sb_version_hasbigtime(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; if (xfs_sb_version_hassector(sbp)) geo->logsectsize = sbp->sb_logsectsize; else diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 708feb8eac76..c795ae47b3c9 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -176,6 +176,9 @@ struct xfs_ino_geometry { unsigned int ialloc_align; unsigned int agino_log; /* #bits for agino in inum */ + + /* precomputed value for di_flags2 */ + uint64_t new_diflags2; }; #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index b7e222befb08..90f1d5645052 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -131,6 +131,17 @@ xfs_trans_log_inode( iversion_flags = XFS_ILOG_CORE; } + /* + * If we're updating the inode core or the timestamps and it's possible + * to upgrade this inode to bigtime format, do so now. + */ + if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && + xfs_sb_version_hasbigtime(&ip->i_mount->m_sb) && + !xfs_inode_has_bigtime(ip)) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_BIGTIME; + flags |= XFS_ILOG_CORE; + } + /* * Record the specific change for fdatasync optimisation. This allows * fdatasync to skip log forces for inodes that are only timestamp @@ -177,9 +188,9 @@ xfs_trans_log_inode( /* * Always OR in the bits from the ili_last_fields field. This is to - * coordinate with the xfs_iflush() and xfs_iflush_done() routines in - * the eventual clearing of the ili_fields bits. See the big comment in - * xfs_iflush() for an explanation of this coordination mechanism. + * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines + * in the eventual clearing of the ili_fields bits. See the big comment + * in xfs_iflush() for an explanation of this coordination mechanism. */ iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); spin_unlock(&iip->ili_lock); diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index e9bcf1faa183..ae8e2e0ac64a 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -781,6 +781,35 @@ xchk_agi_xref_icounts( xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); } +/* Check agi_[fi]blocks against tree size */ +static inline void +xchk_agi_xref_fiblocks( + struct xfs_scrub *sc) +{ + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + xfs_agblock_t blocks; + int error = 0; + + if (!xfs_sb_version_hasinobtcounts(&sc->mp->m_sb)) + return; + + if (sc->sa.ino_cur) { + error = xfs_btree_count_blocks(sc->sa.ino_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + if (blocks != be32_to_cpu(agi->agi_iblocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); + } + + if (sc->sa.fino_cur) { + error = xfs_btree_count_blocks(sc->sa.fino_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur)) + return; + if (blocks != be32_to_cpu(agi->agi_fblocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); + } +} + /* Cross-reference with the other btrees. */ STATIC void xchk_agi_xref( @@ -804,6 +833,7 @@ xchk_agi_xref( xchk_agi_xref_icounts(sc); xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_agi_xref_fiblocks(sc); /* scrub teardown will take care of sc->sa for us */ } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index bca2ab1d4be9..401f71579ce6 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -810,10 +810,34 @@ xrep_agi_calc_from_btrees( error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + xfs_agblock_t blocks; + + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + agi->agi_iblocks = cpu_to_be32(blocks); + } xfs_btree_del_cursor(cur, error); agi->agi_count = cpu_to_be32(count); agi->agi_freecount = cpu_to_be32(freecount); + + if (xfs_sb_version_hasfinobt(&mp->m_sb) && + xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + xfs_agblock_t blocks; + + cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno, + XFS_BTNUM_FINO); + if (error) + goto err; + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + xfs_btree_del_cursor(cur, error); + agi->agi_fblocks = cpu_to_be32(blocks); + } + return 0; err: xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 6d483ab29e63..3aa85b64de36 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -190,11 +190,30 @@ xchk_inode_flags2( if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK)) goto bad; + /* no bigtime iflag without the bigtime feature */ + if (xfs_dinode_has_bigtime(dip) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + goto bad; + return; bad: xchk_ino_set_corrupt(sc, ino); } +static inline void +xchk_dinode_nsec( + struct xfs_scrub *sc, + xfs_ino_t ino, + struct xfs_dinode *dip, + const xfs_timestamp_t ts) +{ + struct timespec64 tv; + + tv = xfs_inode_from_disk_ts(dip, ts); + if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC) + xchk_ino_set_corrupt(sc, ino); +} + /* Scrub all the ondisk inode fields. */ STATIC void xchk_dinode( @@ -293,12 +312,9 @@ xchk_dinode( } /* di_[amc]time.nsec */ - if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); - if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); - if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); + xchk_dinode_nsec(sc, ino, dip, dip->di_atime); + xchk_dinode_nsec(sc, ino, dip, dip->di_mtime); + xchk_dinode_nsec(sc, ino, dip, dip->di_ctime); /* * di_size. xfs_dinode_verify checks for things that screw up @@ -403,8 +419,7 @@ xchk_dinode( } if (dip->di_version >= 3) { - if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); + xchk_dinode_nsec(sc, ino, dip, dip->di_crtime); xchk_inode_flags2(sc, dip, ino, mode, flags, flags2); xchk_inode_cowextsize(sc, dip, ino, mode, flags, flags2); diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 5641ae512c9e..c08be5ede066 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -22,7 +22,7 @@ xchk_setup_symlink( struct xfs_inode *ip) { /* Allocate the buffer without the inode lock held. */ - sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0); + sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index d4c687b5cd06..c544951a0c07 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -192,7 +192,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (acl) { args.valuelen = XFS_ACL_SIZE(acl->a_count); - args.value = kmem_zalloc_large(args.valuelen, 0); + args.value = kvzalloc(args.valuelen, GFP_KERNEL); if (!args.value) return -ENOMEM; xfs_acl_to_disk(args.value, acl); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index b35611882ff9..55d126d4e096 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -544,7 +544,7 @@ xfs_discard_page( page, ip->i_ino, offset); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - PAGE_SIZE / i_blocksize(inode)); + i_blocks_per_page(inode, page)); if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 50f922cad91a..8f8837fe21cf 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -61,7 +61,7 @@ xfs_attr_shortform_list( int error = 0; ASSERT(dp->i_afp != NULL); - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; @@ -96,7 +96,7 @@ xfs_attr_shortform_list( */ if (context->seen_enough) break; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); } trace_xfs_attr_list_sf_all(context); return 0; @@ -136,7 +136,7 @@ xfs_attr_shortform_list( /* These are bytes, and both on-disk, don't endian-flip */ sbp->valuelen = sfe->valuelen; sbp->flags = sfe->flags; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); sbp++; nsbuf++; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5123f82f2477..f2a8a0e75e1f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -946,6 +946,14 @@ xfs_free_file_space( startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); + /* We can only free complete realtime extents. */ + if (XFS_IS_REALTIME_INODE(ip)) { + xfs_extlen_t extsz = xfs_get_extsz_hint(ip); + + if ((startoffset_fsb | endoffset_fsb) & (extsz - 1)) + return -EINVAL; + } + /* * Need to zero the stuff we're not freeing, on disk. */ @@ -1139,6 +1147,14 @@ xfs_insert_file_space( trace_xfs_insert_file_space(ip); + /* We can only insert complete realtime extents. */ + if (XFS_IS_REALTIME_INODE(ip)) { + xfs_extlen_t extsz = xfs_get_extsz_hint(ip); + + if ((stop_fsb | shift_fsb) & (extsz - 1)) + return -EINVAL; + } + error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); if (error) return error; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d4cdcb6fb2fe..4e4cf91f4f9f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -52,6 +52,15 @@ static kmem_zone_t *xfs_buf_zone; * b_lock (trylock due to inversion) */ +static int __xfs_buf_submit(struct xfs_buf *bp, bool wait); + +static inline int +xfs_buf_submit( + struct xfs_buf *bp) +{ + return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); +} + static inline int xfs_buf_is_vmapped( struct xfs_buf *bp) @@ -751,7 +760,7 @@ found: return 0; } -STATIC int +int _xfs_buf_read( xfs_buf_t *bp, xfs_buf_flags_t flags) @@ -759,7 +768,7 @@ _xfs_buf_read( ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); - bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); return xfs_buf_submit(bp); @@ -1170,20 +1179,145 @@ xfs_buf_wait_unpin( set_current_state(TASK_RUNNING); } -/* - * Buffer Utility Routines - */ +static void +xfs_buf_ioerror_alert_ratelimited( + struct xfs_buf *bp) +{ + static unsigned long lasttime; + static struct xfs_buftarg *lasttarg; -void + if (bp->b_target != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + xfs_buf_ioerror_alert(bp, __this_address); + } + lasttarg = bp->b_target; +} + +/* + * Account for this latest trip around the retry handler, and decide if + * we've failed enough times to constitute a permanent failure. + */ +static bool +xfs_buf_ioerror_permanent( + struct xfs_buf *bp, + struct xfs_error_cfg *cfg) +{ + struct xfs_mount *mp = bp->b_mount; + + if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && + ++bp->b_retries > cfg->max_retries) + return true; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) + return true; + + /* At unmount we may treat errors differently */ + if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + return true; + + return false; +} + +/* + * On a sync write or shutdown we just want to stale the buffer and let the + * caller handle the error in bp->b_error appropriately. + * + * If the write was asynchronous then no one will be looking for the error. If + * this is the first failure of this type, clear the error state and write the + * buffer out again. This means we always retry an async write failure at least + * once, but we also need to set the buffer up to behave correctly now for + * repeated failures. + * + * If we get repeated async write failures, then we take action according to the + * error configuration we have been set up to use. + * + * Returns true if this function took care of error handling and the caller must + * not touch the buffer again. Return false if the caller should proceed with + * normal I/O completion handling. + */ +static bool +xfs_buf_ioend_handle_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_error_cfg *cfg; + + /* + * If we've already decided to shutdown the filesystem because of I/O + * errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) + goto out_stale; + + xfs_buf_ioerror_alert_ratelimited(bp); + + /* + * We're not going to bother about retrying this during recovery. + * One strike! + */ + if (bp->b_flags & _XBF_LOGRECOVERY) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + return false; + } + + /* + * Synchronous writes will have callers process the error. + */ + if (!(bp->b_flags & XBF_ASYNC)) + goto out_stale; + + trace_xfs_buf_iodone_async(bp, _RET_IP_); + + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + if (bp->b_last_error != bp->b_error || + !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) { + bp->b_last_error = bp->b_error; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; + goto resubmit; + } + + /* + * Permanent error - we need to trigger a shutdown if we haven't already + * to indicate that inconsistency will result from this action. + */ + if (xfs_buf_ioerror_permanent(bp, cfg)) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out_stale; + } + + /* Still considered a transient error. Caller will schedule retries. */ + if (bp->b_flags & _XBF_INODES) + xfs_buf_inode_io_fail(bp); + else if (bp->b_flags & _XBF_DQUOTS) + xfs_buf_dquot_io_fail(bp); + else + ASSERT(list_empty(&bp->b_li_list)); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return true; + +resubmit: + xfs_buf_ioerror(bp, 0); + bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); + xfs_buf_submit(bp); + return true; +out_stale: + xfs_buf_stale(bp); + bp->b_flags |= XBF_DONE; + bp->b_flags &= ~XBF_WRITE; + trace_xfs_buf_error_relse(bp, _RET_IP_); + return false; +} + +static void xfs_buf_ioend( struct xfs_buf *bp) { - bool read = bp->b_flags & XBF_READ; - trace_xfs_buf_iodone(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - /* * Pull in IO completion errors now. We are guaranteed to be running * single threaded, so we don't need the lock to read b_io_error. @@ -1191,39 +1325,47 @@ xfs_buf_ioend( if (!bp->b_error && bp->b_io_error) xfs_buf_ioerror(bp, bp->b_io_error); - if (read) { + if (bp->b_flags & XBF_READ) { if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); if (!bp->b_error) bp->b_flags |= XBF_DONE; - xfs_buf_ioend_finish(bp); - return; + } else { + if (!bp->b_error) { + bp->b_flags &= ~XBF_WRITE_FAIL; + bp->b_flags |= XBF_DONE; + } + + if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) + return; + + /* clear the retry state */ + bp->b_last_error = 0; + bp->b_retries = 0; + bp->b_first_retry_time = 0; + + /* + * Note that for things like remote attribute buffers, there may + * not be a buffer log item here, so processing the buffer log + * item must remain optional. + */ + if (bp->b_log_item) + xfs_buf_item_done(bp); + + if (bp->b_flags & _XBF_INODES) + xfs_buf_inode_iodone(bp); + else if (bp->b_flags & _XBF_DQUOTS) + xfs_buf_dquot_iodone(bp); + } - if (!bp->b_error) { - bp->b_flags &= ~XBF_WRITE_FAIL; - bp->b_flags |= XBF_DONE; - } + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | + _XBF_LOGRECOVERY); - /* - * If this is a log recovery buffer, we aren't doing transactional IO - * yet so we need to let it handle IO completions. - */ - if (bp->b_flags & _XBF_LOGRECOVERY) { - xlog_recover_iodone(bp); - return; - } - - if (bp->b_flags & _XBF_INODES) { - xfs_buf_inode_iodone(bp); - return; - } - - if (bp->b_flags & _XBF_DQUOTS) { - xfs_buf_dquot_iodone(bp); - return; - } - xfs_buf_iodone(bp); + if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); + else + complete(&bp->b_iowait); } static void @@ -1506,7 +1648,7 @@ xfs_buf_iowait( * safe to reference the buffer after a call to this function unless the caller * holds an additional reference itself. */ -int +static int __xfs_buf_submit( struct xfs_buf *bp, bool wait) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 755b652e695a..bfd2907e7bc4 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -249,6 +249,7 @@ int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags, int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); +int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ @@ -269,28 +270,12 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); -extern void xfs_buf_ioend(struct xfs_buf *bp); -static inline void xfs_buf_ioend_finish(struct xfs_buf *bp) -{ - if (bp->b_flags & XBF_ASYNC) - xfs_buf_relse(bp); - else - complete(&bp->b_iowait); -} extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); void xfs_buf_ioend_fail(struct xfs_buf *); - -extern int __xfs_buf_submit(struct xfs_buf *bp, bool); -static inline int xfs_buf_submit(struct xfs_buf *bp) -{ - bool wait = bp->b_flags & XBF_ASYNC ? false : true; - return __xfs_buf_submit(bp, wait); -} - void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 408d1b572d3f..0356f2e340a1 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -30,8 +30,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } -static void xfs_buf_item_done(struct xfs_buf *bp); - /* Is this log iovec plausibly large enough to contain the buffer log format? */ bool xfs_buf_log_check_iovec( @@ -463,7 +461,7 @@ xfs_buf_item_unpin( */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_item_done(bp); - xfs_iflush_done(bp); + xfs_buf_inode_iodone(bp); ASSERT(list_empty(&bp->b_li_list)); } else { xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); @@ -956,153 +954,10 @@ xfs_buf_item_relse( xfs_buf_item_free(bip); } -/* - * Decide if we're going to retry the write after a failure, and prepare - * the buffer for retrying the write. - */ -static bool -xfs_buf_ioerror_fail_without_retry( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - static ulong lasttime; - static xfs_buftarg_t *lasttarg; - - /* - * If we've already decided to shutdown the filesystem because of - * I/O errors, there's no point in giving this a retry. - */ - if (XFS_FORCED_SHUTDOWN(mp)) - return true; - - if (bp->b_target != lasttarg || - time_after(jiffies, (lasttime + 5*HZ))) { - lasttime = jiffies; - xfs_buf_ioerror_alert(bp, __this_address); - } - lasttarg = bp->b_target; - - /* synchronous writes will have callers process the error */ - if (!(bp->b_flags & XBF_ASYNC)) - return true; - return false; -} - -static bool -xfs_buf_ioerror_retry( - struct xfs_buf *bp, - struct xfs_error_cfg *cfg) -{ - if ((bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) && - bp->b_last_error == bp->b_error) - return false; - - bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); - bp->b_last_error = bp->b_error; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - !bp->b_first_retry_time) - bp->b_first_retry_time = jiffies; - return true; -} - -/* - * Account for this latest trip around the retry handler, and decide if - * we've failed enough times to constitute a permanent failure. - */ -static bool -xfs_buf_ioerror_permanent( - struct xfs_buf *bp, - struct xfs_error_cfg *cfg) -{ - struct xfs_mount *mp = bp->b_mount; - - if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && - ++bp->b_retries > cfg->max_retries) - return true; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) - return true; - - /* At unmount we may treat errors differently */ - if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) - return true; - - return false; -} - -/* - * On a sync write or shutdown we just want to stale the buffer and let the - * caller handle the error in bp->b_error appropriately. - * - * If the write was asynchronous then no one will be looking for the error. If - * this is the first failure of this type, clear the error state and write the - * buffer out again. This means we always retry an async write failure at least - * once, but we also need to set the buffer up to behave correctly now for - * repeated failures. - * - * If we get repeated async write failures, then we take action according to the - * error configuration we have been set up to use. - * - * Multi-state return value: - * - * XBF_IOERROR_FINISH: clear IO error retry state and run callback completions - * XBF_IOERROR_DONE: resubmitted immediately, do not run any completions - * XBF_IOERROR_FAIL: transient error, run failure callback completions and then - * release the buffer - */ -enum { - XBF_IOERROR_FINISH, - XBF_IOERROR_DONE, - XBF_IOERROR_FAIL, -}; - -static int -xfs_buf_iodone_error( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - struct xfs_error_cfg *cfg; - - if (xfs_buf_ioerror_fail_without_retry(bp)) - goto out_stale; - - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - - cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); - if (xfs_buf_ioerror_retry(bp, cfg)) { - xfs_buf_ioerror(bp, 0); - xfs_buf_submit(bp); - return XBF_IOERROR_DONE; - } - - /* - * Permanent error - we need to trigger a shutdown if we haven't already - * to indicate that inconsistency will result from this action. - */ - if (xfs_buf_ioerror_permanent(bp, cfg)) { - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - goto out_stale; - } - - /* Still considered a transient error. Caller will schedule retries. */ - return XBF_IOERROR_FAIL; - -out_stale: - xfs_buf_stale(bp); - bp->b_flags |= XBF_DONE; - trace_xfs_buf_error_relse(bp, _RET_IP_); - return XBF_IOERROR_FINISH; -} - -static void +void xfs_buf_item_done( struct xfs_buf *bp) { - struct xfs_buf_log_item *bip = bp->b_log_item; - - if (!bip) - return; - /* * If we are forcibly shutting down, this may well be off the AIL * already. That's because we simulate the log-committed callbacks to @@ -1111,113 +966,12 @@ xfs_buf_item_done( * xfs_trans_ail_delete() takes care of these. * * Either way, AIL is useless if we're forcing a shutdown. + * + * Note that log recovery writes might have buffer items that are not on + * the AIL even when the file system is not shut down. */ - xfs_trans_ail_delete(&bip->bli_item, SHUTDOWN_CORRUPT_INCORE); - bp->b_log_item = NULL; - xfs_buf_item_free(bip); - xfs_buf_rele(bp); -} - -static inline void -xfs_buf_clear_ioerror_retry_state( - struct xfs_buf *bp) -{ - bp->b_last_error = 0; - bp->b_retries = 0; - bp->b_first_retry_time = 0; -} - -/* - * Inode buffer iodone callback function. - */ -void -xfs_buf_inode_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - struct xfs_log_item *lip; - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - set_bit(XFS_LI_FAILED, &lip->li_flags); - } - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - xfs_buf_item_done(bp); - xfs_iflush_done(bp); - xfs_buf_ioend_finish(bp); -} - -/* - * Dquot buffer iodone callback function. - */ -void -xfs_buf_dquot_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - struct xfs_log_item *lip; - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - spin_lock(&bp->b_mount->m_ail->ail_lock); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - xfs_set_li_failed(lip, bp); - } - spin_unlock(&bp->b_mount->m_ail->ail_lock); - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - /* a newly allocated dquot buffer might have a log item attached */ - xfs_buf_item_done(bp); - xfs_dquot_done(bp); - xfs_buf_ioend_finish(bp); -} - -/* - * Dirty buffer iodone callback function. - * - * Note that for things like remote attribute buffers, there may not be a buffer - * log item here, so processing the buffer log item must remain be optional. - */ -void -xfs_buf_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - ASSERT(list_empty(&bp->b_li_list)); - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - xfs_buf_item_done(bp); - xfs_buf_ioend_finish(bp); + xfs_trans_ail_delete(&bp->b_log_item->bli_item, + (bp->b_flags & _XBF_LOGRECOVERY) ? 0 : + SHUTDOWN_CORRUPT_INCORE); + xfs_buf_item_relse(bp); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 23507cbb4c41..50aa0f5ef959 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -50,12 +50,24 @@ struct xfs_buf_log_item { }; int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); +void xfs_buf_item_done(struct xfs_buf *bp); void xfs_buf_item_relse(struct xfs_buf *); bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); +void xfs_buf_inode_io_fail(struct xfs_buf *bp); +#ifdef CONFIG_XFS_QUOTA void xfs_buf_dquot_iodone(struct xfs_buf *); +void xfs_buf_dquot_io_fail(struct xfs_buf *bp); +#else +static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp) +{ +} +static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp) +{ +} +#endif /* CONFIG_XFS_QUOTA */ void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 8f0457d67d77..24c7a8d11e1a 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -414,7 +414,7 @@ xlog_recover_validate_buf_type( * * Write verifiers update the metadata LSN from log items attached to * the buffer. Therefore, initialize a bli purely to carry the LSN to - * the verifier. We'll clean it up in our ->iodone() callback. + * the verifier. */ if (bp->b_ops) { struct xfs_buf_log_item *bip; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bcd73b9c2994..3072814e407d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -98,12 +98,33 @@ xfs_qm_adjust_dqlimits( xfs_dquot_set_prealloc_limits(dq); } +/* Set the expiration time of a quota's grace period. */ +time64_t +xfs_dquot_set_timeout( + struct xfs_mount *mp, + time64_t timeout) +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + + return clamp_t(time64_t, timeout, qi->qi_expiry_min, + qi->qi_expiry_max); +} + +/* Set the length of the default grace period. */ +time64_t +xfs_dquot_set_grace_period( + time64_t grace) +{ + return clamp_t(time64_t, grace, XFS_DQ_GRACE_MIN, XFS_DQ_GRACE_MAX); +} + /* * Determine if this quota counter is over either limit and set the quota * timers as appropriate. */ static inline void xfs_qm_adjust_res_timer( + struct xfs_mount *mp, struct xfs_dquot_res *res, struct xfs_quota_limits *qlim) { @@ -112,7 +133,8 @@ xfs_qm_adjust_res_timer( if ((res->softlimit && res->count > res->softlimit) || (res->hardlimit && res->count > res->hardlimit)) { if (res->timer == 0) - res->timer = ktime_get_real_seconds() + qlim->time; + res->timer = xfs_dquot_set_timeout(mp, + ktime_get_real_seconds() + qlim->time); } else { if (res->timer == 0) res->warnings = 0; @@ -145,9 +167,9 @@ xfs_qm_adjust_dqtimers( ASSERT(dq->q_id); defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); - xfs_qm_adjust_res_timer(&dq->q_blk, &defq->blk); - xfs_qm_adjust_res_timer(&dq->q_ino, &defq->ino); - xfs_qm_adjust_res_timer(&dq->q_rtb, &defq->rtb); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_blk, &defq->blk); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_ino, &defq->ino); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_rtb, &defq->rtb); } /* @@ -201,6 +223,8 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_type = type; + if (curid > 0 && xfs_sb_version_hasbigtime(&mp->m_sb)) + d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME; if (xfs_sb_version_hascrc(&mp->m_sb)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), @@ -514,9 +538,9 @@ xfs_dquot_from_disk( dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); - dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer); - dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer); - dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer); + dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer); + dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer); + dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer); /* * Reservation counters are defined as reservation plus current usage @@ -559,9 +583,9 @@ xfs_dquot_to_disk( ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); - ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer); - ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer); - ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer); + ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer); + ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer); + ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer); } /* Allocate and initialize the dquot buffer for this in-core dquot. */ @@ -1107,7 +1131,7 @@ xfs_qm_dqflush_done( } void -xfs_dquot_done( +xfs_buf_dquot_iodone( struct xfs_buf *bp) { struct xfs_log_item *lip, *n; @@ -1118,6 +1142,18 @@ xfs_dquot_done( } } +void +xfs_buf_dquot_io_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip; + + spin_lock(&bp->b_mount->m_ail->ail_lock); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + xfs_set_li_failed(lip, bp); + spin_unlock(&bp->b_mount->m_ail->ail_lock); +} + /* Check incore dquot for errors before we flush. */ static xfs_failaddr_t xfs_qm_dqflush_check( @@ -1145,6 +1181,14 @@ xfs_qm_dqflush_check( !dqp->q_rtb.timer) return __this_address; + /* bigtime flag should never be set on root dquots */ + if (dqp->q_type & XFS_DQTYPE_BIGTIME) { + if (!xfs_sb_version_hasbigtime(&dqp->q_mount->m_sb)) + return __this_address; + if (dqp->q_id == 0) + return __this_address; + } + return NULL; } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 282a65da93c7..f642884a6834 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -237,4 +237,7 @@ typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, xfs_qm_dqiterate_fn iter_fn, void *priv); +time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); +time64_t xfs_dquot_set_grace_period(time64_t grace); + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a29f78a663ca..3d1b95124744 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1008,6 +1008,21 @@ xfs_file_fadvise( return ret; } +/* Does this file, inode, or mount want synchronous writes? */ +static inline bool xfs_file_sync_writes(struct file *filp) +{ + struct xfs_inode *ip = XFS_I(file_inode(filp)); + + if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC) + return true; + if (filp->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(filp))) + return true; + + return false; +} + STATIC loff_t xfs_file_remap_range( struct file *file_in, @@ -1065,7 +1080,7 @@ xfs_file_remap_range( if (ret) goto out_unlock; - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) xfs_log_force_inode(dest); out_unlock: xfs_iunlock2_io_mmap(src, dest); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 101028ebb571..deb99300d171 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -52,7 +52,6 @@ xfs_inode_alloc( XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); /* initialise the xfs inode */ @@ -123,7 +122,7 @@ void xfs_inode_free( struct xfs_inode *ip) { - ASSERT(!xfs_isiflocked(ip)); + ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING)); /* * Because we use RCU freeing we need to ensure the inode always @@ -1035,23 +1034,21 @@ xfs_reclaim_inode( if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) goto out; - if (!xfs_iflock_nowait(ip)) + if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) goto out_iunlock; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); - /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) - goto out_ifunlock; + goto out_clear_flush; if (!xfs_inode_clean(ip)) - goto out_ifunlock; + goto out_clear_flush; - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); reclaim: - ASSERT(!xfs_isiflocked(ip)); /* * Because we use RCU freeing we need to ensure the inode always appears @@ -1101,8 +1098,8 @@ reclaim: __xfs_inode_free(ip); return; -out_ifunlock: - xfs_ifunlock(ip); +out_clear_flush: + xfs_iflags_clear(ip, XFS_IFLUSHING); out_iunlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: @@ -1211,7 +1208,7 @@ xfs_reclaim_inodes( while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); xfs_reclaim_inodes_ag(mp, &nr_to_scan); - }; + } } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c06129cffba9..49624973eecc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -598,22 +598,6 @@ xfs_lock_two_inodes( } } -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wq_entry); -} - STATIC uint _xfs_dic2xflags( uint16_t di_flags, @@ -840,7 +824,7 @@ xfs_ialloc( if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); - ip->i_d.di_flags2 = 0; + ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2; ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime = tv; } @@ -2531,11 +2515,8 @@ retry: * valid, the wrong inode or stale. */ spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - return; - } + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) + goto out_iflags_unlock; /* * Don't try to lock/unlock the current inode, but we _cannot_ skip the @@ -2552,16 +2533,14 @@ retry: } } ip->i_flags |= XFS_ISTALE; - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); /* - * If we can't get the flush lock, the inode is already attached. All + * If the inode is flushing, it is already attached to the buffer. All * we needed to do here is mark the inode stale so buffer IO completion * will remove it from the AIL. */ iip = ip->i_itemp; - if (!xfs_iflock_nowait(ip)) { + if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { ASSERT(!list_empty(&iip->ili_item.li_bio_list)); ASSERT(iip->ili_last_fields); goto out_iunlock; @@ -2573,10 +2552,12 @@ retry: * commit as the flock synchronises removal of the inode from the * cluster buffer against inode reclaim. */ - if (!iip || list_empty(&iip->ili_item.li_bio_list)) { - xfs_ifunlock(ip); + if (!iip || list_empty(&iip->ili_item.li_bio_list)) goto out_iunlock; - } + + __xfs_iflags_set(ip, XFS_IFLUSHING); + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); /* we have a dirty inode in memory that has not yet been flushed. */ spin_lock(&iip->ili_lock); @@ -2586,9 +2567,16 @@ retry: spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return; + out_iunlock: if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_iflags_unlock: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); } /* @@ -2631,8 +2619,9 @@ xfs_ifree_cluster( /* * We obtain and lock the backing buffer first in the process - * here, as we have to ensure that any dirty inode that we - * can't get the flush lock on is attached to the buffer. + * here to ensure dirty inodes attached to the buffer remain in + * the flushing state while we mark them stale. + * * If we scan the in-memory inodes first, then buffer IO can * complete before we get a lock on it, and hence we may fail * to mark all the active inodes on the buffer stale. @@ -2717,7 +2706,7 @@ xfs_ifree( VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; - ip->i_d.di_flags2 = 0; + ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; @@ -3443,7 +3432,7 @@ xfs_iflush( int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); + ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip->ili_item.li_buf == bp); @@ -3553,8 +3542,8 @@ xfs_iflush( * * What we do is move the bits to the ili_last_fields field. When * logging the inode, these bits are moved back to the ili_fields field. - * In the xfs_iflush_done() routine we clear ili_last_fields, since we - * know that the information those bits represent is permanently on + * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since + * we know that the information those bits represent is permanently on * disk. As long as the flush completes before the inode is logged * again, then both ili_fields and ili_last_fields will be cleared. */ @@ -3568,7 +3557,7 @@ flush_out: /* * Store the current LSN of the inode so that we can tell whether the - * item has moved in the AIL from xfs_iflush_done(). + * item has moved in the AIL from xfs_buf_inode_iodone(). */ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -3613,7 +3602,7 @@ xfs_iflush_cluster( /* * Quick and dirty check to avoid locks if possible. */ - if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) continue; if (xfs_ipincount(ip)) continue; @@ -3627,7 +3616,7 @@ xfs_iflush_cluster( */ spin_lock(&ip->i_flags_lock); ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); - if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) { + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { spin_unlock(&ip->i_flags_lock); continue; } @@ -3635,23 +3624,16 @@ xfs_iflush_cluster( /* * ILOCK will pin the inode against reclaim and prevent * concurrent transactions modifying the inode while we are - * flushing the inode. + * flushing the inode. If we get the lock, set the flushing + * state before we drop the i_flags_lock. */ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { spin_unlock(&ip->i_flags_lock); continue; } + __xfs_iflags_set(ip, XFS_IFLUSHING); spin_unlock(&ip->i_flags_lock); - /* - * Skip inodes that are already flush locked as they have - * already been written to the buffer. - */ - if (!xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - continue; - } - /* * Abort flushing this inode if we are shut down because the * inode may not currently be in the AIL. This can occur when @@ -3661,7 +3643,6 @@ xfs_iflush_cluster( */ if (XFS_FORCED_SHUTDOWN(mp)) { xfs_iunpin_wait(ip); - /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); error = -EIO; @@ -3670,7 +3651,7 @@ xfs_iflush_cluster( /* don't block waiting on a log force to unpin dirty inodes */ if (xfs_ipincount(ip)) { - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); xfs_iunlock(ip, XFS_ILOCK_SHARED); continue; } @@ -3678,7 +3659,7 @@ xfs_iflush_cluster( if (!xfs_inode_clean(ip)) error = xfs_iflush(ip, bp); else - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) break; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e9a8bb184d1f..751a3d1d7d84 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -194,6 +194,11 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) return ip->i_cowfp && ip->i_cowfp->if_bytes; } +static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME; +} + /* * Return the buftarg used for data allocations on a given inode. */ @@ -211,8 +216,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) #define XFS_INEW (1 << __XFS_INEW_BIT) #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ -#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ -#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define XFS_IFLUSHING (1 << 7) /* inode is being flushed */ #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ @@ -233,36 +237,6 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) -/* - * Synchronize processes attempting to flush the in-core inode back to disk. - */ - -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - -extern void __xfs_iflock(struct xfs_inode *ip); - -static inline int xfs_iflock_nowait(struct xfs_inode *ip) -{ - return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); -} - -static inline void xfs_iflock(struct xfs_inode *ip) -{ - if (!xfs_iflock_nowait(ip)) - __xfs_iflock(ip); -} - -static inline void xfs_ifunlock(struct xfs_inode *ip) -{ - ASSERT(xfs_isiflocked(ip)); - xfs_iflags_clear(ip, XFS_IFLOCK); - smp_mb(); - wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); -} - /* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6c65938cee1c..17e20a6d8b4e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -295,6 +295,28 @@ xfs_inode_item_format_attr_fork( } } +/* + * Convert an incore timestamp to a log timestamp. Note that the log format + * specifies host endian format! + */ +static inline xfs_ictimestamp_t +xfs_inode_to_log_dinode_ts( + struct xfs_inode *ip, + const struct timespec64 tv) +{ + struct xfs_legacy_ictimestamp *lits; + xfs_ictimestamp_t its; + + if (xfs_inode_has_bigtime(ip)) + return xfs_inode_encode_bigtime(tv); + + lits = (struct xfs_legacy_ictimestamp *)&its; + lits->t_sec = tv.tv_sec; + lits->t_nsec = tv.tv_nsec; + + return its; +} + static void xfs_inode_to_log_dinode( struct xfs_inode *ip, @@ -313,12 +335,9 @@ xfs_inode_to_log_dinode( memset(to->di_pad, 0, sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); - to->di_atime.t_sec = inode->i_atime.tv_sec; - to->di_atime.t_nsec = inode->i_atime.tv_nsec; - to->di_mtime.t_sec = inode->i_mtime.tv_sec; - to->di_mtime.t_nsec = inode->i_mtime.tv_nsec; - to->di_ctime.t_sec = inode->i_ctime.tv_sec; - to->di_ctime.t_nsec = inode->i_ctime.tv_nsec; + to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); + to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); + to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime); to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; to->di_mode = inode->i_mode; @@ -340,8 +359,7 @@ xfs_inode_to_log_dinode( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); - to->di_crtime.t_sec = from->di_crtime.tv_sec; - to->di_crtime.t_nsec = from->di_crtime.tv_nsec; + to->di_crtime = xfs_inode_to_log_dinode_ts(ip, from->di_crtime); to->di_flags2 = from->di_flags2; to->di_cowextsize = from->di_cowextsize; to->di_ino = ip->i_ino; @@ -491,8 +509,7 @@ xfs_inode_item_push( (ip->i_flags & XFS_ISTALE)) return XFS_ITEM_PINNED; - /* If the inode is already flush locked, we're already flushing. */ - if (xfs_isiflocked(ip)) + if (xfs_iflags_test(ip, XFS_IFLUSHING)) return XFS_ITEM_FLUSHING; if (!xfs_buf_trylock(bp)) @@ -703,7 +720,7 @@ xfs_iflush_finish( iip->ili_last_fields = 0; iip->ili_flush_lsn = 0; spin_unlock(&iip->ili_lock); - xfs_ifunlock(iip->ili_inode); + xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING); if (drop_buffer) xfs_buf_rele(bp); } @@ -711,11 +728,11 @@ xfs_iflush_finish( /* * Inode buffer IO completion routine. It is responsible for removing inodes - * attached to the buffer from the AIL if they have not been re-logged, as well - * as completing the flush and unlocking the inode. + * attached to the buffer from the AIL if they have not been re-logged and + * completing the inode flush. */ void -xfs_iflush_done( +xfs_buf_inode_iodone( struct xfs_buf *bp) { struct xfs_log_item *lip, *n; @@ -754,11 +771,21 @@ xfs_iflush_done( list_splice_tail(&flushed_inodes, &bp->b_li_list); } +void +xfs_buf_inode_io_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip; + + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + set_bit(XFS_LI_FAILED, &lip->li_flags); +} + /* - * This is the inode flushing abort routine. It is called from xfs_iflush when + * This is the inode flushing abort routine. It is called when * the filesystem is shutting down to clean up the inode state. It is * responsible for removing the inode item from the AIL if it has not been - * re-logged, and unlocking the inode's flush lock. + * re-logged and clearing the inode's flush state. */ void xfs_iflush_abort( @@ -790,7 +817,7 @@ xfs_iflush_abort( list_del_init(&iip->ili_item.li_bio_list); spin_unlock(&iip->ili_lock); } - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); if (bp) xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 048b5e7dee90..4b926e32831c 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -25,8 +25,8 @@ struct xfs_inode_log_item { * * We need atomic changes between inode dirtying, inode flushing and * inode completion, but these all hold different combinations of - * ILOCK and iflock and hence we need some other method of serialising - * updates to the flush state. + * ILOCK and IFLUSHING and hence we need some other method of + * serialising updates to the flush state. */ spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ @@ -43,7 +43,6 @@ static inline int xfs_inode_clean(struct xfs_inode *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); -extern void xfs_iflush_done(struct xfs_buf *); extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 5e0d291835b3..cb44f7653f03 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -115,6 +115,82 @@ out_free_ip: return error; } +static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld) +{ + return ld->di_version >= 3 && + (ld->di_flags2 & XFS_DIFLAG2_BIGTIME); +} + +/* Convert a log timestamp to an ondisk timestamp. */ +static inline xfs_timestamp_t +xfs_log_dinode_to_disk_ts( + struct xfs_log_dinode *from, + const xfs_ictimestamp_t its) +{ + struct xfs_legacy_timestamp *lts; + struct xfs_legacy_ictimestamp *lits; + xfs_timestamp_t ts; + + if (xfs_log_dinode_has_bigtime(from)) + return cpu_to_be64(its); + + lts = (struct xfs_legacy_timestamp *)&ts; + lits = (struct xfs_legacy_ictimestamp *)&its; + lts->t_sec = cpu_to_be32(lits->t_sec); + lts->t_nsec = cpu_to_be32(lits->t_nsec); + + return ts; +} + +STATIC void +xfs_log_dinode_to_disk( + struct xfs_log_dinode *from, + struct xfs_dinode *to) +{ + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from->di_version; + to->di_format = from->di_format; + to->di_onlink = 0; + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + + to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); + to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); + to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime); + + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime = xfs_log_dinode_to_disk_ts(from, + from->di_crtime); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); + } +} + STATIC int xlog_recover_inode_commit_pass2( struct xlog *log, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6f22a66777cd..bca7659fb5c6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -404,7 +404,7 @@ xfs_ioc_attr_list( context.cursor.offset)) return -EINVAL; - buffer = kmem_zalloc_large(bufsize, 0); + buffer = kvzalloc(bufsize, GFP_KERNEL); if (!buffer) return -ENOMEM; @@ -1190,7 +1190,8 @@ xfs_flags2diflags2( unsigned int xflags) { uint64_t di_flags2 = - (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); + (ip->i_d.di_flags2 & (XFS_DIFLAG2_REFLINK | + XFS_DIFLAG2_BIGTIME)); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; @@ -1690,7 +1691,7 @@ xfs_ioc_getbmap( if (bmx.bmv_count > ULONG_MAX / recsize) return -ENOMEM; - buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0); + buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL); if (!buf) return -ENOMEM; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e2ec91b2d0f4..a17d788921d6 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -265,32 +265,6 @@ xlog_header_check_mount( return 0; } -void -xlog_recover_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - /* - * We're not going to bother about retrying - * this during recovery. One strike! - */ - if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) { - xfs_buf_ioerror_alert(bp, __this_address); - xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); - } - } - - /* - * On v5 supers, a bli could be attached to update the metadata LSN. - * Clean it up. - */ - if (bp->b_log_item) - xfs_buf_item_relse(bp); - ASSERT(bp->b_log_item == NULL); - bp->b_flags &= ~_XBF_LOGRECOVERY; - xfs_buf_ioend_finish(bp); -} - /* * This routine finds (to an approximation) the first block in the physical * log which contains the given cycle. It uses a binary search algorithm. @@ -2097,7 +2071,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len + old_len, 0); + ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL); memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -3294,14 +3268,14 @@ xlog_do_log_recovery( */ STATIC int xlog_do_recover( - struct xlog *log, - xfs_daddr_t head_blk, - xfs_daddr_t tail_blk) + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) { - struct xfs_mount *mp = log->l_mp; - int error; - xfs_buf_t *bp; - xfs_sb_t *sbp; + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp = mp->m_sb_bp; + struct xfs_sb *sbp = &mp->m_sb; + int error; trace_xfs_log_recover(log, head_blk, tail_blk); @@ -3315,9 +3289,8 @@ xlog_do_recover( /* * If IO errors happened during recovery, bail out. */ - if (XFS_FORCED_SHUTDOWN(mp)) { + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - } /* * We now update the tail_lsn since much of the recovery has completed @@ -3331,16 +3304,12 @@ xlog_do_recover( xlog_assign_tail_lsn(mp); /* - * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock and reverify it. + * Now that we've finished replaying all buffer and inode updates, + * re-read the superblock and reverify it. */ - bp = xfs_getsb(mp); - bp->b_flags &= ~(XBF_DONE | XBF_ASYNC); - ASSERT(!(bp->b_flags & XBF_WRITE)); - bp->b_flags |= XBF_READ; - bp->b_ops = &xfs_sb_buf_ops; - - error = xfs_buf_submit(bp); + xfs_buf_lock(bp); + xfs_buf_hold(bp); + error = _xfs_buf_read(bp, XBF_READ); if (error) { if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_buf_ioerror_alert(bp, __this_address); @@ -3351,7 +3320,6 @@ xlog_do_recover( } /* Convert superblock from on-disk format */ - sbp = &mp->m_sb; xfs_sb_from_disk(sbp, bp->b_addr); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c8ae49a1e99c..150ee5cb8645 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -80,9 +80,9 @@ xfs_uuid_mount( } if (hole < 0) { - xfs_uuid_table = kmem_realloc(xfs_uuid_table, + xfs_uuid_table = krealloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - 0); + GFP_KERNEL | __GFP_NOFAIL); hole = xfs_uuid_table_size++; } xfs_uuid_table[hole] = *uuid; @@ -1059,11 +1059,12 @@ xfs_unmountfs( * We can potentially deadlock here if we have an inode cluster * that has been freed has its buffer still pinned in memory because * the transaction is still sitting in a iclog. The stale inodes - * on that buffer will have their flush locks held until the - * transaction hits the disk and the callbacks run. the inode - * flush takes the flush lock unconditionally and with nothing to - * push out the iclog we will never get that unlocked. hence we - * need to force the log first. + * on that buffer will be pinned to the buffer until the + * transaction hits the disk and the callbacks run. Pushing the AIL will + * skip the stale inodes and may never see the pinned buffer, so + * nothing will push out the iclog and unpin the buffer. Hence we + * need to force the log here to ensure all items are flushed into the + * AIL before we go any further. */ xfs_log_force(mp, XFS_LOG_SYNC); @@ -1288,23 +1289,6 @@ xfs_mod_frextents( return ret; } -/* - * xfs_getsb() is called to obtain the buffer for the superblock. - * The buffer is returned locked and read in from disk. - * The buffer should be released with a call to xfs_brelse(). - */ -struct xfs_buf * -xfs_getsb( - struct xfs_mount *mp) -{ - struct xfs_buf *bp = mp->m_sb_bp; - - xfs_buf_lock(bp); - xfs_buf_hold(bp); - ASSERT(bp->b_flags & XBF_DONE); - return bp; -} - /* * Used to free the superblock along various error paths. */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a72cfcaa4ad1..dfa429b77ee2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -410,7 +410,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); -extern struct xfs_buf *xfs_getsb(xfs_mount_t *); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern bool xfs_fs_writable(struct xfs_mount *mp, int level); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 5f04d8a5ab2a..0aa87c210104 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -15,6 +15,10 @@ "XFS: offsetof(" #structname ", " #member ") is wrong, " \ "expected " #off) +#define XFS_CHECK_VALUE(value, expected) \ + BUILD_BUG_ON_MSG((value) != (expected), \ + "XFS: value of " #value " is wrong, expected " #expected) + static inline void __init xfs_check_ondisk_structs(void) { @@ -23,7 +27,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); - XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336); + XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); @@ -41,7 +45,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); - XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); + XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); @@ -84,12 +89,12 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); @@ -121,7 +126,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8); + XFS_CHECK_STRUCT_SIZE(xfs_ictimestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); @@ -152,6 +158,20 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64); + + /* + * Make sure the incore inode timestamp range corresponds to hand + * converted values based on the ondisk format specification. + */ + XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MIN - XFS_BIGTIME_EPOCH_OFFSET, + XFS_LEGACY_TIME_MIN); + XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MAX - XFS_BIGTIME_EPOCH_OFFSET, + 16299260424LL); + + /* Do the same with the incore quota expiration range. */ + XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4); + XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT, + 16299260424LL); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index be67570badf8..3f82e0c92c2d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -661,6 +661,17 @@ xfs_qm_init_quotainfo( /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); + if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + qinf->qi_expiry_min = + xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN); + qinf->qi_expiry_max = + xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MAX); + } else { + qinf->qi_expiry_min = XFS_DQ_LEGACY_EXPIRY_MIN; + qinf->qi_expiry_max = XFS_DQ_LEGACY_EXPIRY_MAX; + } + trace_xfs_quota_expiry_range(mp, qinf->qi_expiry_min, + qinf->qi_expiry_max); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); @@ -879,6 +890,8 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + ddq->d_type |= XFS_DQTYPE_BIGTIME; } if (xfs_sb_version_hascrc(&mp->m_sb)) { diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 9c078c35d924..e3dabab44097 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -65,6 +65,10 @@ struct xfs_quotainfo { struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; struct shrinker qi_shrinker; + + /* Minimum and maximum quota expiration timestamp values. */ + time64_t qi_expiry_min; + time64_t qi_expiry_max; }; static inline struct radix_tree_root * diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 1c542b4a5220..ca1b57d291dc 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -479,13 +479,19 @@ xfs_setqlim_warns( static inline void xfs_setqlim_timer( + struct xfs_mount *mp, struct xfs_dquot_res *res, struct xfs_quota_limits *qlim, s64 timer) { - res->timer = timer; - if (qlim) - qlim->time = timer; + if (qlim) { + /* Set the length of the default grace period. */ + res->timer = xfs_dquot_set_grace_period(timer); + qlim->time = res->timer; + } else { + /* Set the grace period expiration on a quota. */ + res->timer = xfs_dquot_set_timeout(mp, timer); + } } /* @@ -574,7 +580,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_SPC_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_spc_warns); if (newlim->d_fieldmask & QC_SPC_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_spc_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer); /* Blocks on the realtime device. */ hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? @@ -590,7 +596,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_RT_SPC_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns); if (newlim->d_fieldmask & QC_RT_SPC_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_rt_spc_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer); /* Inodes */ hard = (newlim->d_fieldmask & QC_INO_HARD) ? @@ -606,7 +612,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_INO_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_ino_warns); if (newlim->d_fieldmask & QC_INO_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_ino_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer); if (id != 0) { /* diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 06b22e35fc90..5a62398940d0 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -108,8 +108,6 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); -void xfs_dquot_done(struct xfs_buf *); - #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -151,12 +149,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) - -static inline void xfs_dquot_done(struct xfs_buf *bp) -{ - return; -} - #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 6209e7b6b895..5b89c12f1566 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -247,6 +247,9 @@ xfs_rtallocate_extent_block( end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1; i <= end; i++) { + /* Make sure we don't scan off the end of the rt volume. */ + maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i; + /* * See if there's a free extent of maxlen starting at i. * If it's not so then next will contain the first non-free. @@ -442,6 +445,14 @@ xfs_rtallocate_extent_near( */ if (bno >= mp->m_sb.sb_rextents) bno = mp->m_sb.sb_rextents - 1; + + /* Make sure we don't run off the end of the rt volume. */ + maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno; + if (maxlen < minlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + /* * Try the exact allocation first. */ @@ -862,7 +873,7 @@ xfs_alloc_rsum_cache( * lower bound on the minimum level with any free extents. We can * continue without the cache if it couldn't be allocated. */ - mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0); + mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL); if (!mp->m_rsum_cache) xfs_warn(mp, "could not allocate realtime summary cache"); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 71ac6c1cdc36..baf5de30eebb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -654,11 +654,11 @@ xfs_fs_destroy_inode( ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); /* - * We always use background reclaim here because even if the - * inode is clean, it still may be under IO and hence we have - * to take the flush lock. The background reclaim path handles - * this more efficiently than we can here, so simply let background - * reclaim tear down all inodes. + * We always use background reclaim here because even if the inode is + * clean, it still may be under IO and hence we have wait for IO + * completion to occur before we can reclaim the inode. The background + * reclaim path handles this more efficiently than we can here, so + * simply let background reclaim tear down all inodes. */ xfs_inode_set_reclaim_tag(ip); } @@ -1484,8 +1484,14 @@ xfs_fc_fill_super( sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; - sb->s_time_min = S32_MIN; - sb->s_time_max = S32_MAX; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN); + sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX); + } else { + sb->s_time_min = XFS_LEGACY_TIME_MIN; + sb->s_time_max = XFS_LEGACY_TIME_MAX; + } + trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max); sb->s_iflags |= SB_I_CGROUPWB; set_posix_acl_flag(sb); @@ -1494,6 +1500,10 @@ xfs_fc_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= SB_I_VERSION; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + xfs_warn(mp, + "EXPERIMENTAL big timestamp feature in use. Use at your own risk!"); + if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) { bool rtdev_is_dax = false, datadev_is_dax; @@ -1549,6 +1559,10 @@ xfs_fc_fill_super( goto out_filestream_unmount; } + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) + xfs_warn(mp, + "EXPERIMENTAL inode btree counters feature in use. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index abb1d859f226..dcdcf99cfa5d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -338,7 +338,7 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); -DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); +DEFINE_BUF_EVENT(xfs_buf_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); @@ -3676,7 +3676,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \ DEFINE_KMEM_EVENT(kmem_alloc); DEFINE_KMEM_EVENT(kmem_alloc_io); DEFINE_KMEM_EVENT(kmem_alloc_large); -DEFINE_KMEM_EVENT(kmem_realloc); TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), @@ -3844,6 +3843,32 @@ TRACE_EVENT(xfs_btree_bload_block, __entry->nr_records) ) +DECLARE_EVENT_CLASS(xfs_timestamp_range_class, + TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max), + TP_ARGS(mp, min, max), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(long long, min) + __field(long long, max) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->min = min; + __entry->max = max; + ), + TP_printk("dev %d:%d min %lld max %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->min, + __entry->max) +) + +#define DEFINE_TIMESTAMP_RANGE_EVENT(name) \ +DEFINE_EVENT(xfs_timestamp_range_class, name, \ + TP_PROTO(struct xfs_mount *mp, long long min, long long max), \ + TP_ARGS(mp, min, max)) +DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range); +DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ed72867b1a19..ca18a040336a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -468,7 +468,7 @@ xfs_trans_apply_sb_deltas( xfs_buf_t *bp; int whole = 0; - bp = xfs_trans_getsb(tp, tp->t_mountp); + bp = xfs_trans_getsb(tp); sbp = bp->b_addr; /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b752501818d2..f46534b75236 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -209,7 +209,7 @@ xfs_trans_read_buf( flags, bpp, ops); } -struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *); +struct xfs_buf *xfs_trans_getsb(struct xfs_trans *); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 11cd666cd99a..42d63b830cb9 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -166,50 +166,34 @@ xfs_trans_get_buf_map( } /* - * Get and lock the superblock buffer of this file system for the - * given transaction. - * - * We don't need to use incore_match() here, because the superblock - * buffer is a private buffer which we keep a pointer to in the - * mount structure. + * Get and lock the superblock buffer for the given transaction. */ -xfs_buf_t * +struct xfs_buf * xfs_trans_getsb( - xfs_trans_t *tp, - struct xfs_mount *mp) + struct xfs_trans *tp) { - xfs_buf_t *bp; - struct xfs_buf_log_item *bip; + struct xfs_buf *bp = tp->t_mountp->m_sb_bp; /* - * Default to just trying to lock the superblock buffer - * if tp is NULL. + * Just increment the lock recursion count if the buffer is already + * attached to this transaction. */ - if (tp == NULL) - return xfs_getsb(mp); - - /* - * If the superblock buffer already has this transaction - * pointer in its b_fsprivate2 field, then we know we already - * have it locked. In this case we just increment the lock - * recursion count and return the buffer to the caller. - */ - bp = mp->m_sb_bp; if (bp->b_transp == tp) { - bip = bp->b_log_item; + struct xfs_buf_log_item *bip = bp->b_log_item; + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; + trace_xfs_trans_getsb_recur(bip); - return bp; + } else { + xfs_buf_lock(bp); + xfs_buf_hold(bp); + _xfs_trans_bjoin(tp, bp, 1); + + trace_xfs_trans_getsb(bp->b_log_item); } - bp = xfs_getsb(mp); - if (bp == NULL) - return NULL; - - _xfs_trans_bjoin(tp, bp, 1); - trace_xfs_trans_getsb(bp->b_log_item); return bp; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c6ba7ef18e06..133fc6fc3edd 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -55,6 +55,12 @@ xfs_trans_log_dquot( { ASSERT(XFS_DQ_IS_LOCKED(dqp)); + /* Upgrade the dquot to bigtime format if possible. */ + if (dqp->q_id != 0 && + xfs_sb_version_hasbigtime(&tp->t_mountp->m_sb) && + !(dqp->q_type & XFS_DQTYPE_BIGTIME)) + dqp->q_type |= XFS_DQTYPE_BIGTIME; + tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags); } diff --git a/include/linux/dax.h b/include/linux/dax.h index 4ec0bbf86205..e15357223565 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -231,8 +231,7 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); -int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, - struct iomap *iomap); +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4d1d3c3469e9..172b3397a1a3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -13,6 +13,7 @@ struct address_space; struct fiemap_extent_info; struct inode; +struct iomap_dio; struct iomap_writepage_ctx; struct iov_iter; struct kiocb; @@ -258,6 +259,10 @@ struct iomap_dio_ops { ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, bool wait_for_completion); +struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, const struct iomap_dio_ops *dops, + bool wait_for_completion); +ssize_t iomap_dio_complete(struct iomap_dio *dio); int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); #ifdef CONFIG_SWAP diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index ef61676cfe05..52850a02a3d0 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -333,4 +333,6 @@ struct ipmi_smi_info { /* This is to get the private info of struct ipmi_smi */ extern int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data); +#define GET_DEVICE_ID_MAX_RETRY 5 + #endif /* __LINUX_IPMI_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 1a3554f5d992..c3afd3242b54 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -927,4 +927,20 @@ static inline int page_mkwrite_check_truncate(struct page *page, return offset; } +/** + * i_blocks_per_page - How many blocks fit in this page. + * @inode: The inode which contains the blocks. + * @page: The page (head page if the page is a THP). + * + * If the block size is larger than the size of this page, return zero. + * + * Context: The caller should hold a refcount on the page to prevent it + * from being split. + * Return: The number of filesystem blocks covered by this page. + */ +static inline +unsigned int i_blocks_per_page(struct inode *inode, struct page *page) +{ + return thp_size(page) >> inode->i_blkbits; +} #endif /* _LINUX_PAGEMAP_H */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index a98965007eef..85fb2f34c59b 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -83,7 +83,7 @@ extern void do_group_exit(int); extern void exit_files(struct task_struct *); extern void exit_itimers(struct signal_struct *); -extern long _do_fork(struct kernel_clone_args *kargs); +extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/include/uapi/linux/ipmi_msgdefs.h b/include/uapi/linux/ipmi_msgdefs.h index c2b23a9fdf3d..0934af3b8037 100644 --- a/include/uapi/linux/ipmi_msgdefs.h +++ b/include/uapi/linux/ipmi_msgdefs.h @@ -69,6 +69,8 @@ #define IPMI_ERR_MSG_TRUNCATED 0xc6 #define IPMI_REQ_LEN_INVALID_ERR 0xc7 #define IPMI_REQ_LEN_EXCEEDED_ERR 0xc8 +#define IPMI_DEVICE_IN_FW_UPDATE_ERR 0xd1 +#define IPMI_DEVICE_IN_INIT_ERR 0xd2 #define IPMI_NOT_IN_MY_STATE_ERR 0xd5 /* IPMI 2.0 */ #define IPMI_LOST_ARBITRATION_ERR 0x81 #define IPMI_BUS_ERR 0x82 diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h new file mode 100644 index 000000000000..5406fbc13074 --- /dev/null +++ b/include/uapi/linux/pidfd.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_PIDFD_H +#define _UAPI_LINUX_PIDFD_H + +#include +#include + +/* Flags for pidfd_open(). */ +#define PIDFD_NONBLOCK O_NONBLOCK + +#endif /* _UAPI_LINUX_PIDFD_H */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index dd247747ec14..e41c21819ba0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2006,7 +2006,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); - kernfs_activate(root_cgrp->kn); ret = 0; goto out; @@ -3682,6 +3681,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, struct cgroup_subsys_state *css; int ret; + if (!nbytes) + return 0; + /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace diff --git a/kernel/exit.c b/kernel/exit.c index 733e80f334e7..1f51c27bae59 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1474,7 +1474,7 @@ end: return retval; } -static struct pid *pidfd_get_pid(unsigned int fd) +static struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { struct fd f; struct pid *pid; @@ -1484,8 +1484,10 @@ static struct pid *pidfd_get_pid(unsigned int fd) return ERR_PTR(-EBADF); pid = pidfd_pid(f.file); - if (!IS_ERR(pid)) + if (!IS_ERR(pid)) { get_pid(pid); + *flags = f.file->f_flags; + } fdput(f); return pid; @@ -1498,6 +1500,7 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, struct pid *pid = NULL; enum pid_type type; long ret; + unsigned int f_flags = 0; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) @@ -1531,9 +1534,10 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, if (upid < 0) return -EINVAL; - pid = pidfd_get_pid(upid); + pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); + break; default: return -EINVAL; @@ -1544,7 +1548,12 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, wo.wo_flags = options; wo.wo_info = infop; wo.wo_rusage = ru; + if (f_flags & O_NONBLOCK) + wo.wo_flags |= WNOHANG; + ret = do_wait(&wo); + if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK)) + ret = -EAGAIN; put_pid(pid); return ret; diff --git a/kernel/fork.c b/kernel/fork.c index ead9672b69c1..e9d82ce4efa5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2416,14 +2416,14 @@ struct mm_struct *copy_init_mm(void) * * args->exit_signal is expected to be checked for sanity by the caller. */ -long _do_fork(struct kernel_clone_args *args) +pid_t kernel_clone(struct kernel_clone_args *args) { u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; int trace = 0; - long nr; + pid_t nr; /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument @@ -2511,7 +2511,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .stack_size = (unsigned long)arg, }; - return _do_fork(&args); + return kernel_clone(&args); } #ifdef __ARCH_WANT_SYS_FORK @@ -2522,7 +2522,7 @@ SYSCALL_DEFINE0(fork) .exit_signal = SIGCHLD, }; - return _do_fork(&args); + return kernel_clone(&args); #else /* can not support in nommu mode */ return -EINVAL; @@ -2538,7 +2538,7 @@ SYSCALL_DEFINE0(vfork) .exit_signal = SIGCHLD, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif @@ -2576,7 +2576,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, .tls = tls, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif @@ -2734,7 +2734,7 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) if (!clone3_args_valid(&kargs)) return -EINVAL; - return _do_fork(&kargs); + return kernel_clone(&kargs); } #endif @@ -2897,7 +2897,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* - * functions used by _do_fork() cannot be used here directly + * functions used by kernel_clone() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. diff --git a/kernel/pid.c b/kernel/pid.c index 2d0f5f157b96..b97ad355802d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -43,6 +43,7 @@ #include #include #include +#include struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -523,7 +524,8 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) /** * pidfd_create() - Create a new pid file descriptor. * - * @pid: struct pid that the pidfd will reference + * @pid: struct pid that the pidfd will reference + * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set. * @@ -533,12 +535,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -static int pidfd_create(struct pid *pid) +static int pidfd_create(struct pid *pid, unsigned int flags) { int fd; fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - O_RDWR | O_CLOEXEC); + flags | O_RDWR | O_CLOEXEC); if (fd < 0) put_pid(pid); @@ -566,7 +568,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) int fd; struct pid *p; - if (flags) + if (flags & ~PIDFD_NONBLOCK) return -EINVAL; if (pid <= 0) @@ -577,7 +579,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) return -ESRCH; if (pid_has_task(p, PIDTYPE_TGID)) - fd = pidfd_create(p); + fd = pidfd_create(p, flags); else fd = -EINVAL; diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c index 8b718943d603..365905cb24b1 100644 --- a/samples/kprobes/kprobe_example.c +++ b/samples/kprobes/kprobe_example.c @@ -2,13 +2,13 @@ /* * NOTE: This example is works on x86 and powerpc. * Here's a sample kernel module showing the use of kprobes to dump a - * stack trace and selected registers when _do_fork() is called. + * stack trace and selected registers when kernel_clone() is called. * * For more information on theory of operation of kprobes, see * Documentation/trace/kprobes.rst * * You will see the trace data in /var/log/messages and on the console - * whenever _do_fork() is invoked to create a new process. + * whenever kernel_clone() is invoked to create a new process. */ #include @@ -16,7 +16,7 @@ #include #define MAX_SYMBOL_LEN 64 -static char symbol[MAX_SYMBOL_LEN] = "_do_fork"; +static char symbol[MAX_SYMBOL_LEN] = "kernel_clone"; module_param_string(symbol, symbol, sizeof(symbol), 0644); /* For each probe you need to allocate a kprobe structure */ diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c index 69fd1235108a..5dc1bf3baa98 100644 --- a/samples/kprobes/kretprobe_example.c +++ b/samples/kprobes/kretprobe_example.c @@ -8,7 +8,7 @@ * * usage: insmod kretprobe_example.ko func= * - * If no func_name is specified, _do_fork is instrumented + * If no func_name is specified, kernel_clone is instrumented * * For more information on theory of operation of kretprobes, see * Documentation/trace/kprobes.rst @@ -26,7 +26,7 @@ #include #include -static char func_name[NAME_MAX] = "_do_fork"; +static char func_name[NAME_MAX] = "kernel_clone"; module_param_string(func, func_name, NAME_MAX, S_IRUGO); MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the" " function's execution time"); diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index 2859f107abc8..bf02d62a3b2b 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -65,12 +65,14 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) ci = cj = ei = 0; while (ci < cmds->cnt && ei < excludes->cnt) { cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); - if (cmp < 0) + if (cmp < 0) { cmds->names[cj++] = cmds->names[ci++]; - else if (cmp == 0) - ci++, ei++; - else if (cmp > 0) + } else if (cmp == 0) { + ci++; ei++; + } else if (cmp > 0) { + ei++; + } } while (ci < cmds->cnt) diff --git a/tools/power/cpupower/utils/cpufreq-set.c b/tools/power/cpupower/utils/cpufreq-set.c index 6ed82fba5aaa..7b2164e07057 100644 --- a/tools/power/cpupower/utils/cpufreq-set.c +++ b/tools/power/cpupower/utils/cpufreq-set.c @@ -99,13 +99,17 @@ static unsigned long string_to_frequency(const char *str) continue; if (str[cp] == '.') { - while (power > -1 && isdigit(str[cp+1])) - cp++, power--; + while (power > -1 && isdigit(str[cp+1])) { + cp++; + power--; + } } - if (power >= -1) /* not enough => pad */ + if (power >= -1) { /* not enough => pad */ pad = power + 1; - else /* to much => strip */ - pad = 0, cp += power + 1; + } else { /* too much => strip */ + pad = 0; + cp += power + 1; + } /* check bounds */ if (cp <= 0 || cp + pad > NORM_FREQ_LEN - 1) return 0; diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc index 68550f97d3c3..3bcd4c3624ee 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc @@ -6,7 +6,7 @@ echo 0 > events/enable echo > dynamic_events -PLACE=_do_fork +PLACE=kernel_clone echo "p:myevent1 $PLACE" >> dynamic_events echo "r:myevent2 $PLACE" >> dynamic_events diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc b/tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc index c969be9eb7de..438961971b7e 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc @@ -6,7 +6,7 @@ echo 0 > events/enable echo > dynamic_events -PLACE=_do_fork +PLACE=kernel_clone setup_events() { echo "p:myevent1 $PLACE" >> dynamic_events diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc b/tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc index 16d543eaac88..a8603bd23e0d 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc @@ -6,7 +6,7 @@ echo 0 > events/enable echo > dynamic_events -PLACE=_do_fork +PLACE=kernel_clone setup_events() { echo "p:myevent1 $PLACE" >> dynamic_events diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc index 0f41e441c203..98305d76bd04 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc @@ -4,9 +4,9 @@ # requires: set_ftrace_filter # flags: instance -echo _do_fork:stacktrace >> set_ftrace_filter +echo kernel_clone:stacktrace >> set_ftrace_filter -grep -q "_do_fork:stacktrace:unlimited" set_ftrace_filter +grep -q "kernel_clone:stacktrace:unlimited" set_ftrace_filter (echo "forked"; sleep 1) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc b/tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc index eba858c21815..9737cd0578a7 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc @@ -3,7 +3,7 @@ # description: Kprobe dynamic event - adding and removing # requires: kprobe_events -echo p:myevent _do_fork > kprobe_events +echo p:myevent kernel_clone > kprobe_events grep myevent kprobe_events test -d events/kprobes/myevent echo > kprobe_events diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc b/tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc index d10bf4f05bc8..f9a40af76888 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc @@ -3,7 +3,7 @@ # description: Kprobe dynamic event - busy event check # requires: kprobe_events -echo p:myevent _do_fork > kprobe_events +echo p:myevent kernel_clone > kprobe_events test -d events/kprobes/myevent echo 1 > events/kprobes/myevent/enable echo > kprobe_events && exit_fail # this must fail diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc index 61f2ac441aec..eb543d3cfe5f 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc @@ -3,13 +3,13 @@ # description: Kprobe dynamic event with arguments # requires: kprobe_events -echo 'p:testprobe _do_fork $stack $stack0 +0($stack)' > kprobe_events +echo 'p:testprobe kernel_clone $stack $stack0 +0($stack)' > kprobe_events grep testprobe kprobe_events | grep -q 'arg1=\$stack arg2=\$stack0 arg3=+0(\$stack)' test -d events/kprobes/testprobe echo 1 > events/kprobes/testprobe/enable ( echo "forked") -grep testprobe trace | grep '_do_fork' | \ +grep testprobe trace | grep 'kernel_clone' | \ grep -q 'arg1=0x[[:xdigit:]]* arg2=0x[[:xdigit:]]* arg3=0x[[:xdigit:]]*$' echo 0 > events/kprobes/testprobe/enable diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc index 05aaeed6987f..4e5b63be51c9 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc @@ -5,7 +5,7 @@ grep -A1 "fetcharg:" README | grep -q "\$comm" || exit_unsupported # this is too old -echo 'p:testprobe _do_fork comm=$comm ' > kprobe_events +echo 'p:testprobe kernel_clone comm=$comm ' > kprobe_events grep testprobe kprobe_events | grep -q 'comm=$comm' test -d events/kprobes/testprobe diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc index b5fa05443b39..a1d70588ab21 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc @@ -30,13 +30,13 @@ esac : "Test get argument (1)" echo "p:testprobe tracefs_create_dir arg1=+0(${ARG1}):string" > kprobe_events echo 1 > events/kprobes/testprobe/enable -echo "p:test _do_fork" >> kprobe_events +echo "p:test kernel_clone" >> kprobe_events grep -qe "testprobe.* arg1=\"test\"" trace echo 0 > events/kprobes/testprobe/enable : "Test get argument (2)" echo "p:testprobe tracefs_create_dir arg1=+0(${ARG1}):string arg2=+0(${ARG1}):string" > kprobe_events echo 1 > events/kprobes/testprobe/enable -echo "p:test _do_fork" >> kprobe_events +echo "p:test kernel_clone" >> kprobe_events grep -qe "testprobe.* arg1=\"test\" arg2=\"test\"" trace diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc index b8c75a3d003c..bd25dd0ba0d0 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc @@ -14,12 +14,12 @@ elif ! grep "$SYMBOL\$" /proc/kallsyms; then fi : "Test get basic types symbol argument" -echo "p:testprobe_u _do_fork arg1=@linux_proc_banner:u64 arg2=@linux_proc_banner:u32 arg3=@linux_proc_banner:u16 arg4=@linux_proc_banner:u8" > kprobe_events -echo "p:testprobe_s _do_fork arg1=@linux_proc_banner:s64 arg2=@linux_proc_banner:s32 arg3=@linux_proc_banner:s16 arg4=@linux_proc_banner:s8" >> kprobe_events +echo "p:testprobe_u kernel_clone arg1=@linux_proc_banner:u64 arg2=@linux_proc_banner:u32 arg3=@linux_proc_banner:u16 arg4=@linux_proc_banner:u8" > kprobe_events +echo "p:testprobe_s kernel_clone arg1=@linux_proc_banner:s64 arg2=@linux_proc_banner:s32 arg3=@linux_proc_banner:s16 arg4=@linux_proc_banner:s8" >> kprobe_events if grep -q "x8/16/32/64" README; then - echo "p:testprobe_x _do_fork arg1=@linux_proc_banner:x64 arg2=@linux_proc_banner:x32 arg3=@linux_proc_banner:x16 arg4=@linux_proc_banner:x8" >> kprobe_events + echo "p:testprobe_x kernel_clone arg1=@linux_proc_banner:x64 arg2=@linux_proc_banner:x32 arg3=@linux_proc_banner:x16 arg4=@linux_proc_banner:x8" >> kprobe_events fi -echo "p:testprobe_bf _do_fork arg1=@linux_proc_banner:b8@4/32" >> kprobe_events +echo "p:testprobe_bf kernel_clone arg1=@linux_proc_banner:b8@4/32" >> kprobe_events echo 1 > events/kprobes/enable (echo "forked") echo 0 > events/kprobes/enable @@ -27,7 +27,7 @@ grep "testprobe_[usx]:.* arg1=.* arg2=.* arg3=.* arg4=.*" trace grep "testprobe_bf:.* arg1=.*" trace : "Test get string symbol argument" -echo "p:testprobe_str _do_fork arg1=@linux_proc_banner:string" > kprobe_events +echo "p:testprobe_str kernel_clone arg1=@linux_proc_banner:string" > kprobe_events echo 1 > events/kprobes/enable (echo "forked") echo 0 > events/kprobes/enable diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc index 0610e0b5587c..91fcce1c241c 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc @@ -4,7 +4,7 @@ # requires: kprobe_events "x8/16/32/64":README gen_event() { # Bitsize - echo "p:testprobe _do_fork \$stack0:s$1 \$stack0:u$1 \$stack0:x$1 \$stack0:b4@4/$1" + echo "p:testprobe kernel_clone \$stack0:s$1 \$stack0:u$1 \$stack0:x$1 \$stack0:b4@4/$1" } check_types() { # s-type u-type x-type bf-type width diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc index 81d8b58c03bc..0d179094191f 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc @@ -5,29 +5,29 @@ # prepare echo nop > current_tracer -echo _do_fork > set_ftrace_filter -echo 'p:testprobe _do_fork' > kprobe_events +echo kernel_clone > set_ftrace_filter +echo 'p:testprobe kernel_clone' > kprobe_events # kprobe on / ftrace off echo 1 > events/kprobes/testprobe/enable echo > trace ( echo "forked") grep testprobe trace -! grep '_do_fork <-' trace +! grep 'kernel_clone <-' trace # kprobe on / ftrace on echo function > current_tracer echo > trace ( echo "forked") grep testprobe trace -grep '_do_fork <-' trace +grep 'kernel_clone <-' trace # kprobe off / ftrace on echo 0 > events/kprobes/testprobe/enable echo > trace ( echo "forked") ! grep testprobe trace -grep '_do_fork <-' trace +grep 'kernel_clone <-' trace # kprobe on / ftrace on echo 1 > events/kprobes/testprobe/enable @@ -35,11 +35,11 @@ echo function > current_tracer echo > trace ( echo "forked") grep testprobe trace -grep '_do_fork <-' trace +grep 'kernel_clone <-' trace # kprobe on / ftrace off echo nop > current_tracer echo > trace ( echo "forked") grep testprobe trace -! grep '_do_fork <-' trace +! grep 'kernel_clone <-' trace diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc index 366b7e1b6718..45d90b6c763d 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc @@ -4,7 +4,7 @@ # requires: kprobe_events "Create/append/":README # Choose 2 symbols for target -SYM1=_do_fork +SYM1=kernel_clone SYM2=do_exit EVENT_NAME=kprobes/testevent diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc index b4d834675e59..c02ea50d63ea 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc @@ -86,15 +86,15 @@ esac # multiprobe errors if grep -q "Create/append/" README && grep -q "imm-value" README; then -echo 'p:kprobes/testevent _do_fork' > kprobe_events +echo 'p:kprobes/testevent kernel_clone' > kprobe_events check_error '^r:kprobes/testevent do_exit' # DIFF_PROBE_TYPE # Explicitly use printf "%s" to not interpret \1 -printf "%s" 'p:kprobes/testevent _do_fork abcd=\1' > kprobe_events -check_error 'p:kprobes/testevent _do_fork ^bcd=\1' # DIFF_ARG_TYPE -check_error 'p:kprobes/testevent _do_fork ^abcd=\1:u8' # DIFF_ARG_TYPE -check_error 'p:kprobes/testevent _do_fork ^abcd=\"foo"' # DIFF_ARG_TYPE -check_error '^p:kprobes/testevent _do_fork abcd=\1' # SAME_PROBE +printf "%s" 'p:kprobes/testevent kernel_clone abcd=\1' > kprobe_events +check_error 'p:kprobes/testevent kernel_clone ^bcd=\1' # DIFF_ARG_TYPE +check_error 'p:kprobes/testevent kernel_clone ^abcd=\1:u8' # DIFF_ARG_TYPE +check_error 'p:kprobes/testevent kernel_clone ^abcd=\"foo"' # DIFF_ARG_TYPE +check_error '^p:kprobes/testevent kernel_clone abcd=\1' # SAME_PROBE fi exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc index 523fde6d1aa5..7ae492c204a4 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc @@ -4,14 +4,14 @@ # requires: kprobe_events # Add new kretprobe event -echo 'r:testprobe2 _do_fork $retval' > kprobe_events +echo 'r:testprobe2 kernel_clone $retval' > kprobe_events grep testprobe2 kprobe_events | grep -q 'arg1=\$retval' test -d events/kprobes/testprobe2 echo 1 > events/kprobes/testprobe2/enable ( echo "forked") -cat trace | grep testprobe2 | grep -q '<- _do_fork' +cat trace | grep testprobe2 | grep -q '<- kernel_clone' echo 0 > events/kprobes/testprobe2/enable echo '-:testprobe2' >> kprobe_events diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc b/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc index ff6c44adc8a0..c4093fc1a773 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc @@ -4,7 +4,7 @@ # requires: kprobe_events ! grep -q 'myevent' kprobe_profile -echo p:myevent _do_fork > kprobe_events +echo p:myevent kernel_clone > kprobe_events grep -q 'myevent[[:space:]]*0[[:space:]]*0$' kprobe_profile echo 1 > events/kprobes/myevent/enable ( echo "forked" ) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 4f78e4805633..f19804df244c 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -971,6 +971,11 @@ void __run_test(struct __fixture_metadata *f, ksft_print_msg(" RUN %s%s%s.%s ...\n", f->name, variant->name[0] ? "." : "", variant->name, t->name); + + /* Make sure output buffers are flushed before fork */ + fflush(stdout); + fflush(stderr); + t->pid = fork(); if (t->pid < 0) { ksft_print_msg("ERROR SPAWNING TEST CHILD\n"); diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index a2c80914e3dc..01f8d3c0cf2c 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -46,6 +46,10 @@ #define __NR_pidfd_getfd -1 #endif +#ifndef PIDFD_NONBLOCK +#define PIDFD_NONBLOCK O_NONBLOCK +#endif + /* * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c * That means, when it wraps around any pid < 300 will be skipped. diff --git a/tools/testing/selftests/pidfd/pidfd_wait.c b/tools/testing/selftests/pidfd/pidfd_wait.c index 7079f8eef792..be2943f072f6 100644 --- a/tools/testing/selftests/pidfd/pidfd_wait.c +++ b/tools/testing/selftests/pidfd/pidfd_wait.c @@ -17,10 +17,15 @@ #include #include "pidfd.h" -#include "../kselftest.h" +#include "../kselftest_harness.h" #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) +/* Attempt to de-conflict with the selftests tree. */ +#ifndef SKIP +#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__) +#endif + static pid_t sys_clone3(struct clone_args *args) { return syscall(__NR_clone3, args, sizeof(struct clone_args)); @@ -32,9 +37,8 @@ static int sys_waitid(int which, pid_t pid, siginfo_t *info, int options, return syscall(__NR_waitid, which, pid, info, options, ru); } -static int test_pidfd_wait_simple(void) +TEST(wait_simple) { - const char *test_name = "pidfd wait simple"; int pidfd = -1, status = 0; pid_t parent_tid = -1; struct clone_args args = { @@ -50,76 +54,40 @@ static int test_pidfd_wait_simple(void) }; pidfd = open("/proc/self", O_DIRECTORY | O_RDONLY | O_CLOEXEC); - if (pidfd < 0) - ksft_exit_fail_msg("%s test: failed to open /proc/self %s\n", - test_name, strerror(errno)); + ASSERT_GE(pidfd, 0); pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); - if (pid == 0) - ksft_exit_fail_msg( - "%s test: succeeded to wait on invalid pidfd %s\n", - test_name, strerror(errno)); - close(pidfd); + ASSERT_NE(pid, 0); + EXPECT_EQ(close(pidfd), 0); pidfd = -1; pidfd = open("/dev/null", O_RDONLY | O_CLOEXEC); - if (pidfd == 0) - ksft_exit_fail_msg("%s test: failed to open /dev/null %s\n", - test_name, strerror(errno)); + ASSERT_GE(pidfd, 0); pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); - if (pid == 0) - ksft_exit_fail_msg( - "%s test: succeeded to wait on invalid pidfd %s\n", - test_name, strerror(errno)); - close(pidfd); + ASSERT_NE(pid, 0); + EXPECT_EQ(close(pidfd), 0); pidfd = -1; pid = sys_clone3(&args); - if (pid < 0) - ksft_exit_fail_msg("%s test: failed to create new process %s\n", - test_name, strerror(errno)); + ASSERT_GE(pid, 0); if (pid == 0) exit(EXIT_SUCCESS); pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); - if (pid < 0) - ksft_exit_fail_msg( - "%s test: failed to wait on process with pid %d and pidfd %d: %s\n", - test_name, parent_tid, pidfd, strerror(errno)); + ASSERT_GE(pid, 0); + ASSERT_EQ(WIFEXITED(info.si_status), true); + ASSERT_EQ(WEXITSTATUS(info.si_status), 0); + EXPECT_EQ(close(pidfd), 0); - if (!WIFEXITED(info.si_status) || WEXITSTATUS(info.si_status)) - ksft_exit_fail_msg( - "%s test: unexpected status received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, parent_tid, pidfd, strerror(errno)); - close(pidfd); - - if (info.si_signo != SIGCHLD) - ksft_exit_fail_msg( - "%s test: unexpected si_signo value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_signo, parent_tid, pidfd, - strerror(errno)); - - if (info.si_code != CLD_EXITED) - ksft_exit_fail_msg( - "%s test: unexpected si_code value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_code, parent_tid, pidfd, - strerror(errno)); - - if (info.si_pid != parent_tid) - ksft_exit_fail_msg( - "%s test: unexpected si_pid value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_pid, parent_tid, pidfd, - strerror(errno)); - - ksft_test_result_pass("%s test: Passed\n", test_name); - return 0; + ASSERT_EQ(info.si_signo, SIGCHLD); + ASSERT_EQ(info.si_code, CLD_EXITED); + ASSERT_EQ(info.si_pid, parent_tid); } -static int test_pidfd_wait_states(void) +TEST(wait_states) { - const char *test_name = "pidfd wait states"; int pidfd = -1, status = 0; pid_t parent_tid = -1; struct clone_args args = { @@ -135,9 +103,7 @@ static int test_pidfd_wait_states(void) }; pid = sys_clone3(&args); - if (pid < 0) - ksft_exit_fail_msg("%s test: failed to create new process %s\n", - test_name, strerror(errno)); + ASSERT_GE(pid, 0); if (pid == 0) { kill(getpid(), SIGSTOP); @@ -145,127 +111,115 @@ static int test_pidfd_wait_states(void) exit(EXIT_SUCCESS); } - ret = sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL); - if (ret < 0) - ksft_exit_fail_msg( - "%s test: failed to wait on WSTOPPED process with pid %d and pidfd %d: %s\n", - test_name, parent_tid, pidfd, strerror(errno)); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0); + ASSERT_EQ(info.si_signo, SIGCHLD); + ASSERT_EQ(info.si_code, CLD_STOPPED); + ASSERT_EQ(info.si_pid, parent_tid); - if (info.si_signo != SIGCHLD) - ksft_exit_fail_msg( - "%s test: unexpected si_signo value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_signo, parent_tid, pidfd, - strerror(errno)); + ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0); - if (info.si_code != CLD_STOPPED) - ksft_exit_fail_msg( - "%s test: unexpected si_code value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_code, parent_tid, pidfd, - strerror(errno)); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED, NULL), 0); + ASSERT_EQ(info.si_signo, SIGCHLD); + ASSERT_EQ(info.si_code, CLD_CONTINUED); + ASSERT_EQ(info.si_pid, parent_tid); - if (info.si_pid != parent_tid) - ksft_exit_fail_msg( - "%s test: unexpected si_pid value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_pid, parent_tid, pidfd, - strerror(errno)); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED, NULL), 0); + ASSERT_EQ(info.si_signo, SIGCHLD); + ASSERT_EQ(info.si_code, CLD_STOPPED); + ASSERT_EQ(info.si_pid, parent_tid); - ret = sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0); - if (ret < 0) - ksft_exit_fail_msg( - "%s test: failed to send signal to process with pid %d and pidfd %d: %s\n", - test_name, parent_tid, pidfd, strerror(errno)); + ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0); - ret = sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED, NULL); - if (ret < 0) - ksft_exit_fail_msg( - "%s test: failed to wait WCONTINUED on process with pid %d and pidfd %d: %s\n", - test_name, parent_tid, pidfd, strerror(errno)); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0); + ASSERT_EQ(info.si_signo, SIGCHLD); + ASSERT_EQ(info.si_code, CLD_KILLED); + ASSERT_EQ(info.si_pid, parent_tid); - if (info.si_signo != SIGCHLD) - ksft_exit_fail_msg( - "%s test: unexpected si_signo value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_signo, parent_tid, pidfd, - strerror(errno)); + EXPECT_EQ(close(pidfd), 0); +} - if (info.si_code != CLD_CONTINUED) - ksft_exit_fail_msg( - "%s test: unexpected si_code value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_code, parent_tid, pidfd, - strerror(errno)); +TEST(wait_nonblock) +{ + int pidfd, status = 0; + unsigned int flags = 0; + pid_t parent_tid = -1; + struct clone_args args = { + .parent_tid = ptr_to_u64(&parent_tid), + .flags = CLONE_PARENT_SETTID, + .exit_signal = SIGCHLD, + }; + int ret; + pid_t pid; + siginfo_t info = { + .si_signo = 0, + }; - if (info.si_pid != parent_tid) - ksft_exit_fail_msg( - "%s test: unexpected si_pid value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_pid, parent_tid, pidfd, - strerror(errno)); - - ret = sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED, NULL); - if (ret < 0) - ksft_exit_fail_msg( - "%s test: failed to wait on WUNTRACED process with pid %d and pidfd %d: %s\n", - test_name, parent_tid, pidfd, strerror(errno)); - - if (info.si_signo != SIGCHLD) - ksft_exit_fail_msg( - "%s test: unexpected si_signo value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_signo, parent_tid, pidfd, - strerror(errno)); - - if (info.si_code != CLD_STOPPED) - ksft_exit_fail_msg( - "%s test: unexpected si_code value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_code, parent_tid, pidfd, - strerror(errno)); - - if (info.si_pid != parent_tid) - ksft_exit_fail_msg( - "%s test: unexpected si_pid value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_pid, parent_tid, pidfd, - strerror(errno)); - - ret = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0); - if (ret < 0) - ksft_exit_fail_msg( - "%s test: failed to send SIGKILL to process with pid %d and pidfd %d: %s\n", - test_name, parent_tid, pidfd, strerror(errno)); + /* + * Callers need to see ECHILD with non-blocking pidfds when no child + * processes exists. + */ + pidfd = sys_pidfd_open(getpid(), PIDFD_NONBLOCK); + EXPECT_GE(pidfd, 0) { + /* pidfd_open() doesn't support PIDFD_NONBLOCK. */ + ASSERT_EQ(errno, EINVAL); + SKIP(return, "Skipping PIDFD_NONBLOCK test"); + } ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); - if (ret < 0) - ksft_exit_fail_msg( - "%s test: failed to wait on WEXITED process with pid %d and pidfd %d: %s\n", - test_name, parent_tid, pidfd, strerror(errno)); + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, ECHILD); + EXPECT_EQ(close(pidfd), 0); - if (info.si_signo != SIGCHLD) - ksft_exit_fail_msg( - "%s test: unexpected si_signo value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_signo, parent_tid, pidfd, - strerror(errno)); + pid = sys_clone3(&args); + ASSERT_GE(pid, 0); - if (info.si_code != CLD_KILLED) - ksft_exit_fail_msg( - "%s test: unexpected si_code value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_code, parent_tid, pidfd, - strerror(errno)); + if (pid == 0) { + kill(getpid(), SIGSTOP); + exit(EXIT_SUCCESS); + } - if (info.si_pid != parent_tid) - ksft_exit_fail_msg( - "%s test: unexpected si_pid value %d received after waiting on process with pid %d and pidfd %d: %s\n", - test_name, info.si_pid, parent_tid, pidfd, - strerror(errno)); + pidfd = sys_pidfd_open(pid, PIDFD_NONBLOCK); + EXPECT_GE(pidfd, 0) { + /* pidfd_open() doesn't support PIDFD_NONBLOCK. */ + ASSERT_EQ(errno, EINVAL); + SKIP(return, "Skipping PIDFD_NONBLOCK test"); + } - close(pidfd); + flags = fcntl(pidfd, F_GETFL, 0); + ASSERT_GT(flags, 0); + ASSERT_GT((flags & O_NONBLOCK), 0); - ksft_test_result_pass("%s test: Passed\n", test_name); - return 0; + /* + * Callers need to see EAGAIN/EWOULDBLOCK with non-blocking pidfd when + * child processes exist but none have exited. + */ + ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EAGAIN); + + /* + * Callers need to continue seeing 0 with non-blocking pidfd and + * WNOHANG raised explicitly when child processes exist but none have + * exited. + */ + ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG, NULL); + ASSERT_EQ(ret, 0); + + ASSERT_EQ(fcntl(pidfd, F_SETFL, (flags & ~O_NONBLOCK)), 0); + + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0); + ASSERT_EQ(info.si_signo, SIGCHLD); + ASSERT_EQ(info.si_code, CLD_STOPPED); + ASSERT_EQ(info.si_pid, parent_tid); + + ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0); + + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0); + ASSERT_EQ(info.si_signo, SIGCHLD); + ASSERT_EQ(info.si_code, CLD_EXITED); + ASSERT_EQ(info.si_pid, parent_tid); + + EXPECT_EQ(close(pidfd), 0); } -int main(int argc, char **argv) -{ - ksft_print_header(); - ksft_set_plan(2); - - test_pidfd_wait_simple(); - test_pidfd_wait_states(); - - return ksft_exit_pass(); -} +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c index 31f8bb086907..1d4359341e44 100644 --- a/tools/testing/selftests/vm/gup_benchmark.c +++ b/tools/testing/selftests/vm/gup_benchmark.c @@ -105,12 +105,16 @@ int main(int argc, char **argv) gup.flags |= FOLL_WRITE; fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR); - if (fd == -1) - perror("open"), exit(1); + if (fd == -1) { + perror("open"); + exit(1); + } p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); - if (p == MAP_FAILED) - perror("mmap"), exit(1); + if (p == MAP_FAILED) { + perror("mmap"); + exit(1); + } gup.addr = (unsigned long)p; if (thp == 1) @@ -123,8 +127,10 @@ int main(int argc, char **argv) for (i = 0; i < repeats; i++) { gup.size = size; - if (ioctl(fd, cmd, &gup)) - perror("ioctl"), exit(1); + if (ioctl(fd, cmd, &gup)) { + perror("ioctl"); + exit(1); + } printf("Time: get:%lld put:%lld us", gup.get_delta_usec, gup.put_delta_usec); diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 61e5cfeb1350..9b0912a01777 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -227,8 +227,10 @@ static void hugetlb_allocate_area(void **alloc_area) huge_fd, *alloc_area == area_src ? 0 : nr_pages * page_size); if (area_alias == MAP_FAILED) { - if (munmap(*alloc_area, nr_pages * page_size) < 0) - perror("hugetlb munmap"), exit(1); + if (munmap(*alloc_area, nr_pages * page_size) < 0) { + perror("hugetlb munmap"); + exit(1); + } *alloc_area = NULL; return; } @@ -337,9 +339,10 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp) /* Undo write-protect, do wakeup after that */ prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; - if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) - fprintf(stderr, "clear WP failed for address 0x%Lx\n", - start), exit(1); + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) { + fprintf(stderr, "clear WP failed for address 0x%Lx\n", start); + exit(1); + } } static void *locking_thread(void *arg) @@ -359,8 +362,10 @@ static void *locking_thread(void *arg) seed += cpu; bzero(&rand, sizeof(rand)); bzero(&randstate, sizeof(randstate)); - if (initstate_r(seed, randstate, sizeof(randstate), &rand)) - fprintf(stderr, "srandom_r error\n"), exit(1); + if (initstate_r(seed, randstate, sizeof(randstate), &rand)) { + fprintf(stderr, "srandom_r error\n"); + exit(1); + } } else { page_nr = -bounces; if (!(bounces & BOUNCE_RACINGFAULTS)) @@ -369,12 +374,16 @@ static void *locking_thread(void *arg) while (!finished) { if (bounces & BOUNCE_RANDOM) { - if (random_r(&rand, &rand_nr)) - fprintf(stderr, "random_r 1 error\n"), exit(1); + if (random_r(&rand, &rand_nr)) { + fprintf(stderr, "random_r 1 error\n"); + exit(1); + } page_nr = rand_nr; if (sizeof(page_nr) > sizeof(rand_nr)) { - if (random_r(&rand, &rand_nr)) - fprintf(stderr, "random_r 2 error\n"), exit(1); + if (random_r(&rand, &rand_nr)) { + fprintf(stderr, "random_r 2 error\n"); + exit(1); + } page_nr |= (((unsigned long) rand_nr) << 16) << 16; } @@ -385,11 +394,13 @@ static void *locking_thread(void *arg) start = time(NULL); if (bounces & BOUNCE_VERIFY) { count = *area_count(area_dst, page_nr); - if (!count) + if (!count) { fprintf(stderr, "page_nr %lu wrong count %Lu %Lu\n", page_nr, count, - count_verify[page_nr]), exit(1); + count_verify[page_nr]); + exit(1); + } /* @@ -401,11 +412,12 @@ static void *locking_thread(void *arg) */ #if 1 if (!my_bcmp(area_dst + page_nr * page_size, zeropage, - page_size)) + page_size)) { fprintf(stderr, "my_bcmp page_nr %lu wrong count %Lu %Lu\n", - page_nr, count, - count_verify[page_nr]), exit(1); + page_nr, count, count_verify[page_nr]); + exit(1); + } #else unsigned long loops; @@ -437,7 +449,7 @@ static void *locking_thread(void *arg) fprintf(stderr, "page_nr %lu memory corruption %Lu %Lu\n", page_nr, count, - count_verify[page_nr]), exit(1); + count_verify[page_nr]); exit(1); } count++; *area_count(area_dst, page_nr) = count_verify[page_nr] = count; @@ -461,12 +473,14 @@ static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, offset); if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { /* real retval in ufdio_copy.copy */ - if (uffdio_copy->copy != -EEXIST) + if (uffdio_copy->copy != -EEXIST) { fprintf(stderr, "UFFDIO_COPY retry error %Ld\n", - uffdio_copy->copy), exit(1); + uffdio_copy->copy); + exit(1); + } } else { fprintf(stderr, "UFFDIO_COPY retry unexpected %Ld\n", - uffdio_copy->copy), exit(1); + uffdio_copy->copy); exit(1); } } @@ -474,9 +488,10 @@ static int __copy_page(int ufd, unsigned long offset, bool retry) { struct uffdio_copy uffdio_copy; - if (offset >= nr_pages * page_size) - fprintf(stderr, "unexpected offset %lu\n", - offset), exit(1); + if (offset >= nr_pages * page_size) { + fprintf(stderr, "unexpected offset %lu\n", offset); + exit(1); + } uffdio_copy.dst = (unsigned long) area_dst + offset; uffdio_copy.src = (unsigned long) area_src + offset; uffdio_copy.len = page_size; @@ -487,12 +502,14 @@ static int __copy_page(int ufd, unsigned long offset, bool retry) uffdio_copy.copy = 0; if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { /* real retval in ufdio_copy.copy */ - if (uffdio_copy.copy != -EEXIST) + if (uffdio_copy.copy != -EEXIST) { fprintf(stderr, "UFFDIO_COPY error %Ld\n", - uffdio_copy.copy), exit(1); + uffdio_copy.copy); + exit(1); + } } else if (uffdio_copy.copy != page_size) { fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n", - uffdio_copy.copy), exit(1); + uffdio_copy.copy); exit(1); } else { if (test_uffdio_copy_eexist && retry) { test_uffdio_copy_eexist = false; @@ -521,11 +538,11 @@ static int uffd_read_msg(int ufd, struct uffd_msg *msg) if (ret < 0) { if (errno == EAGAIN) return 1; - else - perror("blocking read error"), exit(1); + perror("blocking read error"); } else { - fprintf(stderr, "short read\n"), exit(1); + fprintf(stderr, "short read\n"); } + exit(1); } return 0; @@ -536,9 +553,10 @@ static void uffd_handle_page_fault(struct uffd_msg *msg, { unsigned long offset; - if (msg->event != UFFD_EVENT_PAGEFAULT) - fprintf(stderr, "unexpected msg event %u\n", - msg->event), exit(1); + if (msg->event != UFFD_EVENT_PAGEFAULT) { + fprintf(stderr, "unexpected msg event %u\n", msg->event); + exit(1); + } if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { wp_range(uffd, msg->arg.pagefault.address, page_size, false); @@ -546,8 +564,10 @@ static void uffd_handle_page_fault(struct uffd_msg *msg, } else { /* Missing page faults */ if (bounces & BOUNCE_VERIFY && - msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - fprintf(stderr, "unexpected write fault\n"), exit(1); + msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) { + fprintf(stderr, "unexpected write fault\n"); + exit(1); + } offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; offset &= ~(page_size-1); @@ -574,25 +594,32 @@ static void *uffd_poll_thread(void *arg) for (;;) { ret = poll(pollfd, 2, -1); - if (!ret) - fprintf(stderr, "poll error %d\n", ret), exit(1); - if (ret < 0) - perror("poll"), exit(1); + if (!ret) { + fprintf(stderr, "poll error %d\n", ret); + exit(1); + } + if (ret < 0) { + perror("poll"); + exit(1); + } if (pollfd[1].revents & POLLIN) { - if (read(pollfd[1].fd, &tmp_chr, 1) != 1) - fprintf(stderr, "read pipefd error\n"), - exit(1); + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) { + fprintf(stderr, "read pipefd error\n"); + exit(1); + } break; } - if (!(pollfd[0].revents & POLLIN)) + if (!(pollfd[0].revents & POLLIN)) { fprintf(stderr, "pollfd[0].revents %d\n", - pollfd[0].revents), exit(1); + pollfd[0].revents); + exit(1); + } if (uffd_read_msg(uffd, &msg)) continue; switch (msg.event) { default: fprintf(stderr, "unexpected msg event %u\n", - msg.event), exit(1); + msg.event); exit(1); break; case UFFD_EVENT_PAGEFAULT: uffd_handle_page_fault(&msg, stats); @@ -606,8 +633,10 @@ static void *uffd_poll_thread(void *arg) uffd_reg.range.start = msg.arg.remove.start; uffd_reg.range.len = msg.arg.remove.end - msg.arg.remove.start; - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) - fprintf(stderr, "remove failure\n"), exit(1); + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) { + fprintf(stderr, "remove failure\n"); + exit(1); + } break; case UFFD_EVENT_REMAP: area_dst = (char *)(unsigned long)msg.arg.remap.to; @@ -879,8 +908,10 @@ static int faulting_process(int signal_test) area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, area_src); - if (area_dst == MAP_FAILED) - perror("mremap"), exit(1); + if (area_dst == MAP_FAILED) { + perror("mremap"); + exit(1); + } for (; nr < nr_pages; nr++) { count = *area_count(area_dst, nr); @@ -888,7 +919,7 @@ static int faulting_process(int signal_test) fprintf(stderr, "nr %lu memory corruption %Lu %Lu\n", nr, count, - count_verify[nr]), exit(1); + count_verify[nr]); exit(1); } /* * Trigger write protection if there is by writting @@ -901,8 +932,10 @@ static int faulting_process(int signal_test) return 1; for (nr = 0; nr < nr_pages; nr++) { - if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) - fprintf(stderr, "nr %lu is not zero\n", nr), exit(1); + if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) { + fprintf(stderr, "nr %lu is not zero\n", nr); + exit(1); + } } return 0; @@ -916,12 +949,14 @@ static void retry_uffdio_zeropage(int ufd, uffdio_zeropage->range.len, offset); if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { - if (uffdio_zeropage->zeropage != -EEXIST) + if (uffdio_zeropage->zeropage != -EEXIST) { fprintf(stderr, "UFFDIO_ZEROPAGE retry error %Ld\n", - uffdio_zeropage->zeropage), exit(1); + uffdio_zeropage->zeropage); + exit(1); + } } else { fprintf(stderr, "UFFDIO_ZEROPAGE retry unexpected %Ld\n", - uffdio_zeropage->zeropage), exit(1); + uffdio_zeropage->zeropage); exit(1); } } @@ -933,9 +968,10 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE); - if (offset >= nr_pages * page_size) - fprintf(stderr, "unexpected offset %lu\n", - offset), exit(1); + if (offset >= nr_pages * page_size) { + fprintf(stderr, "unexpected offset %lu\n", offset); + exit(1); + } uffdio_zeropage.range.start = (unsigned long) area_dst + offset; uffdio_zeropage.range.len = page_size; uffdio_zeropage.mode = 0; @@ -943,22 +979,26 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) if (ret) { /* real retval in ufdio_zeropage.zeropage */ if (has_zeropage) { - if (uffdio_zeropage.zeropage == -EEXIST) - fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"), - exit(1); - else + if (uffdio_zeropage.zeropage == -EEXIST) { + fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"); + exit(1); + } else { fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n", - uffdio_zeropage.zeropage), exit(1); + uffdio_zeropage.zeropage); + exit(1); + } } else { - if (uffdio_zeropage.zeropage != -EINVAL) + if (uffdio_zeropage.zeropage != -EINVAL) { fprintf(stderr, "UFFDIO_ZEROPAGE not -EINVAL %Ld\n", - uffdio_zeropage.zeropage), exit(1); + uffdio_zeropage.zeropage); + exit(1); + } } } else if (has_zeropage) { if (uffdio_zeropage.zeropage != page_size) { fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n", - uffdio_zeropage.zeropage), exit(1); + uffdio_zeropage.zeropage); exit(1); } else { if (test_uffdio_zeropage_eexist && retry) { test_uffdio_zeropage_eexist = false; @@ -970,7 +1010,7 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) } else { fprintf(stderr, "UFFDIO_ZEROPAGE succeeded %Ld\n", - uffdio_zeropage.zeropage), exit(1); + uffdio_zeropage.zeropage); exit(1); } return 0; @@ -1000,19 +1040,24 @@ static int userfaultfd_zeropage_test(void) uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; if (test_uffdio_wp) uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - fprintf(stderr, "register failure\n"), exit(1); + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { + fprintf(stderr, "register failure\n"); + exit(1); + } expected_ioctls = uffd_test_ops->expected_ioctls; if ((uffdio_register.ioctls & expected_ioctls) != - expected_ioctls) + expected_ioctls) { fprintf(stderr, - "unexpected missing ioctl for anon memory\n"), - exit(1); + "unexpected missing ioctl for anon memory\n"); + exit(1); + } if (uffdio_zeropage(uffd, 0)) { - if (my_bcmp(area_dst, zeropage, page_size)) - fprintf(stderr, "zeropage is not zero\n"), exit(1); + if (my_bcmp(area_dst, zeropage, page_size)) { + fprintf(stderr, "zeropage is not zero\n"); + exit(1); + } } close(uffd); @@ -1047,32 +1092,41 @@ static int userfaultfd_events_test(void) uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; if (test_uffdio_wp) uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - fprintf(stderr, "register failure\n"), exit(1); + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { + fprintf(stderr, "register failure\n"); + exit(1); + } expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != - expected_ioctls) - fprintf(stderr, - "unexpected missing ioctl for anon memory\n"), - exit(1); + if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { + fprintf(stderr, "unexpected missing ioctl for anon memory\n"); + exit(1); + } - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - perror("uffd_poll_thread create"), exit(1); + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) { + perror("uffd_poll_thread create"); + exit(1); + } pid = fork(); - if (pid < 0) - perror("fork"), exit(1); + if (pid < 0) { + perror("fork"); + exit(1); + } if (!pid) return faulting_process(0); waitpid(pid, &err, 0); - if (err) - fprintf(stderr, "faulting process failed\n"), exit(1); + if (err) { + fprintf(stderr, "faulting process failed\n"); + exit(1); + } - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - perror("pipe write"), exit(1); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) { + perror("pipe write"); + exit(1); + } if (pthread_join(uffd_mon, NULL)) return 1; @@ -1110,38 +1164,49 @@ static int userfaultfd_sig_test(void) uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; if (test_uffdio_wp) uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - fprintf(stderr, "register failure\n"), exit(1); + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { + fprintf(stderr, "register failure\n"); + exit(1); + } expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != - expected_ioctls) - fprintf(stderr, - "unexpected missing ioctl for anon memory\n"), - exit(1); + if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { + fprintf(stderr, "unexpected missing ioctl for anon memory\n"); + exit(1); + } - if (faulting_process(1)) - fprintf(stderr, "faulting process failed\n"), exit(1); + if (faulting_process(1)) { + fprintf(stderr, "faulting process failed\n"); + exit(1); + } if (uffd_test_ops->release_pages(area_dst)) return 1; - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - perror("uffd_poll_thread create"), exit(1); + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) { + perror("uffd_poll_thread create"); + exit(1); + } pid = fork(); - if (pid < 0) - perror("fork"), exit(1); + if (pid < 0) { + perror("fork"); + exit(1); + } if (!pid) exit(faulting_process(2)); waitpid(pid, &err, 0); - if (err) - fprintf(stderr, "faulting process failed\n"), exit(1); + if (err) { + fprintf(stderr, "faulting process failed\n"); + exit(1); + } - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - perror("pipe write"), exit(1); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) { + perror("pipe write"); + exit(1); + } if (pthread_join(uffd_mon, (void **)&userfaults)) return 1; @@ -1395,7 +1460,7 @@ static void set_test_type(const char *type) test_type = TEST_SHMEM; uffd_test_ops = &shmem_uffd_test_ops; } else { - fprintf(stderr, "Unknown test type: %s\n", type), exit(1); + fprintf(stderr, "Unknown test type: %s\n", type); exit(1); } if (test_type == TEST_HUGETLB) @@ -1403,12 +1468,15 @@ static void set_test_type(const char *type) else page_size = sysconf(_SC_PAGE_SIZE); - if (!page_size) - fprintf(stderr, "Unable to determine page size\n"), - exit(2); + if (!page_size) { + fprintf(stderr, "Unable to determine page size\n"); + exit(2); + } if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 - > page_size) - fprintf(stderr, "Impossible to run this test\n"), exit(2); + > page_size) { + fprintf(stderr, "Impossible to run this test\n"); + exit(2); + } } static void sigalrm(int sig) @@ -1425,8 +1493,10 @@ int main(int argc, char **argv) if (argc < 4) usage(); - if (signal(SIGALRM, sigalrm) == SIG_ERR) - fprintf(stderr, "failed to arm SIGALRM"), exit(1); + if (signal(SIGALRM, sigalrm) == SIG_ERR) { + fprintf(stderr, "failed to arm SIGALRM"); + exit(1); + } alarm(ALARM_INTERVAL_SECS); set_test_type(argv[1]);