From b3830e8d478cd9fe33e820425ce431c8ef280967 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 1 Aug 2016 12:05:02 +0200 Subject: [PATCH 001/538] x86/entry: Remove duplicated comment Ok, ok, we see it is called from C :-) Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160801100502.29796-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b846875aeea6..8956eae04c25 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -347,8 +347,7 @@ ENTRY(stub_ptregs_64) jmp entry_SYSCALL64_slow_path 1: - /* Called from C */ - jmp *%rax /* called from C */ + jmp *%rax /* Called from C */ END(stub_ptregs_64) .macro ptregs_stub func From a9da291f25f014c8ee999f498305949332d58cd6 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Thu, 4 Aug 2016 04:30:37 +0000 Subject: [PATCH 002/538] dt-bindings: irqchip: Add J-Core interrupt controller bindings Signed-off-by: Rich Felker Acked-by: Rob Herring Link: https://lkml.kernel.org/r/c8aae4597153595cf965efe96422f699639c9d51.147018b6529.git.dalias@libc.org Signed-off-by: Jason Cooper --- .../interrupt-controller/jcore,aic.txt | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 Documentation/devicetree/bindings/interrupt-controller/jcore,aic.txt diff --git a/Documentation/devicetree/bindings/interrupt-controller/jcore,aic.txt b/Documentation/devicetree/bindings/interrupt-controller/jcore,aic.txt new file mode 100644 index 000000000000..ee2ad36f8df8 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/jcore,aic.txt @@ -0,0 +1,26 @@ +J-Core Advanced Interrupt Controller + +Required properties: + +- compatible: Should be "jcore,aic1" for the (obsolete) first-generation aic + with 8 interrupt lines with programmable priorities, or "jcore,aic2" for + the "aic2" core with 64 interrupts. + +- reg: Memory region(s) for configuration. For SMP, there should be one + region per cpu, indexed by the sequential, zero-based hardware cpu + number. + +- interrupt-controller: Identifies the node as an interrupt controller + +- #interrupt-cells: Specifies the number of cells needed to encode an + interrupt source. The value shall be 1. + + +Example: + +aic: interrupt-controller@200 { + compatible = "jcore,aic2"; + reg = < 0x200 0x30 0x500 0x30 >; + interrupt-controller; + #interrupt-cells = <1>; +}; From 981b58f66cfcd32dc4ebbaeef8451daf393b6c94 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Thu, 4 Aug 2016 04:30:37 +0000 Subject: [PATCH 003/538] irqchip/jcore-aic: Add J-Core AIC driver There are two versions of the J-Core interrupt controller in use, aic1 which generates interrupts with programmable priorities, but only supports 8 irq lines and maps them to cpu traps in the range 17 to 24, and aic2 which uses traps in the range 64-127 and supports up to 128 irqs, with priorities dependent on the interrupt number. The Linux driver does not make use of priorities anyway. For simplicity, there is no aic1-specific logic in the driver beyond setting the priority register, which is necessary for interrupts to work at all. Eventually aic1 will likely be phased out, but it's currently in use in deployments and all released bitstream binaries. 
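For reference, a hedged sketch of what an aic1 node could look like on a two-cpu SMP system, following the binding above (one reg region per cpu, indexed by the zero-based hardware cpu number). The unit address and register ranges are illustrative only, reused from the aic2 example rather than taken from real hardware:

aic: interrupt-controller@200 {
	compatible = "jcore,aic1";
	/* one config region per cpu: cpu0 at 0x200, cpu1 at 0x500 (illustrative) */
	reg = < 0x200 0x30 0x500 0x30 >;
	interrupt-controller;
	#interrupt-cells = <1>;
};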
Signed-off-by: Rich Felker Link: https://lkml.kernel.org/r/c3b89ef74aaa6477575dbe2d410eb1d182503243.147018b6529.git.dalias@libc.org Signed-off-by: Jason Cooper --- drivers/irqchip/Kconfig | 7 +++ drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-jcore-aic.c | 94 +++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 drivers/irqchip/irq-jcore-aic.c diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 7f8728984f44..43bed4e5c7ae 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -156,6 +156,13 @@ config PIC32_EVIC select GENERIC_IRQ_CHIP select IRQ_DOMAIN +config JCORE_AIC + bool "J-Core integrated AIC" + depends on OF && (SUPERH || COMPILE_TEST) + select IRQ_DOMAIN + help + Support for the J-Core integrated AIC. + config RENESAS_INTC_IRQPIN bool select IRQ_DOMAIN diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 4c203b6b8163..ee7e3ca0ac23 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_I8259) += irq-i8259.o obj-$(CONFIG_IMGPDC_IRQ) += irq-imgpdc.o obj-$(CONFIG_IRQ_MIPS_CPU) += irq-mips-cpu.o obj-$(CONFIG_SIRF_IRQ) += irq-sirfsoc.o +obj-$(CONFIG_JCORE_AIC) += irq-jcore-aic.o obj-$(CONFIG_RENESAS_INTC_IRQPIN) += irq-renesas-intc-irqpin.o obj-$(CONFIG_RENESAS_IRQC) += irq-renesas-irqc.o obj-$(CONFIG_VERSATILE_FPGA_IRQ) += irq-versatile-fpga.o diff --git a/drivers/irqchip/irq-jcore-aic.c b/drivers/irqchip/irq-jcore-aic.c new file mode 100644 index 000000000000..5e5e3bb7d3c7 --- /dev/null +++ b/drivers/irqchip/irq-jcore-aic.c @@ -0,0 +1,94 @@ +/* + * J-Core SoC AIC driver + * + * Copyright (C) 2015-2016 Smart Energy Instruments, Inc. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define JCORE_AIC_MAX_HWIRQ 127 +#define JCORE_AIC1_MIN_HWIRQ 16 +#define JCORE_AIC2_MIN_HWIRQ 64 + +#define JCORE_AIC1_INTPRI_REG 8 + +static struct irq_chip jcore_aic; + +static int jcore_aic_irqdomain_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hwirq) +{ + struct irq_chip *aic = d->host_data; + + irq_set_chip_and_handler(irq, aic, handle_simple_irq); + + return 0; +} + +static const struct irq_domain_ops jcore_aic_irqdomain_ops = { + .map = jcore_aic_irqdomain_map, + .xlate = irq_domain_xlate_onecell, +}; + +static void noop(struct irq_data *data) +{ +} + +int __init aic_irq_of_init(struct device_node *node, struct device_node *parent) +{ + unsigned min_irq = JCORE_AIC2_MIN_HWIRQ; + unsigned dom_sz = JCORE_AIC_MAX_HWIRQ+1; + struct irq_domain *domain; + + pr_info("Initializing J-Core AIC\n"); + + /* AIC1 needs priority initialization to receive interrupts. */ + if (of_device_is_compatible(node, "jcore,aic1")) { + unsigned cpu; + + for_each_present_cpu(cpu) { + void __iomem *base = of_iomap(node, cpu); + + if (!base) { + pr_err("Unable to map AIC for cpu %u\n", cpu); + return -ENOMEM; + } + __raw_writel(0xffffffff, base + JCORE_AIC1_INTPRI_REG); + iounmap(base); + } + min_irq = JCORE_AIC1_MIN_HWIRQ; + } + + /* + * The irq chip framework requires either mask/unmask or enable/disable + * function pointers to be provided, but the hardware does not have any + * such mechanism; the only interrupt masking is at the cpu level and + * it affects all interrupts. We provide dummy mask/unmask. 
The hardware + * handles all interrupt control and clears pending status when the cpu + * accepts the interrupt. + */ + jcore_aic.irq_mask = noop; + jcore_aic.irq_unmask = noop; + jcore_aic.name = "AIC"; + + domain = irq_domain_add_linear(node, dom_sz, &jcore_aic_irqdomain_ops, + &jcore_aic); + if (!domain) + return -ENOMEM; + irq_create_strict_mappings(domain, min_irq, min_irq, dom_sz - min_irq); + + return 0; +} + +IRQCHIP_DECLARE(jcore_aic2, "jcore,aic2", aic_irq_of_init); +IRQCHIP_DECLARE(jcore_aic1, "jcore,aic1", aic_irq_of_init); From 31851a9874d63dbb532910a86b2be49c15997ea3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 5 Aug 2016 14:31:29 +0800 Subject: [PATCH 004/538] sched/fair: Remove 'cpu_busy' parameter from update_next_balance() The update_next_balance() function is only used by idle balancing, so its 'cpu_busy' parameter is always 0. Open code it instead of passing it around. Signed-off-by: Leo Yan Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1470378689-14892-1-git-send-email-leo.yan@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..d3005364fb03 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7704,11 +7704,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) } static inline void -update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +update_next_balance(struct sched_domain *sd, unsigned long *next_balance) { unsigned long interval, next; - interval = get_sd_balance_interval(sd, cpu_busy); + /* used by idle balance, so cpu_busy = 0 */ + interval = get_sd_balance_interval(sd, 0); next = sd->last_balance + interval; if (time_after(*next_balance, next)) @@ -7738,7 +7739,7 @@ static int idle_balance(struct rq *this_rq) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); rcu_read_unlock(); goto out; @@ -7756,7 +7757,7 @@ static int idle_balance(struct rq *this_rq) continue; if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); break; } @@ -7774,7 +7775,7 @@ static int idle_balance(struct rq *this_rq) curr_cost += domain_cost; } - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); /* * Stop searching for tasks to pull if there are From a1fd46565bea62840a24bee7b7c60f65bb12bd21 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 5 Aug 2016 14:32:38 +0800 Subject: [PATCH 005/538] sched/core: Fix one typo Fix one minor typo in the comment: s/targer/target/. 
Signed-off-by: Leo Yan Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1470378758-15066-1-git-send-email-leo.yan@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a906f20fba7..4a5f52e79c77 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1265,7 +1265,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) /* * Task isn't running anymore; make it appear like we migrated * it before it went to sleep. This means on wakeup we make the - * previous cpu our targer instead of where it really is. + * previous cpu our target instead of where it really is. */ p->wake_cpu = cpu; } From 7c3edd2c300b7ef2005a69dc727692ee07434aa5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Jul 2016 10:56:25 +0200 Subject: [PATCH 006/538] sched/fair: Improve PELT stuff some more Vincent noted that the update_tg_load_avg() usage in commit: 3d30544f0212 ("sched/fair: Apply more PELT fixes") isn't entirely sufficient. We need to call this function every time cfs_rq->avg.load changes, this includes when update_cfs_rq_load_avg() returns true, but {attach,detach}_entity_load_avg() themselves also change it. This means we need to unconditionally call update_tg_load_avg(). Also, add more comments. Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d3005364fb03..9f9a4e5bbfa9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -726,7 +726,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -759,10 +758,9 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } #else /* !CONFIG_SMP */ @@ -2803,9 +2801,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). +/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly). 
*/ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -2931,10 +2941,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed utilization. It is expected - * that one calls update_tg_load_avg() on this condition, but after you've - * modified the cfs_rq avg (attach/detach), such that we propagate the new - * avg up. + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -8442,7 +8452,6 @@ static void detach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (!vruntime_normalized(p)) { /* @@ -8454,10 +8463,9 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) @@ -8465,7 +8473,6 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8476,10 +8483,9 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; From bd425d4bfc7a1a6064dbbadfbac9c7eec0e426ec Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:12 +0100 Subject: [PATCH 007/538] sched/core: Fix power to capacity renaming in comment It is seems that this one escaped Nico's renaming of cpu_power to cpu_capacity a while back. 
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 62c68e513e39..f3db596efd2c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1022,7 +1022,7 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ From 772bd008cd9a1d4e8ce566f2edcc61d1c28fcbe5 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:13 +0100 Subject: [PATCH 008/538] sched/fair: Make the use of prev_cpu consistent in the wakeup path In commit: ac66f5477239 ("sched/numa: Introduce migrate_swap()") select_task_rq() got a 'cpu' argument to enable overriding of prev_cpu in special cases (NUMA task swapping). However, the select_task_rq_fair() helper functions: wake_affine() and select_idle_sibling(), still use task_cpu(p) directly to work out prev_cpu, which leads to inconsistencies. This patch passes prev_cpu (potentially overridden by NUMA code) into the helper functions to ensure prev_cpu is indeed the same CPU everywhere in the wakeup path. cc: Ingo Molnar cc: Rik van Riel Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9f9a4e5bbfa9..d819da68857f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -656,7 +656,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); /* @@ -1512,7 +1512,8 @@ static void task_numa_compare(struct task_numa_env *env, * Call select_idle_sibling to maybe find a better one. 
*/ if (!cur) - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, + env->dst_cpu); assign: task_numa_assign(env, cur, imp); @@ -5101,18 +5102,18 @@ static int wake_wide(struct task_struct *p) return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, int sync) { s64 this_load, load; s64 this_eff_load, prev_eff_load; - int idx, this_cpu, prev_cpu; + int idx, this_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); @@ -5277,11 +5278,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int select_idle_sibling(struct task_struct *p, int target) +static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; struct sched_group *sg; - int i = task_cpu(p); if (idle_cpu(target)) return target; @@ -5289,8 +5289,8 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * If the prevous cpu is cache affine and idle, don't be stupid. */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + return prev; /* * Otherwise, iterate the domains and find an eligible idle cpu. @@ -5311,6 +5311,8 @@ static int select_idle_sibling(struct task_struct *p, int target) for_each_lower_domain(sd) { sg = sd->groups; do { + int i; + if (!cpumask_intersects(sched_group_cpus(sg), tsk_cpus_allowed(p))) goto next; @@ -5419,13 +5421,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ - new_cpu = select_idle_sibling(p, new_cpu); + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else while (sd) { struct sched_group *group; From eaecf41f5abf80b63c8e025fcb9ee4aa203c3038 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:14 +0100 Subject: [PATCH 009/538] sched/fair: Optimize find_idlest_cpu() when there is no choice In the current find_idlest_group()/find_idlest_cpu() search we end up calling find_idlest_cpu() in a sched_group containing only one CPU in the end. Checking idle-states becomes pointless when there is no alternative, so bail out instead. 
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d819da68857f..acdc351d2386 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5239,6 +5239,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { if (idle_cpu(i)) { From 9279e0d2e565e0217618c2087de83d3239811329 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Sun, 10 Jul 2016 15:00:26 +0100 Subject: [PATCH 010/538] sched/core: Add documentation for 'cookie' argument Add documentation for the cookie argument in try_to_wake_up_local(). This caused the following warning when building documentation: kernel/sched/core.c:2088: warning: No description found for parameter 'cookie' Signed-off-by: Luis de Bethencourt Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Fixes: e7904a28f533 ("ilocking/lockdep, sched/core: Implement a better lock pinning scheme") Link: http://lkml.kernel.org/r/1468159226-17674-1-git-send-email-luisbg@osg.samsung.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4a5f52e79c77..10f2595c408a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2073,6 +2073,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /** * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened + * @cookie: context's cookie for pinning * * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not From 98b0a857805080db04f50b8c71438c9c369ef0b3 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 5 Aug 2016 16:07:55 +0100 Subject: [PATCH 011/538] sched/deadline: Remove useless parameter from setup_new_dl_entity() setup_new_dl_entity() takes two parameters, but it only actually uses one of them, under a different name, to setup a new dl_entity, after: 2f9f3fdc928 "sched/deadline: Remove dl_new from struct sched_dl_entity" as we currently do: setup_new_dl_entity(&p->dl, &p->dl) However, before Luca's change we were doing: setup_new_dl_entity(dl_se, pi_se) in update_dl_entity() for a dl_se->new entity: we were using pi_se's parameters (the potential PI donor) for setting up a new entity. This change removes the useless second parameter of setup_new_dl_entity(). While we are at it we also optimize things further calling setup_new_dl_ entity() only for already queued tasks, since (as pointed out by Xunlei) we already do the very same update at tasks wakeup time anyway. By doing so, we don't need to worry about a potential PI donor anymore, as rt_mutex_setprio() takes care of that already for us. 
Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt Cc: Linus Torvalds Cc: Luca Abeni Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Xunlei Pang Link: http://lkml.kernel.org/r/1470409675-20935-1-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1ce8867283dc..d091f4a95416 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -346,12 +346,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, * one, and to (try to!) reconcile itself with its own scheduling * parameters. */ -static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + WARN_ON(dl_se->dl_boosted); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -367,8 +367,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; + dl_se->runtime = dl_se->dl_runtime; } /* @@ -1723,10 +1723,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + + /* If p is not queued we will update its parameters at next wakeup. */ + if (!task_on_rq_queued(p)) + return; + + /* + * If p is boosted we already updated its params in + * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), + * p's deadline being now already after rq_clock(rq). + */ if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(&p->dl, &p->dl); + setup_new_dl_entity(&p->dl); - if (task_on_rq_queued(p) && rq->curr != p) { + if (rq->curr != p) { #ifdef CONFIG_SMP if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); From 64a5e3cb308028dba0676daae0a7821d770036fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jul 2016 14:26:11 +0200 Subject: [PATCH 012/538] locking/qspinlock: Improve readability Restructure pv_queued_spin_steal_lock() as I found it hard to read. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long --- kernel/locking/qspinlock_paravirt.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8a99abf58080..429c3dc2a5f3 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -70,11 +70,14 @@ struct pv_node { static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; - int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); - qstat_inc(qstat_pv_lock_stealing, ret); - return ret; + if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + qstat_inc(qstat_pv_lock_stealing, true); + return true; + } + + return false; } /* @@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) static inline bool pv_wait_early(struct pv_node *prev, int loop) { - if ((loop & PV_PREV_CHECK_MASK) != 0) return false; From 08be8f63c40c030b5cf95b4368e314e563a86301 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 31 May 2016 12:53:47 -0400 Subject: [PATCH 013/538] locking/pvstat: Separate wait_again and spurious wakeup stats Currently there are overlap in the pvqspinlock wait_again and spurious_wakeup stat counters. Because of lock stealing, it is no longer possible to accurately determine if spurious wakeup has happened in the queue head. As they track both the queue node and queue head status, it is also hard to tell how many of those comes from the queue head and how many from the queue node. This patch changes the accounting rules so that spurious wakeup is only tracked in the queue node. The wait_again count, however, is only tracked in the queue head when the vCPU failed to acquire the lock after a vCPU kick. This should give a much better indication of the wait-kick dynamics in the queue node and the queue head. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boqun Feng Cc: Douglas Hatch Cc: Linus Torvalds Cc: Pan Xinhui Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464713631-1066-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 12 +++--------- kernel/locking/qspinlock_stat.h | 4 ++-- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 429c3dc2a5f3..3acf16d79cf4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -288,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; struct pv_node *pp = (struct pv_node *)prev; - int waitcnt = 0; int loop; bool wait_early; - /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ - for (;; waitcnt++) { + for (;;) { for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; @@ -317,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) if (!READ_ONCE(node->locked)) { qstat_inc(qstat_pv_wait_node, true); - qstat_inc(qstat_pv_wait_again, waitcnt); qstat_inc(qstat_pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -458,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) pv_wait(&l->locked, _Q_SLOW_VAL); /* - * The unlocker should have freed the lock before kicking the - * CPU. So if the lock is still not free, it is a spurious - * wakeup or another vCPU has stolen the lock. The current - * vCPU should spin again. + * Because of lock stealing, the queue head vCPU may not be + * able to acquire the lock before it has to wait again. */ - qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index b9d031516254..eb0a599fcf58 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -24,8 +24,8 @@ * pv_latency_wake - average latency (ns) from vCPU kick to wakeup * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups - * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs + * pv_wait_again - # of wait's after a queue head vCPU kick * pv_wait_early - # of early vCPU wait's * pv_wait_head - # of vCPU wait's at the queue head * pv_wait_node - # of vCPU wait's at a non-head queue node From 80127a39681bd68c959f0953f84a830cbd7c3b1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jul 2016 20:08:46 +0200 Subject: [PATCH 014/538] locking/percpu-rwsem: Optimize readers and reduce global impact Currently the percpu-rwsem switches to (global) atomic ops while a writer is waiting; which could be quite a while and slows down releasing the readers. This patch cures this problem by ordering the reader-state vs reader-count (see the comments in __percpu_down_read() and percpu_down_write()). This changes a global atomic op into a full memory barrier, which doesn't have the global cacheline contention. This also enables using the percpu-rwsem with rcu_sync disabled in order to bias the implementation differently, reducing the writer latency by adding some cost to readers. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Paul McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org [ Fixed modular build. ] Signed-off-by: Ingo Molnar --- include/linux/percpu-rwsem.h | 84 +++++++++++-- kernel/locking/percpu-rwsem.c | 228 +++++++++++++++++++--------------- kernel/rcu/sync.c | 2 + 3 files changed, 208 insertions(+), 106 deletions(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index c2fa3ecb0dce..146efefde2a1 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -10,30 +10,96 @@ struct percpu_rw_semaphore { struct rcu_sync rss; - unsigned int __percpu *fast_read_ctr; + unsigned int __percpu *read_count; struct rw_semaphore rw_sem; - atomic_t slow_read_ctr; - wait_queue_head_t write_waitq; + wait_queue_head_t writer; + int readers_block; }; -extern void percpu_down_read(struct percpu_rw_semaphore *); -extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); -extern void percpu_up_read(struct percpu_rw_semaphore *); +extern int __percpu_down_read(struct percpu_rw_semaphore *, int); +extern void __percpu_up_read(struct percpu_rw_semaphore *); + +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +{ + might_sleep(); + + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_); + + preempt_disable(); + /* + * We are in an RCU-sched read-side critical section, so the writer + * cannot both change sem->state from readers_fast and start checking + * counters while we are here. So if we see !sem->state, we know that + * the writer won't be checking until we're past the preempt_enable() + * and that one the synchronize_sched() is done, the writer will see + * anything we did within this RCU-sched read-size critical section. + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + __percpu_down_read(sem, false); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. + */ +} + +static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) +{ + int ret = 1; + + preempt_disable(); + /* + * Same as in percpu_down_read(). + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. + */ + + if (ret) + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_); + + return ret; +} + +static inline void percpu_up_read(struct percpu_rw_semaphore *sem) +{ + /* + * The barrier() in preempt_disable() prevents the compiler from + * bleeding the critical section out. + */ + preempt_disable(); + /* + * Same as in percpu_down_read(). 
+ */ + if (likely(rcu_sync_is_idle(&sem->rss))) + __this_cpu_dec(*sem->read_count); + else + __percpu_up_read(sem); /* Unconditional memory barrier */ + preempt_enable(); + + rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); +} extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); + extern void percpu_free_rwsem(struct percpu_rw_semaphore *); -#define percpu_init_rwsem(brw) \ +#define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ - __percpu_init_rwsem(brw, #brw, &rwsem_key); \ + __percpu_init_rwsem(sem, #sem, &rwsem_key); \ }) - #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index bec0b647f9cc..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -8,152 +8,186 @@ #include #include -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, +int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) + sem->read_count = alloc_percpu(int); + if (unlikely(!sem->read_count)) return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); + rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + __init_rwsem(&sem->rw_sem, name, rwsem_key); + init_waitqueue_head(&sem->writer); + sem->readers_block = 0; return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +void percpu_free_rwsem(struct percpu_rw_semaphore *sem) { /* * XXX: temporary kludge. The error path in alloc_super() * assumes that percpu_free_rwsem() is safe after kzalloc(). */ - if (!brw->fast_read_ctr) + if (!sem->read_count) return; - rcu_sync_dtor(&brw->rss); - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ + rcu_sync_dtor(&sem->rss); + free_percpu(sem->read_count); + sem->read_count = NULL; /* catch use after free bugs */ } EXPORT_SYMBOL_GPL(percpu_free_rwsem); -/* - * This is the fast-path for down_read/up_read. If it succeeds we rely - * on the barriers provided by rcu_sync_enter/exit; see the comments in - * percpu_down_write() and percpu_up_write(). - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. - */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) { - bool success; + /* + * Due to having preemption disabled the decrement happens on + * the same CPU as the increment, avoiding the + * increment-on-one-CPU-and-decrement-on-another problem. + * + * If the reader misses the writer's assignment of readers_block, then + * the writer is guaranteed to see the reader's increment. 
+ * + * Conversely, any readers that increment their sem->read_count after + * the writer looks are guaranteed to see the readers_block value, + * which in turn means that they are guaranteed to immediately + * decrement their sem->read_count, so that it doesn't matter that the + * writer missed them. + */ - preempt_disable(); - success = rcu_sync_is_idle(&brw->rss); - if (likely(success)) - __this_cpu_add(*brw->fast_read_ctr, val); - preempt_enable(); + smp_mb(); /* A matches D */ - return success; -} + /* + * If !readers_block the critical section starts here, matched by the + * release in percpu_up_write(). + */ + if (likely(!smp_load_acquire(&sem->readers_block))) + return 1; -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). - */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + /* + * Per the above comment; we still have preemption disabled and + * will thus decrement on the same CPU as we incremented. + */ + __percpu_up_read(sem); - if (likely(update_fast_ctr(brw, +1))) - return; + if (try) + return 0; - /* Avoid rwsem_acquire_read() and rwsem_release() */ - __down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); -} -EXPORT_SYMBOL_GPL(percpu_down_read); + /* + * We either call schedule() in the wait, or we'll fall through + * and reschedule on the preempt_enable() in percpu_down_read(). + */ + preempt_enable_no_resched(); -int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) -{ - if (unlikely(!update_fast_ctr(brw, +1))) { - if (!__down_read_trylock(&brw->rw_sem)) - return 0; - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); - } - - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + /* + * Avoid lockdep for the down/up_read() we already have them. + */ + __down_read(&sem->rw_sem); + this_cpu_inc(*sem->read_count); + __up_read(&sem->rw_sem); + + preempt_disable(); return 1; } +EXPORT_SYMBOL_GPL(__percpu_down_read); -void percpu_up_read(struct percpu_rw_semaphore *brw) +void __percpu_up_read(struct percpu_rw_semaphore *sem) { - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; + smp_mb(); /* B matches C */ + /* + * In other words, if they see our decrement (presumably to aggregate + * zero, as that is the only time it matters) they will also see our + * critical section. + */ + __this_cpu_dec(*sem->read_count); - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); + /* Prod writer to recheck readers_active */ + wake_up(&sem->writer); } -EXPORT_SYMBOL_GPL(percpu_up_read); +EXPORT_SYMBOL_GPL(__percpu_up_read); + +#define per_cpu_sum(var) \ +({ \ + typeof(var) __sum = 0; \ + int cpu; \ + compiletime_assert_atomic_type(__sum); \ + for_each_possible_cpu(cpu) \ + __sum += per_cpu(var, cpu); \ + __sum; \ +}) -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) +/* + * Return true if the modular sum of the sem->read_count per-CPU variable is + * zero. If this sum is zero, then it is stable due to the fact that if any + * newly arriving readers increment a given counter, they will immediately + * decrement that same counter. 
+ */ +static bool readers_active_check(struct percpu_rw_semaphore *sem) { - unsigned int sum = 0; - int cpu; + if (per_cpu_sum(*sem->read_count) != 0) + return false; + + /* + * If we observed the decrement; ensure we see the entire critical + * section. + */ - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } + smp_mb(); /* C matches B */ - return sum; + return true; } -void percpu_down_write(struct percpu_rw_semaphore *brw) +void percpu_down_write(struct percpu_rw_semaphore *sem) { + /* Notify readers to take the slow path. */ + rcu_sync_enter(&sem->rss); + + down_write(&sem->rw_sem); + /* - * Make rcu_sync_is_idle() == F and thus disable the fast-path in - * percpu_down_read() and percpu_up_read(), and wait for gp pass. - * - * The latter synchronises us with the preceding readers which used - * the fast-past, so we can not miss the result of __this_cpu_add() - * or anything else inside their criticial sections. + * Notify new readers to block; up until now, and thus throughout the + * longish rcu_sync_enter() above, new readers could still come in. */ - rcu_sync_enter(&brw->rss); + WRITE_ONCE(sem->readers_block, 1); - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); + smp_mb(); /* D matches A */ - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + /* + * If they don't see our writer of readers_block, then we are + * guaranteed to see their sem->read_count increment, and therefore + * will wait for them. + */ - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); + /* Wait for all now active readers to complete. */ + wait_event(sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); -void percpu_up_write(struct percpu_rw_semaphore *brw) +void percpu_up_write(struct percpu_rw_semaphore *sem) { - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); /* - * Enable the fast-path in percpu_down_read() and percpu_up_read() - * but only after another gp pass; this adds the necessary barrier - * to ensure the reader can't miss the changes done by us. + * Signal the writer is done, no fast path yet. + * + * One reason that we cannot just immediately flip to readers_fast is + * that new readers might fail to see the results of this writer's + * critical section. + * + * Therefore we force it through the slow path which guarantees an + * acquire and thereby guarantees the critical section's consistency. + */ + smp_store_release(&sem->readers_block, 0); + + /* + * Release the write lock, this will allow readers back in the game. + */ + up_write(&sem->rw_sem); + + /* + * Once this completes (at least one RCU-sched grace period hence) the + * reader fast path will be available again. Safe to use outside the + * exclusive write lock because its counting. 
*/ - rcu_sync_exit(&brw->rss); + rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be922c9f3d37..198473d90f81 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp) RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), "suspicious rcu_sync_is_idle() usage"); } + +EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); #endif /** From 404f6aac9b3ef595735feca99979db084ea48315 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 8 Aug 2016 16:29:06 -0700 Subject: [PATCH 015/538] x86: Apply more __ro_after_init and const Guided by grsecurity's analogous __read_only markings in arch/x86, this applies several uses of __ro_after_init to structures that are only updated during __init, and const for some structures that are never updated. Additionally extends __init markings to some functions that are only used during __init, and cleans up some missing C99 style static initializers. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20160808232906.GA29731@www.outflux.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 2 +- arch/x86/include/asm/fpu/xstate.h | 3 ++- arch/x86/kernel/apic/apic_flat_64.c | 6 +++--- arch/x86/kernel/apic/apic_noop.c | 2 +- arch/x86/kernel/apic/bigsmp_32.c | 2 +- arch/x86/kernel/apic/msi.c | 2 +- arch/x86/kernel/apic/probe_32.c | 4 ++-- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- arch/x86/kernel/cpu/common.c | 11 ++++++++--- arch/x86/kernel/cpu/mtrr/main.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 +- arch/x86/kernel/ksysfs.c | 2 +- arch/x86/kernel/kvmclock.c | 2 +- arch/x86/kernel/paravirt.c | 2 +- arch/x86/kernel/ptrace.c | 6 +++--- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/setup.c | 4 ++-- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/kernel/x86_init.c | 6 +++--- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/pci/pcbios.c | 7 +++++-- 24 files changed, 45 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 4e10d73cf018..12080d87da3b 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -36,7 +36,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; -extern struct desc_ptr debug_idt_descr; +extern const struct desc_ptr debug_idt_descr; extern gate_desc debug_idt_table[]; struct gdt_page { diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index ae55a43e09c0..d4957ac72b48 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -45,7 +45,8 @@ extern u64 xfeatures_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); +extern void __init update_regset_xstate_info(unsigned int size, + u64 xstate_mask); void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); diff --git a/arch/x86/kernel/apic/apic_flat_64.c 
b/arch/x86/kernel/apic/apic_flat_64.c index 5b2ae106bd4a..8862da76ef6f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -25,7 +25,7 @@ static struct apic apic_physflat; static struct apic apic_flat; -struct apic __read_mostly *apic = &apic_flat; +struct apic *apic __ro_after_init = &apic_flat; EXPORT_SYMBOL_GPL(apic); static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) @@ -154,7 +154,7 @@ static int flat_probe(void) return 1; } -static struct apic apic_flat = { +static struct apic apic_flat __ro_after_init = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, @@ -248,7 +248,7 @@ static int physflat_probe(void) return 0; } -static struct apic apic_physflat = { +static struct apic apic_physflat __ro_after_init = { .name = "physical flat", .probe = physflat_probe, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index c05688b2deff..b109e4389c92 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -108,7 +108,7 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } -struct apic apic_noop = { +struct apic apic_noop __ro_after_init = { .name = "noop", .probe = noop_probe, .acpi_madt_oem_check = NULL, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 06dbaa458bfe..56012010332c 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -142,7 +142,7 @@ static int probe_bigsmp(void) return dmi_bigsmp; } -static struct apic apic_bigsmp = { +static struct apic apic_bigsmp __ro_after_init = { .name = "bigsmp", .probe = probe_bigsmp, diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ade25320df96..015bbf30e3e3 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -269,7 +269,7 @@ static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) hpet_msi_write(irq_data_get_irq_handler_data(data), msg); } -static struct irq_chip hpet_msi_controller = { +static struct irq_chip hpet_msi_controller __ro_after_init = { .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7c43e716c158..e5fb2f086460 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -72,7 +72,7 @@ static int probe_default(void) return 1; } -static struct apic apic_default = { +static struct apic apic_default __ro_after_init = { .name = "default", .probe = probe_default, @@ -126,7 +126,7 @@ static struct apic apic_default = { apic_driver(apic_default); -struct apic *apic = &apic_default; +struct apic *apic __ro_after_init = &apic_default; EXPORT_SYMBOL_GPL(apic); static int cmdline_apic __initdata; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 6368fa69d2af..766bdef1e1d7 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -222,7 +222,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); } -static struct apic apic_x2apic_cluster = { +static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 4f13f54f1b1f..ff111f05a314 100644 --- 
a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -98,7 +98,7 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } -static struct apic apic_x2apic_phys = { +static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", .probe = x2apic_phys_probe, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 09b59adaea3f..ed887dedd35e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -554,7 +554,7 @@ static int uv_probe(void) return apic == &apic_x2apic_uv_x; } -static struct apic __refdata apic_x2apic_uv_x = { +static struct apic apic_x2apic_uv_x __ro_after_init = { .name = "UV large system", .probe = uv_probe, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 809eda03c527..d3b91be4873b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1265,9 +1265,14 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; -struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) debug_idt_table }; +struct desc_ptr idt_descr __ro_after_init = { + .size = NR_VECTORS * 16 - 1, + .address = (unsigned long) idt_table, +}; +const struct desc_ptr debug_idt_descr = { + .size = NR_VECTORS * 16 - 1, + .address = (unsigned long) debug_idt_table, +}; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 28f1b54b7fad..24e87e74990d 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -72,14 +72,14 @@ static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; static bool mtrr_aps_delayed_init; -static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init; const struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(const struct mtrr_ops *ops) +void __init set_mtrr_ops(const struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 6c7ced07d16d..ad8bd763efa5 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -54,7 +54,7 @@ void fill_mtrr_var_range(unsigned int index, bool get_mtrr_state(void); void mtrr_bp_pat_init(void); -extern void set_mtrr_ops(const struct mtrr_ops *ops); +extern void __init set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; extern const struct mtrr_ops *mtrr_if; diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index c2bedaea11f7..4afc67f5facc 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -184,7 +184,7 @@ static ssize_t setup_data_data_read(struct file *fp, static struct kobj_attribute type_attr = __ATTR_RO(type); -static struct bin_attribute data_attr = { +static struct bin_attribute data_attr __ro_after_init = { .attr = { .name = "data", .mode = S_IRUGO, diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1d39bfbd26bb..0964399ef942 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -29,7 +29,7 @@ #include #include -static int kvmclock = 1; +static int 
kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; static cycle_t kvm_sched_clock_offset; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ad5bc9578a73..b8e4680a2e0b 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -389,7 +389,7 @@ NOKPROBE_SYMBOL(native_load_idt); #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) #endif -struct pv_mmu_ops pv_mmu_ops = { +struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f79576a541ff..2537cfba4d89 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1250,7 +1250,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 -static struct user_regset x86_64_regsets[] __read_mostly = { +static struct user_regset x86_64_regsets[] __ro_after_init = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), @@ -1291,7 +1291,7 @@ static const struct user_regset_view user_x86_64_view = { #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -static struct user_regset x86_32_regsets[] __read_mostly = { +static struct user_regset x86_32_regsets[] __ro_after_init = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), @@ -1344,7 +1344,7 @@ static const struct user_regset_view user_x86_32_view = { */ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 63bf27d972b7..e244c19a2451 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -705,7 +705,7 @@ static void native_machine_power_off(void) tboot_shutdown(TB_SHUTDOWN_HALT); } -struct machine_ops machine_ops = { +struct machine_ops machine_ops __ro_after_init = { .power_off = native_machine_power_off, .shutdown = native_machine_shutdown, .emergency_restart = native_machine_emergency_restart, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 95cf31c9f4ec..2d98798d395e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -210,9 +210,9 @@ EXPORT_SYMBOL(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -__visible unsigned long mmu_cr4_features; +__visible unsigned long mmu_cr4_features __ro_after_init; #else -__visible unsigned long mmu_cr4_features = X86_CR4_PAE; +__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; #endif /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7a40e068302d..1d5c79473639 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -33,7 +33,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; EXPORT_PER_CPU_SYMBOL(this_cpu_off); -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = { [0 ... 
NR_CPUS-1] = BOOT_PERCPU_OFFSET, }; EXPORT_SYMBOL(__per_cpu_offset); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 76c5e52436c4..0bd9f1287f39 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -91,7 +91,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; static int default_i8042_detect(void) { return 1; }; -struct x86_platform_ops x86_platform = { +struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, @@ -108,7 +108,7 @@ struct x86_platform_ops x86_platform = { EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) -struct x86_msi_ops x86_msi = { +struct x86_msi_ops x86_msi __ro_after_init = { .setup_msi_irqs = native_setup_msi_irqs, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, @@ -137,7 +137,7 @@ void arch_restore_msi_irqs(struct pci_dev *dev) } #endif -struct x86_io_apic_ops x86_io_apic_ops = { +struct x86_io_apic_ops x86_io_apic_ops __ro_after_init = { .read = native_io_apic_read, .disable = native_disable_io_apic, }; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af523d84d102..1e6b84b96ea6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4961,7 +4961,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -static struct kvm_x86_ops svm_x86_ops = { +static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a45d8580f91e..87eaa6bfc26a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11175,7 +11175,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEATURE_CONTROL_LMCE; } -static struct kvm_x86_ops vmx_x86_ops = { +static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, .hardware_setup = hardware_setup, diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 9770e55e768f..1d97cea3b3a4 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -120,9 +120,12 @@ static unsigned long __init bios32_service(unsigned long service) static struct { unsigned long address; unsigned short segment; -} pci_indirect = { 0, __KERNEL_CS }; +} pci_indirect __ro_after_init = { + .address = 0, + .segment = __KERNEL_CS, +}; -static int pci_bios_present; +static int pci_bios_present __ro_after_init; static int __init check_pcibios(void) { From aa877175e7a9982233ed8f10cb4bfddd78d82741 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Wed, 3 Aug 2016 13:22:28 -0400 Subject: [PATCH 016/538] cpu/hotplug: Prevent alloc/free of irq descriptors during CPU up/down (again) Now that Xen no longer allocates irqs in _cpu_up() we can restore commit: a89941816726 ("hotplug: Prevent alloc/free of irq descriptors during cpu up/down") Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross Acked-by: Thomas Gleixner Cc: Anna-Maria Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: david.vrabel@citrix.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1470244948-17674-3-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 11 ----------- kernel/cpu.c | 8 ++++++++ 2 files changed, 8 insertions(+), 11 deletions(-) diff --git 
a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2a6e84a30a54..067de612d3fa 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1108,17 +1108,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) common_cpu_up(cpu, tidle); - /* - * We have to walk the irq descriptors to setup the vector - * space for the cpu which comes online. Prevent irq - * alloc/free across the bringup. - */ - irq_lock_sparse(); - err = do_boot_cpu(apicid, cpu, tidle); - if (err) { - irq_unlock_sparse(); pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); return -EIO; } @@ -1136,8 +1127,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) touch_nmi_watchdog(); } - irq_unlock_sparse(); - return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 341bf80f80bd..ec12b726fa6f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -349,8 +349,16 @@ static int bringup_cpu(unsigned int cpu) struct task_struct *idle = idle_thread_get(cpu); int ret; + /* + * Some architectures have to walk the irq descriptors to + * setup the vector space for the cpu which comes online. + * Prevent irq alloc/free across the bringup. + */ + irq_lock_sparse(); + /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); + irq_unlock_sparse(); if (ret) { cpu_notify(CPU_UP_CANCELED, cpu); return ret; From d1c6d149cf04d6c7c3c3ebf4b66c82500cbcf6e1 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 23 Jul 2016 09:46:39 +0200 Subject: [PATCH 017/538] sched/debug: Make the "Preemption disabled at ..." message more useful This message is currently really useless since it always prints a value that comes from the printk() we just did, e.g.: BUG: sleeping function called from invalid context at mm/slab.h:388 in_atomic(): 0, irqs_disabled(): 0, pid: 31996, name: trinity-c1 Preemption disabled at:[] down_trylock+0x13/0x80 BUG: sleeping function called from invalid context at include/linux/freezer.h:56 in_atomic(): 0, irqs_disabled(): 0, pid: 31996, name: trinity-c1 Preemption disabled at:[] console_unlock+0x2f7/0x930 Here, both down_trylock() and console_unlock() is somewhere in the printk() path. We should save the value before calling printk() and use the saved value instead. That immediately reveals the offending callsite: BUG: sleeping function called from invalid context at mm/slab.h:388 in_atomic(): 0, irqs_disabled(): 0, pid: 14971, name: trinity-c2 Preemption disabled at:[] rhashtable_walk_start+0x46/0x150 Bug report: http://marc.info/?l=linux-netdev&m=146925979821849&w=2 Signed-off-by: Vegard Nossum Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rusty Russel Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 +++++++++ kernel/sched/core.c | 21 +++++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f3db596efd2c..7f64e89a5873 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3236,6 +3236,15 @@ static inline void cond_resched_rcu(void) #endif } +static inline unsigned long get_preempt_disable_ip(struct task_struct *p) +{ +#ifdef CONFIG_DEBUG_PREEMPT + return p->preempt_disable_ip; +#else + return 0; +#endif +} + /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPT, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 10f2595c408a..a65681605aef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3171,6 +3171,9 @@ static inline void preempt_latency_stop(int val) { } */ static noinline void __schedule_bug(struct task_struct *prev) { + /* Save this before calling printk(), since that will clobber it */ + unsigned long preempt_disable_ip = get_preempt_disable_ip(current); + if (oops_in_progress) return; @@ -3181,13 +3184,12 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif if (panic_on_warn) panic("scheduling while atomic\n"); @@ -7571,6 +7573,7 @@ EXPORT_SYMBOL(__might_sleep); void ___might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + unsigned long preempt_disable_ip; rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && @@ -7581,6 +7584,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; + /* Save this before calling printk(), since that will clobber it */ + preempt_disable_ip = get_preempt_disable_ip(current); + printk(KERN_ERR "BUG: sleeping function called from invalid context at %s:%d\n", file, line); @@ -7595,13 +7601,12 @@ void ___might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif dump_stack(); } EXPORT_SYMBOL(___might_sleep); From 25dfe4785332723f09311dcb7fd91015a60c022f Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Wed, 27 Jul 2016 08:59:56 -0700 Subject: [PATCH 018/538] x86/mm/64: Enable KASLR for vmemmap memory region Add vmemmap in the list of randomized memory regions. The vmemmap region holds a representation of the physical memory (through a struct page array). An attacker could use this region to disclose the kernel memory layout (walking the page linked list). Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/1469635196-122447-1-git-send-email-thgarnie@google.com [ Minor edits. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kaslr.h | 1 + arch/x86/include/asm/pgtable_64_types.h | 4 +++- arch/x86/mm/kaslr.c | 26 +++++++++++++++++++++++-- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 2674ee3de748..1052a797d71d 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -6,6 +6,7 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY extern unsigned long page_offset_base; extern unsigned long vmalloc_base; +extern unsigned long vmemmap_base; void kernel_randomize_memory(void); #else diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6fdef9eef2d5..3a264200c62f 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -57,11 +57,13 @@ typedef struct { pteval_t pte; } pte_t; #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #define VMALLOC_SIZE_TB _AC(32, UL) #define __VMALLOC_BASE _AC(0xffffc90000000000, UL) -#define VMEMMAP_START _AC(0xffffea0000000000, UL) +#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) #ifdef CONFIG_RANDOMIZE_MEMORY #define VMALLOC_START vmalloc_base +#define VMEMMAP_START vmemmap_base #else #define VMALLOC_START __VMALLOC_BASE +#define VMEMMAP_START __VMEMMAP_BASE #endif /* CONFIG_RANDOMIZE_MEMORY */ #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index ec8654f117d8..aec03aa96312 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -40,17 +40,26 @@ * You need to add an if/def entry if you introduce a new memory region * compatible with KASLR. Your entry must be in logical order with memory * layout. For example, ESPFIX is before EFI because its virtual address is - * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to + * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to * ensure that this order is correct and won't be changed. 
*/ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; -static const unsigned long vaddr_end = VMEMMAP_START; + +#if defined(CONFIG_X86_ESPFIX64) +static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; +#elif defined(CONFIG_EFI) +static const unsigned long vaddr_end = EFI_VA_START; +#else +static const unsigned long vaddr_end = __START_KERNEL_map; +#endif /* Default values */ unsigned long page_offset_base = __PAGE_OFFSET_BASE; EXPORT_SYMBOL(page_offset_base); unsigned long vmalloc_base = __VMALLOC_BASE; EXPORT_SYMBOL(vmalloc_base); +unsigned long vmemmap_base = __VMEMMAP_BASE; +EXPORT_SYMBOL(vmemmap_base); /* * Memory regions randomized by KASLR (except modules that use a separate logic @@ -63,6 +72,7 @@ static __initdata struct kaslr_memory_region { } kaslr_regions[] = { { &page_offset_base, 64/* Maximum */ }, { &vmalloc_base, VMALLOC_SIZE_TB }, + { &vmemmap_base, 1 }, }; /* Get size in bytes used by the memory region */ @@ -89,6 +99,18 @@ void __init kernel_randomize_memory(void) struct rnd_state rand_state; unsigned long remain_entropy; + /* + * All these BUILD_BUG_ON checks ensures the memory layout is + * consistent with the vaddr_start/vaddr_end variables. + */ + BUILD_BUG_ON(vaddr_start >= vaddr_end); + BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) && + vaddr_end >= EFI_VA_START); + BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) || + config_enabled(CONFIG_EFI)) && + vaddr_end >= __START_KERNEL_map); + BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); + if (!kaslr_memory_enabled()) return; From f0b22e39e3409109d40ef036b1f46b419e82f58e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 22 Jul 2016 21:46:02 +0200 Subject: [PATCH 019/538] sched/debug: Add taint on "BUG: Sleeping function called from invalid context" Seeing this, it occurs to me that we should probably add a taint here: BUG: sleeping function called from invalid context at mm/slab.h:388 in_atomic(): 0, irqs_disabled(): 0, pid: 32211, name: trinity-c3 Preemption disabled at:[] console_unlock+0x2f7/0x930 CPU: 3 PID: 32211 Comm: trinity-c3 Not tainted 4.7.0-rc7+ #19 ^^^^^^^^^^^ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 0000000000000000 ffff8800b8a17160 ffffffff81971441 ffff88011a3c4c80 ffff88011a3c4c80 ffff8800b8a17198 ffffffff81158067 0000000000000de6 ffff88011a3c4c80 ffffffff8390e07c 0000000000000184 0000000000000000 Call Trace: [...] BUG: sleeping function called from invalid context at arch/x86/mm/fault.c:1309 in_atomic(): 0, irqs_disabled(): 0, pid: 32211, name: trinity-c3 Preemption disabled at:[] down_trylock+0x13/0x80 CPU: 3 PID: 32211 Comm: trinity-c3 Not tainted 4.7.0-rc7+ #19 ^^^^^^^^^^^ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 0000000000000000 ffff8800b8a17e08 ffffffff81971441 ffff88011a3c4c80 ffff88011a3c4c80 ffff8800b8a17e40 ffffffff81158067 0000000000000000 ffff88011a3c4c80 ffffffff83437b20 000000000000051d 0000000000000000 Call Trace: [...] Signed-off-by: Vegard Nossum Acked-by: Thomas Gleixner Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rusty Russel Link: http://lkml.kernel.org/r/1469216762-19626-1-git-send-email-vegard.nossum@oracle.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a65681605aef..3b6b23c57418 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7608,6 +7608,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) pr_cont("\n"); } dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL(___might_sleep); #endif From dfeccea61716d3ca1bf3477610d1f29abf6d99ca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 11 Aug 2016 11:17:40 -0700 Subject: [PATCH 020/538] locking/Documentation: Maintain consistent blank line Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dhowells@redhat.com Cc: linux-arch@vger.kernel.org Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1470939463-31950-1-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index a4d0a99de04d..e1926a096818 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1928,6 +1928,7 @@ There are some more advanced barrier functions: See Documentation/DMA-API.txt for more information on consistent memory. + MMIO WRITE BARRIER ------------------ From d7cab36db83be458e8987ae352902958977e7925 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 11 Aug 2016 11:17:41 -0700 Subject: [PATCH 021/538] locking/Documentation: Fix wrong section reference Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dhowells@redhat.com Cc: linux-arch@vger.kernel.org Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1470939463-31950-2-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index e1926a096818..19c8eb6f246e 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -2076,7 +2076,7 @@ systems, and so cannot be counted on in such a situation to actually achieve anything at all - especially with respect to I/O accesses - unless combined with interrupt disabling operations. -See also the section on "Inter-CPU locking barrier effects". +See also the section on "Inter-CPU acquiring barrier effects". As an example, consider the following: From 8b9e771555745a029557a0a481e760fb84376a35 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 11 Aug 2016 11:17:42 -0700 Subject: [PATCH 022/538] locking/Documentation: Fix a typo of example result An example result for data dependent write has a typo. This commit fixes the wrong typo. Signed-off-by: SeongJae Park Signed-off-by: Paul E. 
McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dhowells@redhat.com Cc: linux-arch@vger.kernel.org Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1470939463-31950-3-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 19c8eb6f246e..ba818ecce6f9 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -609,7 +609,7 @@ A data-dependency barrier must also order against dependent writes: The data-dependency barrier must order the read into Q with the store into *Q. This prohibits this outcome: - (Q == B) && (B == 4) + (Q == &B) && (B == 4) Please note that this pattern should be rare. After all, the whole point of dependency ordering is to -prevent- writes to the data structure, along From e8cb0fe6e7cc9b02bd97eae6efa8fe927fbfc905 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 11 Aug 2016 11:17:43 -0700 Subject: [PATCH 023/538] locking/Documentation: Add Korean translation This commit adds Korean version of memory-barriers.txt document. The header is referred to HOWTO Korean version. The translation has started from Feb, 2016 and using a public git repository[1] to maintain the work. It's commit history says that it is following upstream changes as well. [1] https://github.com/sjp38/linux.doc_trans_membarrier Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney Reviewed-by: Byungchul Park Acked-by: David Howells Acked-by: Minchan Kim Acked-by: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1470939463-31950-4-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/ko_KR/memory-barriers.txt | 3135 +++++++++++++++++++++++ 1 file changed, 3135 insertions(+) create mode 100644 Documentation/ko_KR/memory-barriers.txt diff --git a/Documentation/ko_KR/memory-barriers.txt b/Documentation/ko_KR/memory-barriers.txt new file mode 100644 index 000000000000..34d3d380893d --- /dev/null +++ b/Documentation/ko_KR/memory-barriers.txt @@ -0,0 +1,3135 @@ +NOTE: +This is a version of Documentation/memory-barriers.txt translated into Korean. +This document is maintained by SeongJae Park . +If you find any difference between this document and the original file or +a problem with the translation, please contact the maintainer of this file. + +Please also note that the purpose of this file is to be easier to +read for non English (read: Korean) speakers and is not intended as +a fork. So if you have any comments or updates for this file please +update the original English file first. The English version is +definitive, and readers should look there if they have any doubt. + +=================================== +이 문서는 +Documentation/memory-barriers.txt +의 한글 번역입니다. + +역자: 박성재 +=================================== + + + ========================= + 리눅스 커널 메모리 배리어 + ========================= + +저자: David Howells + Paul E. McKenney + Will Deacon + Peter Zijlstra + +======== +면책조항 +======== + +이 문서는 명세서가 아닙니다; 이 문서는 완벽하지 않은데, 간결성을 위해 의도된 +부분도 있고, 의도하진 않았지만 사람에 의해 쓰였다보니 불완전한 부분도 있습니다. +이 문서는 리눅스에서 제공하는 다양한 메모리 배리어들을 사용하기 위한 +안내서입니다만, 뭔가 이상하다 싶으면 (그런게 많을 겁니다) 질문을 부탁드립니다. + +다시 말하지만, 이 문서는 리눅스가 하드웨어에 기대하는 사항에 대한 명세서가 +아닙니다. 
+ +이 문서의 목적은 두가지입니다: + + (1) 어떤 특정 배리어에 대해 기대할 수 있는 최소한의 기능을 명세하기 위해서, + 그리고 + + (2) 사용 가능한 배리어들에 대해 어떻게 사용해야 하는지에 대한 안내를 제공하기 + 위해서. + +어떤 아키텍쳐는 특정한 배리어들에 대해서는 여기서 이야기하는 최소한의 +요구사항들보다 많은 기능을 제공할 수도 있습니다만, 여기서 이야기하는 +요구사항들을 충족하지 않는 아키텍쳐가 있다면 그 아키텍쳐가 잘못된 것이란 점을 +알아두시기 바랍니다. + +또한, 특정 아키텍쳐에서 일부 배리어는 해당 아키텍쳐의 특수한 동작 방식으로 인해 +해당 배리어의 명시적 사용이 불필요해서 no-op 이 될수도 있음을 알아두시기 +바랍니다. + +역자: 본 번역 역시 완벽하지 않은데, 이 역시 부분적으로는 의도된 것이기도 +합니다. 여타 기술 문서들이 그렇듯 완벽한 이해를 위해서는 번역문과 원문을 함께 +읽으시되 번역문을 하나의 가이드로 활용하시길 추천드리며, 발견되는 오역 등에 +대해서는 언제든 의견을 부탁드립니다. 과한 번역으로 인한 오해를 최소화하기 위해 +애매한 부분이 있을 경우에는 어색함이 있더라도 원래의 용어를 차용합니다. + + +===== +목차: +===== + + (*) 추상 메모리 액세스 모델. + + - 디바이스 오퍼레이션. + - 보장사항. + + (*) 메모리 배리어란 무엇인가? + + - 메모리 배리어의 종류. + - 메모리 배리어에 대해 가정해선 안될 것. + - 데이터 의존성 배리어. + - 컨트롤 의존성. + - SMP 배리어 짝맞추기. + - 메모리 배리어 시퀀스의 예. + - 읽기 메모리 배리어 vs 로드 예측. + - 이행성 + + (*) 명시적 커널 배리어. + + - 컴파일러 배리어. + - CPU 메모리 배리어. + - MMIO 쓰기 배리어. + + (*) 암묵적 커널 메모리 배리어. + + - 락 Acquisition 함수. + - 인터럽트 비활성화 함수. + - 슬립과 웨이크업 함수. + - 그외의 함수들. + + (*) CPU 간 ACQUIRING 배리어의 효과. + + - Acquire vs 메모리 액세스. + - Acquire vs I/O 액세스. + + (*) 메모리 배리어가 필요한 곳 + + - 프로세서간 상호 작용. + - 어토믹 오퍼레이션. + - 디바이스 액세스. + - 인터럽트. + + (*) 커널 I/O 배리어의 효과. + + (*) 가정되는 가장 완화된 실행 순서 모델. + + (*) CPU 캐시의 영향. + + - 캐시 일관성. + - 캐시 일관성 vs DMA. + - 캐시 일관성 vs MMIO. + + (*) CPU 들이 저지르는 일들. + + - 그리고, Alpha 가 있다. + - 가상 머신 게스트. + + (*) 사용 예. + + - 순환식 버퍼. + + (*) 참고 문헌. + + +======================= +추상 메모리 액세스 모델 +======================= + +다음과 같이 추상화된 시스템 모델을 생각해 봅시다: + + : : + : : + : : + +-------+ : +--------+ : +-------+ + | | : | | : | | + | | : | | : | | + | CPU 1 |<----->| Memory |<----->| CPU 2 | + | | : | | : | | + | | : | | : | | + +-------+ : +--------+ : +-------+ + ^ : ^ : ^ + | : | : | + | : | : | + | : v : | + | : +--------+ : | + | : | | : | + | : | | : | + +---------->| Device |<----------+ + : | | : + : | | : + : +--------+ : + : : + +프로그램은 여러 메모리 액세스 오퍼레이션을 발생시키고, 각각의 CPU 는 그런 +프로그램들을 실행합니다. 추상화된 CPU 모델에서 메모리 오퍼레이션들의 순서는 +매우 완화되어 있고, CPU 는 프로그램이 인과관계를 어기지 않는 상태로 관리된다고 +보일 수만 있다면 메모리 오퍼레이션을 자신이 원하는 어떤 순서대로든 재배치해 +동작시킬 수 있습니다. 비슷하게, 컴파일러 또한 프로그램의 정상적 동작을 해치지 +않는 한도 내에서는 어떤 순서로든 자신이 원하는 대로 인스트럭션을 재배치 할 수 +있습니다. + +따라서 위의 다이어그램에서 한 CPU가 동작시키는 메모리 오퍼레이션이 만들어내는 +변화는 해당 오퍼레이션이 CPU 와 시스템의 다른 부분들 사이의 인터페이스(점선)를 +지나가면서 시스템의 나머지 부분들에 인지됩니다. + + +예를 들어, 다음의 일련의 이벤트들을 생각해 봅시다: + + CPU 1 CPU 2 + =============== =============== + { A == 1; B == 2 } + A = 3; x = B; + B = 4; y = A; + +다이어그램의 가운데에 위치한 메모리 시스템에 보여지게 되는 액세스들은 다음의 총 +24개의 조합으로 재구성될 수 있습니다: + + STORE A=3, STORE B=4, y=LOAD A->3, x=LOAD B->4 + STORE A=3, STORE B=4, x=LOAD B->4, y=LOAD A->3 + STORE A=3, y=LOAD A->3, STORE B=4, x=LOAD B->4 + STORE A=3, y=LOAD A->3, x=LOAD B->2, STORE B=4 + STORE A=3, x=LOAD B->2, STORE B=4, y=LOAD A->3 + STORE A=3, x=LOAD B->2, y=LOAD A->3, STORE B=4 + STORE B=4, STORE A=3, y=LOAD A->3, x=LOAD B->4 + STORE B=4, ... + ... + +따라서 다음의 네가지 조합의 값들이 나올 수 있습니다: + + x == 2, y == 1 + x == 2, y == 3 + x == 4, y == 1 + x == 4, y == 3 + + +한발 더 나아가서, 한 CPU 가 메모리 시스템에 반영한 스토어 오퍼레이션들의 결과는 +다른 CPU 에서의 로드 오퍼레이션을 통해 인지되는데, 이 때 스토어가 반영된 순서와 +다른 순서로 인지될 수도 있습니다. + + +예로, 아래의 일련의 이벤트들을 생각해 봅시다: + + CPU 1 CPU 2 + =============== =============== + { A == 1, B == 2, C == 3, P == &A, Q == &C } + B = 4; Q = P; + P = &B D = *Q; + +D 로 읽혀지는 값은 CPU 2 에서 P 로부터 읽혀진 주소값에 의존적이기 때문에 여기엔 +분명한 데이터 의존성이 있습니다. 하지만 이 이벤트들의 실행 결과로는 아래의 +결과들이 모두 나타날 수 있습니다: + + (Q == &A) and (D == 1) + (Q == &B) and (D == 2) + (Q == &B) and (D == 4) + +CPU 2 는 *Q 의 로드를 요청하기 전에 P 를 Q 에 넣기 때문에 D 에 C 를 집어넣는 +일은 없음을 알아두세요. 
+ + +디바이스 오퍼레이션 +------------------- + +일부 디바이스는 자신의 컨트롤 인터페이스를 메모리의 특정 영역으로 매핑해서 +제공하는데(Memory mapped I/O), 해당 컨트롤 레지스터에 접근하는 순서는 매우 +중요합니다. 예를 들어, 어드레스 포트 레지스터 (A) 와 데이터 포트 레지스터 (D) +를 통해 접근되는 내부 레지스터 집합을 갖는 이더넷 카드를 생각해 봅시다. 내부의 +5번 레지스터를 읽기 위해 다음의 코드가 사용될 수 있습니다: + + *A = 5; + x = *D; + +하지만, 이건 다음의 두 조합 중 하나로 만들어질 수 있습니다: + + STORE *A = 5, x = LOAD *D + x = LOAD *D, STORE *A = 5 + +두번째 조합은 데이터를 읽어온 _후에_ 주소를 설정하므로, 오동작을 일으킬 겁니다. + + +보장사항 +-------- + +CPU 에게 기대할 수 있는 최소한의 보장사항 몇가지가 있습니다: + + (*) 어떤 CPU 든, 의존성이 존재하는 메모리 액세스들은 해당 CPU 자신에게 + 있어서는 순서대로 메모리 시스템에 수행 요청됩니다. 즉, 다음에 대해서: + + Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q); + + CPU 는 다음과 같은 메모리 오퍼레이션 시퀀스를 수행 요청합니다: + + Q = LOAD P, D = LOAD *Q + + 그리고 그 시퀀스 내에서의 순서는 항상 지켜집니다. 대부분의 시스템에서 + smp_read_barrier_depends() 는 아무일도 안하지만 DEC Alpha 에서는 + 명시적으로 사용되어야 합니다. 보통의 경우에는 smp_read_barrier_depends() + 를 직접 사용하는 대신 rcu_dereference() 같은 것들을 사용해야 함을 + 알아두세요. + + (*) 특정 CPU 내에서 겹치는 영역의 메모리에 행해지는 로드와 스토어 들은 해당 + CPU 안에서는 순서가 바뀌지 않은 것으로 보여집니다. 즉, 다음에 대해서: + + a = READ_ONCE(*X); WRITE_ONCE(*X, b); + + CPU 는 다음의 메모리 오퍼레이션 시퀀스만을 메모리에 요청할 겁니다: + + a = LOAD *X, STORE *X = b + + 그리고 다음에 대해서는: + + WRITE_ONCE(*X, c); d = READ_ONCE(*X); + + CPU 는 다음의 수행 요청만을 만들어 냅니다: + + STORE *X = c, d = LOAD *X + + (로드 오퍼레이션과 스토어 오퍼레이션이 겹치는 메모리 영역에 대해 + 수행된다면 해당 오퍼레이션들은 겹친다고 표현됩니다). + +그리고 _반드시_ 또는 _절대로_ 가정하거나 가정하지 말아야 하는 것들이 있습니다: + + (*) 컴파일러가 READ_ONCE() 나 WRITE_ONCE() 로 보호되지 않은 메모리 액세스를 + 당신이 원하는 대로 할 것이라는 가정은 _절대로_ 해선 안됩니다. 그것들이 + 없다면, 컴파일러는 컴파일러 배리어 섹션에서 다루게 될, 모든 "창의적인" + 변경들을 만들어낼 권한을 갖게 됩니다. + + (*) 개별적인 로드와 스토어들이 주어진 순서대로 요청될 것이라는 가정은 _절대로_ + 하지 말아야 합니다. 이 말은 곧: + + X = *A; Y = *B; *D = Z; + + 는 다음의 것들 중 어느 것으로든 만들어질 수 있다는 의미입니다: + + X = LOAD *A, Y = LOAD *B, STORE *D = Z + X = LOAD *A, STORE *D = Z, Y = LOAD *B + Y = LOAD *B, X = LOAD *A, STORE *D = Z + Y = LOAD *B, STORE *D = Z, X = LOAD *A + STORE *D = Z, X = LOAD *A, Y = LOAD *B + STORE *D = Z, Y = LOAD *B, X = LOAD *A + + (*) 겹치는 메모리 액세스들은 합쳐지거나 버려질 수 있음을 _반드시_ 가정해야 + 합니다. 다음의 코드는: + + X = *A; Y = *(A + 4); + + 다음의 것들 중 뭐든 될 수 있습니다: + + X = LOAD *A; Y = LOAD *(A + 4); + Y = LOAD *(A + 4); X = LOAD *A; + {X, Y} = LOAD {*A, *(A + 4) }; + + 그리고: + + *A = X; *(A + 4) = Y; + + 는 다음 중 뭐든 될 수 있습니다: + + STORE *A = X; STORE *(A + 4) = Y; + STORE *(A + 4) = Y; STORE *A = X; + STORE {*A, *(A + 4) } = {X, Y}; + +그리고 보장사항에 반대되는 것들(anti-guarantees)이 있습니다: + + (*) 이 보장사항들은 bitfield 에는 적용되지 않는데, 컴파일러들은 bitfield 를 + 수정하는 코드를 생성할 때 원자성 없는(non-atomic) 읽고-수정하고-쓰는 + 인스트럭션들의 조합을 만드는 경우가 많기 때문입니다. 병렬 알고리즘의 + 동기화에 bitfield 를 사용하려 하지 마십시오. + + (*) bitfield 들이 여러 락으로 보호되는 경우라 하더라도, 하나의 bitfield 의 + 모든 필드들은 하나의 락으로 보호되어야 합니다. 만약 한 bitfield 의 두 + 필드가 서로 다른 락으로 보호된다면, 컴파일러의 원자성 없는 + 읽고-수정하고-쓰는 인스트럭션 조합은 한 필드에의 업데이트가 근처의 + 필드에도 영향을 끼치게 할 수 있습니다. + + (*) 이 보장사항들은 적절하게 정렬되고 크기가 잡힌 스칼라 변수들에 대해서만 + 적용됩니다. "적절하게 크기가 잡힌" 이라함은 현재로써는 "char", "short", + "int" 그리고 "long" 과 같은 크기의 변수들을 의미합니다. "적절하게 정렬된" + 은 자연스런 정렬을 의미하는데, 따라서 "char" 에 대해서는 아무 제약이 없고, + "short" 에 대해서는 2바이트 정렬을, "int" 에는 4바이트 정렬을, 그리고 + "long" 에 대해서는 32-bit 시스템인지 64-bit 시스템인지에 따라 4바이트 또는 + 8바이트 정렬을 의미합니다. 이 보장사항들은 C11 표준에서 소개되었으므로, + C11 전의 오래된 컴파일러(예를 들어, gcc 4.6) 를 사용할 때엔 주의하시기 + 바랍니다. 표준에 이 보장사항들은 "memory location" 을 정의하는 3.14 + 섹션에 다음과 같이 설명되어 있습니다: + (역자: 인용문이므로 번역하지 않습니다) + + memory location + either an object of scalar type, or a maximal sequence + of adjacent bit-fields all having nonzero width + + NOTE 1: Two threads of execution can update and access + separate memory locations without interfering with + each other. 
+ + NOTE 2: A bit-field and an adjacent non-bit-field member + are in separate memory locations. The same applies + to two bit-fields, if one is declared inside a nested + structure declaration and the other is not, or if the two + are separated by a zero-length bit-field declaration, + or if they are separated by a non-bit-field member + declaration. It is not safe to concurrently update two + bit-fields in the same structure if all members declared + between them are also bit-fields, no matter what the + sizes of those intervening bit-fields happen to be. + + +========================= +메모리 배리어란 무엇인가? +========================= + +앞에서 봤듯이, 상호간 의존성이 없는 메모리 오퍼레이션들은 실제로는 무작위적 +순서로 수행될 수 있으며, 이는 CPU 와 CPU 간의 상호작용이나 I/O 에 문제가 될 수 +있습니다. 따라서 컴파일러와 CPU 가 순서를 바꾸는데 제약을 걸 수 있도록 개입할 +수 있는 어떤 방법이 필요합니다. + +메모리 배리어는 그런 개입 수단입니다. 메모리 배리어는 배리어를 사이에 둔 앞과 +뒤 양측의 메모리 오퍼레이션들 간에 부분적 순서가 존재하도록 하는 효과를 줍니다. + +시스템의 CPU 들과 여러 디바이스들은 성능을 올리기 위해 명령어 재배치, 실행 +유예, 메모리 오퍼레이션들의 조합, 예측적 로드(speculative load), 브랜치 +예측(speculative branch prediction), 다양한 종류의 캐싱(caching) 등의 다양한 +트릭을 사용할 수 있기 때문에 이런 강제력은 중요합니다. 메모리 배리어들은 이런 +트릭들을 무효로 하거나 억제하는 목적으로 사용되어져서 코드가 여러 CPU 와 +디바이스들 간의 상호작용을 정상적으로 제어할 수 있게 해줍니다. + + +메모리 배리어의 종류 +-------------------- + +메모리 배리어는 네개의 기본 타입으로 분류됩니다: + + (1) 쓰기 (또는 스토어) 메모리 배리어. + + 쓰기 메모리 배리어는 시스템의 다른 컴포넌트들에 해당 배리어보다 앞서 + 명시된 모든 STORE 오퍼레이션들이 해당 배리어 뒤에 명시된 모든 STORE + 오퍼레이션들보다 먼저 수행된 것으로 보일 것을 보장합니다. + + 쓰기 배리어는 스토어 오퍼레이션들에 대한 부분적 순서 세우기입니다; 로드 + 오퍼레이션들에 대해서는 어떤 영향도 끼치지 않습니다. + + CPU 는 시간의 흐름에 따라 메모리 시스템에 일련의 스토어 오퍼레이션들을 + 하나씩 요청해 집어넣습니다. 쓰기 배리어 앞의 모든 스토어 오퍼레이션들은 + 쓰기 배리어 뒤의 모든 스토어 오퍼레이션들보다 _앞서_ 수행될 겁니다. + + [!] 쓰기 배리어들은 읽기 또는 데이터 의존성 배리어와 함께 짝을 맞춰 + 사용되어야만 함을 알아두세요; "SMP 배리어 짝맞추기" 서브섹션을 참고하세요. + + + (2) 데이터 의존성 배리어. + + 데이터 의존성 배리어는 읽기 배리어의 보다 완화된 형태입니다. 두개의 로드 + 오퍼레이션이 있고 두번째 것이 첫번째 것의 결과에 의존하고 있을 때(예: + 두번째 로드가 참조할 주소를 첫번째 로드가 읽는 경우), 두번째 로드가 읽어올 + 데이터는 첫번째 로드에 의해 그 주소가 얻어지기 전에 업데이트 되어 있음을 + 보장하기 위해서 데이터 의존성 배리어가 필요할 수 있습니다. + + 데이터 의존성 배리어는 상호 의존적인 로드 오퍼레이션들 사이의 부분적 순서 + 세우기입니다; 스토어 오퍼레이션들이나 독립적인 로드들, 또는 중복되는 + 로드들에 대해서는 어떤 영향도 끼치지 않습니다. + + (1) 에서 언급했듯이, 시스템의 CPU 들은 메모리 시스템에 일련의 스토어 + 오퍼레이션들을 던져 넣고 있으며, 거기에 관심이 있는 다른 CPU 는 그 + 오퍼레이션들을 메모리 시스템이 실행한 결과를 인지할 수 있습니다. 이처럼 + 다른 CPU 의 스토어 오퍼레이션의 결과에 관심을 두고 있는 CPU 가 수행 요청한 + 데이터 의존성 배리어는, 배리어 앞의 어떤 로드 오퍼레이션이 다른 CPU 에서 + 던져 넣은 스토어 오퍼레이션과 같은 영역을 향했다면, 그런 스토어 + 오퍼레이션들이 만들어내는 결과가 데이터 의존성 배리어 뒤의 로드 + 오퍼레이션들에게는 보일 것을 보장합니다. + + 이 순서 세우기 제약에 대한 그림을 보기 위해선 "메모리 배리어 시퀀스의 예" + 서브섹션을 참고하시기 바랍니다. + + [!] 첫번째 로드는 반드시 _데이터_ 의존성을 가져야지 컨트롤 의존성을 가져야 + 하는게 아님을 알아두십시오. 만약 두번째 로드를 위한 주소가 첫번째 로드에 + 의존적이지만 그 의존성은 조건적이지 그 주소 자체를 가져오는게 아니라면, + 그것은 _컨트롤_ 의존성이고, 이 경우에는 읽기 배리어나 그보다 강력한 + 무언가가 필요합니다. 더 자세한 내용을 위해서는 "컨트롤 의존성" 서브섹션을 + 참고하시기 바랍니다. + + [!] 데이터 의존성 배리어는 보통 쓰기 배리어들과 함께 짝을 맞춰 사용되어야 + 합니다; "SMP 배리어 짝맞추기" 서브섹션을 참고하세요. + + + (3) 읽기 (또는 로드) 메모리 배리어. + + 읽기 배리어는 데이터 의존성 배리어 기능의 보장사항에 더해서 배리어보다 + 앞서 명시된 모든 LOAD 오퍼레이션들이 배리어 뒤에 명시되는 모든 LOAD + 오퍼레이션들보다 먼저 행해진 것으로 시스템의 다른 컴포넌트들에 보여질 것을 + 보장합니다. + + 읽기 배리어는 로드 오퍼레이션에 행해지는 부분적 순서 세우기입니다; 스토어 + 오퍼레이션에 대해서는 어떤 영향도 끼치지 않습니다. + + 읽기 메모리 배리어는 데이터 의존성 배리어를 내장하므로 데이터 의존성 + 배리어를 대신할 수 있습니다. + + [!] 읽기 배리어는 일반적으로 쓰기 배리어들과 함께 짝을 맞춰 사용되어야 + 합니다; "SMP 배리어 짝맞추기" 서브섹션을 참고하세요. + + + (4) 범용 메모리 배리어. + + 범용(general) 메모리 배리어는 배리어보다 앞서 명시된 모든 LOAD 와 STORE + 오퍼레이션들이 배리어 뒤에 명시된 모든 LOAD 와 STORE 오퍼레이션들보다 + 먼저 수행된 것으로 시스템의 나머지 컴포넌트들에 보이게 됨을 보장합니다. + + 범용 메모리 배리어는 로드와 스토어 모두에 대한 부분적 순서 세우기입니다. + + 범용 메모리 배리어는 읽기 메모리 배리어, 쓰기 메모리 배리어 모두를 + 내장하므로, 두 배리어를 모두 대신할 수 있습니다. + + +그리고 두개의 명시적이지 않은 타입이 있습니다: + + (5) ACQUIRE 오퍼레이션. + + 이 타입의 오퍼레이션은 단방향의 투과성 배리어처럼 동작합니다. 
ACQUIRE + 오퍼레이션 뒤의 모든 메모리 오퍼레이션들이 ACQUIRE 오퍼레이션 후에 + 일어난 것으로 시스템의 나머지 컴포넌트들에 보이게 될 것이 보장됩니다. + LOCK 오퍼레이션과 smp_load_acquire(), smp_cond_acquire() 오퍼레이션도 + ACQUIRE 오퍼레이션에 포함됩니다. smp_cond_acquire() 오퍼레이션은 컨트롤 + 의존성과 smp_rmb() 를 사용해서 ACQUIRE 의 의미적 요구사항(semantic)을 + 충족시킵니다. + + ACQUIRE 오퍼레이션 앞의 메모리 오퍼레이션들은 ACQUIRE 오퍼레이션 완료 후에 + 수행된 것처럼 보일 수 있습니다. + + ACQUIRE 오퍼레이션은 거의 항상 RELEASE 오퍼레이션과 짝을 지어 사용되어야 + 합니다. + + + (6) RELEASE 오퍼레이션. + + 이 타입의 오퍼레이션들도 단방향 투과성 배리어처럼 동작합니다. RELEASE + 오퍼레이션 앞의 모든 메모리 오퍼레이션들은 RELEASE 오퍼레이션 전에 완료된 + 것으로 시스템의 다른 컴포넌트들에 보여질 것이 보장됩니다. UNLOCK 류의 + 오퍼레이션들과 smp_store_release() 오퍼레이션도 RELEASE 오퍼레이션의 + 일종입니다. + + RELEASE 오퍼레이션 뒤의 메모리 오퍼레이션들은 RELEASE 오퍼레이션이 + 완료되기 전에 행해진 것처럼 보일 수 있습니다. + + ACQUIRE 와 RELEASE 오퍼레이션의 사용은 일반적으로 다른 메모리 배리어의 + 필요성을 없앱니다 (하지만 "MMIO 쓰기 배리어" 서브섹션에서 설명되는 예외를 + 알아두세요). 또한, RELEASE+ACQUIRE 조합은 범용 메모리 배리어처럼 동작할 + 것을 보장하지 -않습니다-. 하지만, 어떤 변수에 대한 RELEASE 오퍼레이션을 + 앞서는 메모리 액세스들의 수행 결과는 이 RELEASE 오퍼레이션을 뒤이어 같은 + 변수에 대해 수행된 ACQUIRE 오퍼레이션을 뒤따르는 메모리 액세스에는 보여질 + 것이 보장됩니다. 다르게 말하자면, 주어진 변수의 크리티컬 섹션에서는, 해당 + 변수에 대한 앞의 크리티컬 섹션에서의 모든 액세스들이 완료되었을 것을 + 보장합니다. + + 즉, ACQUIRE 는 최소한의 "취득" 동작처럼, 그리고 RELEASE 는 최소한의 "공개" + 처럼 동작한다는 의미입니다. + +atomic_ops.txt 에서 설명되는 어토믹 오퍼레이션들 중에는 완전히 순서잡힌 것들과 +(배리어를 사용하지 않는) 완화된 순서의 것들 외에 ACQUIRE 와 RELEASE 부류의 +것들도 존재합니다. 로드와 스토어를 모두 수행하는 조합된 어토믹 오퍼레이션에서, +ACQUIRE 는 해당 오퍼레이션의 로드 부분에만 적용되고 RELEASE 는 해당 +오퍼레이션의 스토어 부분에만 적용됩니다. + +메모리 배리어들은 두 CPU 간, 또는 CPU 와 디바이스 간에 상호작용의 가능성이 있을 +때에만 필요합니다. 만약 어떤 코드에 그런 상호작용이 없을 것이 보장된다면, 해당 +코드에서는 메모리 배리어를 사용할 필요가 없습니다. + + +이것들은 _최소한의_ 보장사항들임을 알아두세요. 다른 아키텍쳐에서는 더 강력한 +보장사항을 제공할 수도 있습니다만, 그런 보장사항은 아키텍쳐 종속적 코드 이외의 +부분에서는 신뢰되지 _않을_ 겁니다. + + +메모리 배리어에 대해 가정해선 안될 것 +------------------------------------- + +리눅스 커널 메모리 배리어들이 보장하지 않는 것들이 있습니다: + + (*) 메모리 배리어 앞에서 명시된 어떤 메모리 액세스도 메모리 배리어 명령의 수행 + 완료 시점까지 _완료_ 될 것이란 보장은 없습니다; 배리어가 하는 일은 CPU 의 + 액세스 큐에 특정 타입의 액세스들은 넘을 수 없는 선을 긋는 것으로 생각될 수 + 있습니다. + + (*) 한 CPU 에서 메모리 배리어를 수행하는게 시스템의 다른 CPU 나 하드웨어에 + 어떤 직접적인 영향을 끼친다는 보장은 존재하지 않습니다. 배리어 수행이 + 만드는 간접적 영향은 두번째 CPU 가 첫번째 CPU 의 액세스들의 결과를 + 바라보는 순서가 됩니다만, 다음 항목을 보세요: + + (*) 첫번째 CPU 가 두번째 CPU 의 메모리 액세스들의 결과를 바라볼 때, _설령_ + 두번째 CPU 가 메모리 배리어를 사용한다 해도, 첫번째 CPU _또한_ 그에 맞는 + 메모리 배리어를 사용하지 않는다면 ("SMP 배리어 짝맞추기" 서브섹션을 + 참고하세요) 그 결과가 올바른 순서로 보여진다는 보장은 없습니다. + + (*) CPU 바깥의 하드웨어[*] 가 메모리 액세스들의 순서를 바꾸지 않는다는 보장은 + 존재하지 않습니다. CPU 캐시 일관성 메커니즘은 메모리 배리어의 간접적 + 영향을 CPU 사이에 전파하긴 하지만, 순서대로 전파하지는 않을 수 있습니다. + + [*] 버스 마스터링 DMA 와 일관성에 대해서는 다음을 참고하시기 바랍니다: + + Documentation/PCI/pci.txt + Documentation/DMA-API-HOWTO.txt + Documentation/DMA-API.txt + + +데이터 의존성 배리어 +-------------------- + +데이터 의존성 배리어의 사용에 있어 지켜야 하는 사항들은 약간 미묘하고, 데이터 +의존성 배리어가 사용되어야 하는 상황도 항상 명백하지는 않습니다. 설명을 위해 +다음의 이벤트 시퀀스를 생각해 봅시다: + + CPU 1 CPU 2 + =============== =============== + { A == 1, B == 2, C == 3, P == &A, Q == &C } + B = 4; + <쓰기 배리어> + WRITE_ONCE(P, &B) + Q = READ_ONCE(P); + D = *Q; + +여기엔 분명한 데이터 의존성이 존재하므로, 이 시퀀스가 끝났을 때 Q 는 &A 또는 &B +일 것이고, 따라서: + + (Q == &A) 는 (D == 1) 를, + (Q == &B) 는 (D == 4) 를 의미합니다. + +하지만! CPU 2 는 B 의 업데이트를 인식하기 전에 P 의 업데이트를 인식할 수 있고, +따라서 다음의 결과가 가능합니다: + + (Q == &B) and (D == 2) ???? + +이런 결과는 일관성이나 인과 관계 유지가 실패한 것처럼 보일 수도 있겠지만, +그렇지 않습니다, 그리고 이 현상은 (DEC Alpha 와 같은) 여러 CPU 에서 실제로 +발견될 수 있습니다. + +이 문제 상황을 제대로 해결하기 위해, 데이터 의존성 배리어나 그보다 강화된 +무언가가 주소를 읽어올 때와 데이터를 읽어올 때 사이에 추가되어야만 합니다: + + CPU 1 CPU 2 + =============== =============== + { A == 1, B == 2, C == 3, P == &A, Q == &C } + B = 4; + <쓰기 배리어> + WRITE_ONCE(P, &B); + Q = READ_ONCE(P); + <데이터 의존성 배리어> + D = *Q; + +이 변경은 앞의 처음 두가지 결과 중 하나만이 발생할 수 있고, 세번째의 결과는 +발생할 수 없도록 합니다. 
+ +데이터 의존성 배리어는 의존적 쓰기에 대해서도 순서를 잡아줍니다: + + CPU 1 CPU 2 + =============== =============== + { A == 1, B == 2, C = 3, P == &A, Q == &C } + B = 4; + <쓰기 배리어> + WRITE_ONCE(P, &B); + Q = READ_ONCE(P); + <데이터 의존성 배리어> + *Q = 5; + +이 데이터 의존성 배리어는 Q 로의 읽기가 *Q 로의 스토어와 순서를 맞추게 +해줍니다. 이는 다음과 같은 결과를 막습니다: + + (Q == &B) && (B == 4) + +이런 패턴은 드물게 사용되어야 함을 알아 두시기 바랍니다. 무엇보다도, 의존성 +순서 규칙의 의도는 쓰기 작업을 -예방- 해서 그로 인해 발생하는 비싼 캐시 미스도 +없애려는 것입니다. 이 패턴은 드물게 발생하는 에러 조건 같은것들을 기록하는데 +사용될 수 있고, 이렇게 배리어를 사용해 순서를 지키게 함으로써 그런 기록이 +사라지는 것을 막습니다. + + +[!] 상당히 비직관적인 이 상황은 분리된 캐시를 가진 기계, 예를 들어 한 캐시 +뱅크가 짝수번 캐시 라인을 처리하고 다른 뱅크는 홀수번 캐시 라인을 처리하는 기계 +등에서 가장 잘 발생합니다. 포인터 P 는 홀수 번호의 캐시 라인에 있고, 변수 B 는 +짝수 번호 캐시 라인에 있다고 생각해 봅시다. 그런 상태에서 읽기 작업을 하는 CPU +의 짝수번 뱅크는 할 일이 쌓여 매우 바쁘지만 홀수번 뱅크는 할 일이 없어 아무 +일도 하지 않고 있었다면, 포인터 P 는 새 값 (&B) 을, 그리고 변수 B 는 옛날 값 +(2) 을 가지고 있는 상태가 보여질 수도 있습니다. + + +데이터 의존성 배리어는 매우 중요한데, 예를 들어 RCU 시스템에서 그렇습니다. +include/linux/rcupdate.h 의 rcu_assign_pointer() 와 rcu_dereference() 를 +참고하세요. 여기서 데이터 의존성 배리어는 RCU 로 관리되는 포인터의 타겟을 현재 +타겟에서 수정된 새로운 타겟으로 바꾸는 작업에서 새로 수정된 타겟이 초기화가 +완료되지 않은 채로 보여지는 일이 일어나지 않게 해줍니다. + +더 많은 예를 위해선 "캐시 일관성" 서브섹션을 참고하세요. + + +컨트롤 의존성 +------------- + +로드-로드 컨트롤 의존성은 데이터 의존성 배리어만으로는 정확히 동작할 수가 +없어서 읽기 메모리 배리어를 필요로 합니다. 아래의 코드를 봅시다: + + q = READ_ONCE(a); + if (q) { + <데이터 의존성 배리어> /* BUG: No data dependency!!! */ + p = READ_ONCE(b); + } + +이 코드는 원하는 대로의 효과를 내지 못할 수 있는데, 이 코드에는 데이터 의존성이 +아니라 컨트롤 의존성이 존재하기 때문으로, 이런 상황에서 CPU 는 실행 속도를 더 +빠르게 하기 위해 분기 조건의 결과를 예측하고 코드를 재배치 할 수 있어서 다른 +CPU 는 b 로부터의 로드 오퍼레이션이 a 로부터의 로드 오퍼레이션보다 먼저 발생한 +걸로 인식할 수 있습니다. 여기에 정말로 필요했던 건 다음과 같습니다: + + q = READ_ONCE(a); + if (q) { + <읽기 배리어> + p = READ_ONCE(b); + } + +하지만, 스토어 오퍼레이션은 예측적으로 수행되지 않습니다. 즉, 다음 예에서와 +같이 로드-스토어 컨트롤 의존성이 존재하는 경우에는 순서가 -지켜진다-는 +의미입니다. + + q = READ_ONCE(a); + if (q) { + WRITE_ONCE(b, p); + } + +컨트롤 의존성은 보통 다른 타입의 배리어들과 짝을 맞춰 사용됩니다. 그렇다곤 +하나, READ_ONCE() 는 반드시 사용해야 함을 부디 명심하세요! READ_ONCE() 가 +없다면, 컴파일러가 'a' 로부터의 로드를 'a' 로부터의 또다른 로드와, 'b' 로의 +스토어를 'b' 로의 또다른 스토어와 조합해 버려 매우 비직관적인 결과를 초래할 수 +있습니다. + +이걸로 끝이 아닌게, 컴파일러가 변수 'a' 의 값이 항상 0이 아니라고 증명할 수 +있다면, 앞의 예에서 "if" 문을 없애서 다음과 같이 최적화 할 수도 있습니다: + + q = a; + b = p; /* BUG: Compiler and CPU can both reorder!!! */ + +그러니 READ_ONCE() 를 반드시 사용하세요. + +다음과 같이 "if" 문의 양갈래 브랜치에 모두 존재하는 동일한 스토어에 대해 순서를 +강제하고 싶은 경우가 있을 수 있습니다: + + q = READ_ONCE(a); + if (q) { + barrier(); + WRITE_ONCE(b, p); + do_something(); + } else { + barrier(); + WRITE_ONCE(b, p); + do_something_else(); + } + +안타깝게도, 현재의 컴파일러들은 높은 최적화 레벨에서는 이걸 다음과 같이 +바꿔버립니다: + + q = READ_ONCE(a); + barrier(); + WRITE_ONCE(b, p); /* BUG: No ordering vs. load from a!!! */ + if (q) { + /* WRITE_ONCE(b, p); -- moved up, BUG!!! */ + do_something(); + } else { + /* WRITE_ONCE(b, p); -- moved up, BUG!!! */ + do_something_else(); + } + +이제 'a' 에서의 로드와 'b' 로의 스토어 사이에는 조건적 관계가 없기 때문에 CPU +는 이들의 순서를 바꿀 수 있게 됩니다: 이런 경우에 조건적 관계는 반드시 +필요한데, 모든 컴파일러 최적화가 이루어지고 난 후의 어셈블리 코드에서도 +마찬가지입니다. 따라서, 이 예에서 순서를 지키기 위해서는 smp_store_release() +와 같은 명시적 메모리 배리어가 필요합니다: + + q = READ_ONCE(a); + if (q) { + smp_store_release(&b, p); + do_something(); + } else { + smp_store_release(&b, p); + do_something_else(); + } + +반면에 명시적 메모리 배리어가 없다면, 이런 경우의 순서는 스토어 오퍼레이션들이 +서로 다를 때에만 보장되는데, 예를 들면 다음과 같은 경우입니다: + + q = READ_ONCE(a); + if (q) { + WRITE_ONCE(b, p); + do_something(); + } else { + WRITE_ONCE(b, r); + do_something_else(); + } + +처음의 READ_ONCE() 는 컴파일러가 'a' 의 값을 증명해내는 것을 막기 위해 여전히 +필요합니다. + +또한, 로컬 변수 'q' 를 가지고 하는 일에 대해 주의해야 하는데, 그러지 않으면 +컴파일러는 그 값을 추측하고 또다시 필요한 조건관계를 없애버릴 수 있습니다. 
+예를 들면: + + q = READ_ONCE(a); + if (q % MAX) { + WRITE_ONCE(b, p); + do_something(); + } else { + WRITE_ONCE(b, r); + do_something_else(); + } + +만약 MAX 가 1 로 정의된 상수라면, 컴파일러는 (q % MAX) 는 0이란 것을 알아채고, +위의 코드를 아래와 같이 바꿔버릴 수 있습니다: + + q = READ_ONCE(a); + WRITE_ONCE(b, p); + do_something_else(); + +이렇게 되면, CPU 는 변수 'a' 로부터의 로드와 변수 'b' 로의 스토어 사이의 순서를 +지켜줄 필요가 없어집니다. barrier() 를 추가해 해결해 보고 싶겠지만, 그건 +도움이 안됩니다. 조건 관계는 사라졌고, barrier() 는 이를 되돌리지 못합니다. +따라서, 이 순서를 지켜야 한다면, MAX 가 1 보다 크다는 것을, 다음과 같은 방법을 +사용해 분명히 해야 합니다: + + q = READ_ONCE(a); + BUILD_BUG_ON(MAX <= 1); /* Order load from a with store to b. */ + if (q % MAX) { + WRITE_ONCE(b, p); + do_something(); + } else { + WRITE_ONCE(b, r); + do_something_else(); + } + +'b' 로의 스토어들은 여전히 서로 다름을 알아두세요. 만약 그것들이 동일하면, +앞에서 이야기했듯, 컴파일러가 그 스토어 오퍼레이션들을 'if' 문 바깥으로 +끄집어낼 수 있습니다. + +또한 이진 조건문 평가에 너무 의존하지 않도록 조심해야 합니다. 다음의 예를 +봅시다: + + q = READ_ONCE(a); + if (q || 1 > 0) + WRITE_ONCE(b, 1); + +첫번째 조건만으로는 브랜치 조건 전체를 거짓으로 만들 수 없고 두번째 조건은 항상 +참이기 때문에, 컴파일러는 이 예를 다음과 같이 바꿔서 컨트롤 의존성을 없애버릴 +수 있습니다: + + q = READ_ONCE(a); + WRITE_ONCE(b, 1); + +이 예는 컴파일러가 코드를 추측으로 수정할 수 없도록 분명히 해야 한다는 점을 +강조합니다. 조금 더 일반적으로 말해서, READ_ONCE() 는 컴파일러에게 주어진 로드 +오퍼레이션을 위한 코드를 정말로 만들도록 하지만, 컴파일러가 그렇게 만들어진 +코드의 수행 결과를 사용하도록 강제하지는 않습니다. + +마지막으로, 컨트롤 의존성은 이행성 (transitivity) 을 제공하지 -않습니다-. 이건 +x 와 y 가 둘 다 0 이라는 초기값을 가졌다는 가정 하의 두개의 예제로 +보이겠습니다: + + CPU 0 CPU 1 + ======================= ======================= + r1 = READ_ONCE(x); r2 = READ_ONCE(y); + if (r1 > 0) if (r2 > 0) + WRITE_ONCE(y, 1); WRITE_ONCE(x, 1); + + assert(!(r1 == 1 && r2 == 1)); + +이 두 CPU 예제에서 assert() 의 조건은 항상 참일 것입니다. 그리고, 만약 컨트롤 +의존성이 이행성을 (실제로는 그러지 않지만) 보장한다면, 다음의 CPU 가 추가되어도 +아래의 assert() 조건은 참이 될것입니다: + + CPU 2 + ===================== + WRITE_ONCE(x, 2); + + assert(!(r1 == 2 && r2 == 1 && x == 2)); /* FAILS!!! */ + +하지만 컨트롤 의존성은 이행성을 제공하지 -않기- 때문에, 세개의 CPU 예제가 실행 +완료된 후에 위의 assert() 의 조건은 거짓으로 평가될 수 있습니다. 세개의 CPU +예제가 순서를 지키길 원한다면, CPU 0 와 CPU 1 코드의 로드와 스토어 사이, "if" +문 바로 다음에 smp_mb()를 넣어야 합니다. 더 나아가서, 최초의 두 CPU 예제는 +매우 위험하므로 사용되지 않아야 합니다. + +이 두개의 예제는 다음 논문: +http://www.cl.cam.ac.uk/users/pes20/ppc-supplemental/test6.pdf 와 +이 사이트: https://www.cl.cam.ac.uk/~pes20/ppcmem/index.html 에 나온 LB 와 WWC +리트머스 테스트입니다. + +요약하자면: + + (*) 컨트롤 의존성은 앞의 로드들을 뒤의 스토어들에 대해 순서를 맞춰줍니다. + 하지만, 그 외의 어떤 순서도 보장하지 -않습니다-: 앞의 로드와 뒤의 로드들 + 사이에도, 앞의 스토어와 뒤의 스토어들 사이에도요. 이런 다른 형태의 + 순서가 필요하다면 smp_rmb() 나 smp_wmb()를, 또는, 앞의 스토어들과 뒤의 + 로드들 사이의 순서를 위해서는 smp_mb() 를 사용하세요. + + (*) "if" 문의 양갈래 브랜치가 같은 변수에의 동일한 스토어로 시작한다면, 그 + 스토어들은 각 스토어 앞에 smp_mb() 를 넣거나 smp_store_release() 를 + 사용해서 스토어를 하는 식으로 순서를 맞춰줘야 합니다. 이 문제를 해결하기 + 위해 "if" 문의 양갈래 브랜치의 시작 지점에 barrier() 를 넣는 것만으로는 + 충분한 해결이 되지 않는데, 이는 앞의 예에서 본것과 같이, 컴파일러의 + 최적화는 barrier() 가 의미하는 바를 지키면서도 컨트롤 의존성을 손상시킬 + 수 있기 때문이라는 점을 부디 알아두시기 바랍니다. + + (*) 컨트롤 의존성은 앞의 로드와 뒤의 스토어 사이에 최소 하나의, 실행 + 시점에서의 조건관계를 필요로 하며, 이 조건관계는 앞의 로드와 관계되어야 + 합니다. 만약 컴파일러가 조건 관계를 최적화로 없앨수 있다면, 순서도 + 최적화로 없애버렸을 겁니다. READ_ONCE() 와 WRITE_ONCE() 의 주의 깊은 + 사용은 주어진 조건 관계를 유지하는데 도움이 될 수 있습니다. + + (*) 컨트롤 의존성을 위해선 컴파일러가 조건관계를 없애버리는 것을 막아야 + 합니다. 주의 깊은 READ_ONCE() 나 atomic{,64}_read() 의 사용이 컨트롤 + 의존성이 사라지지 않게 하는데 도움을 줄 수 있습니다. 더 많은 정보를 + 위해선 "컴파일러 배리어" 섹션을 참고하시기 바랍니다. + + (*) 컨트롤 의존성은 보통 다른 타입의 배리어들과 짝을 맞춰 사용됩니다. + + (*) 컨트롤 의존성은 이행성을 제공하지 -않습니다-. 이행성이 필요하다면, + smp_mb() 를 사용하세요. + + +SMP 배리어 짝맞추기 +-------------------- + +CPU 간 상호작용을 다룰 때에 일부 타입의 메모리 배리어는 항상 짝을 맞춰 +사용되어야 합니다. 적절하게 짝을 맞추지 않은 코드는 사실상 에러에 가깝습니다. + +범용 배리어들은 범용 배리어끼리도 짝을 맞추지만 이행성이 없는 대부분의 다른 +타입의 배리어들과도 짝을 맞춥니다. ACQUIRE 배리어는 RELEASE 배리어와 짝을 +맞춥니다만, 둘 다 범용 배리어를 포함해 다른 배리어들과도 짝을 맞출 수 있습니다. 
+쓰기 배리어는 데이터 의존성 배리어나 컨트롤 의존성, ACQUIRE 배리어, RELEASE +배리어, 읽기 배리어, 또는 범용 배리어와 짝을 맞춥니다. 비슷하게 읽기 배리어나 +컨트롤 의존성, 또는 데이터 의존성 배리어는 쓰기 배리어나 ACQUIRE 배리어, +RELEASE 배리어, 또는 범용 배리어와 짝을 맞추는데, 다음과 같습니다: + + CPU 1 CPU 2 + =============== =============== + WRITE_ONCE(a, 1); + <쓰기 배리어> + WRITE_ONCE(b, 2); x = READ_ONCE(b); + <읽기 배리어> + y = READ_ONCE(a); + +또는: + + CPU 1 CPU 2 + =============== =============================== + a = 1; + <쓰기 배리어> + WRITE_ONCE(b, &a); x = READ_ONCE(b); + <데이터 의존성 배리어> + y = *x; + +또는: + + CPU 1 CPU 2 + =============== =============================== + r1 = READ_ONCE(y); + <범용 배리어> + WRITE_ONCE(y, 1); if (r2 = READ_ONCE(x)) { + <묵시적 컨트롤 의존성> + WRITE_ONCE(y, 1); + } + + assert(r1 == 0 || r2 == 0); + +기본적으로, 여기서의 읽기 배리어는 "더 완화된" 타입일 순 있어도 항상 존재해야 +합니다. + +[!] 쓰기 배리어 앞의 스토어 오퍼레이션은 일반적으로 읽기 배리어나 데이터 +의존성 배리어 뒤의 로드 오퍼레이션과 매치될 것이고, 반대도 마찬가지입니다: + + CPU 1 CPU 2 + =================== =================== + WRITE_ONCE(a, 1); }---- --->{ v = READ_ONCE(c); + WRITE_ONCE(b, 2); } \ / { w = READ_ONCE(d); + <쓰기 배리어> \ <읽기 배리어> + WRITE_ONCE(c, 3); } / \ { x = READ_ONCE(a); + WRITE_ONCE(d, 4); }---- --->{ y = READ_ONCE(b); + + +메모리 배리어 시퀀스의 예 +------------------------- + +첫째, 쓰기 배리어는 스토어 오퍼레이션들의 부분적 순서 세우기로 동작합니다. +아래의 이벤트 시퀀스를 보세요: + + CPU 1 + ======================= + STORE A = 1 + STORE B = 2 + STORE C = 3 + <쓰기 배리어> + STORE D = 4 + STORE E = 5 + +이 이벤트 시퀀스는 메모리 일관성 시스템에 원소끼리의 순서가 존재하지 않는 집합 +{ STORE A, STORE B, STORE C } 가 역시 원소끼리의 순서가 존재하지 않는 집합 +{ STORE D, STORE E } 보다 먼저 일어난 것으로 시스템의 나머지 요소들에 보이도록 +전달됩니다: + + +-------+ : : + | | +------+ + | |------>| C=3 | } /\ + | | : +------+ }----- \ -----> 시스템의 나머지 요소에 + | | : | A=1 | } \/ 보여질 수 있는 이벤트들 + | | : +------+ } + | CPU 1 | : | B=2 | } + | | +------+ } + | | wwwwwwwwwwwwwwww } <--- 여기서 쓰기 배리어는 배리어 앞의 + | | +------+ } 모든 스토어가 배리어 뒤의 스토어 + | | : | E=5 | } 전에 메모리 시스템에 전달되도록 + | | : +------+ } 합니다 + | |------>| D=4 | } + | | +------+ + +-------+ : : + | + | CPU 1 에 의해 메모리 시스템에 전달되는 + | 일련의 스토어 오퍼레이션들 + V + + +둘째, 데이터 의존성 배리어는 데이터 의존적 로드 오퍼레이션들의 부분적 순서 +세우기로 동작합니다. 다음 일련의 이벤트들을 보세요: + + CPU 1 CPU 2 + ======================= ======================= + { B = 7; X = 9; Y = 8; C = &Y } + STORE A = 1 + STORE B = 2 + <쓰기 배리어> + STORE C = &B LOAD X + STORE D = 4 LOAD C (gets &B) + LOAD *C (reads B) + +여기에 별다른 개입이 없다면, CPU 1 의 쓰기 배리어에도 불구하고 CPU 2 는 CPU 1 +의 이벤트들을 완전히 무작위적 순서로 인지하게 됩니다: + + +-------+ : : : : + | | +------+ +-------+ | CPU 2 에 인지되는 + | |------>| B=2 |----- --->| Y->8 | | 업데이트 이벤트 + | | : +------+ \ +-------+ | 시퀀스 + | CPU 1 | : | A=1 | \ --->| C->&Y | V + | | +------+ | +-------+ + | | wwwwwwwwwwwwwwww | : : + | | +------+ | : : + | | : | C=&B |--- | : : +-------+ + | | : +------+ \ | +-------+ | | + | |------>| D=4 | ----------->| C->&B |------>| | + | | +------+ | +-------+ | | + +-------+ : : | : : | | + | : : | | + | : : | CPU 2 | + | +-------+ | | + 분명히 잘못된 ---> | | B->7 |------>| | + B 의 값 인지 (!) | +-------+ | | + | : : | | + | +-------+ | | + X 의 로드가 B 의 ---> \ | X->9 |------>| | + 일관성 유지를 \ +-------+ | | + 지연시킴 ----->| B->2 | +-------+ + +-------+ + : : + + +앞의 예에서, CPU 2 는 (B 의 값이 될) *C 의 값 읽기가 C 의 LOAD 뒤에 이어짐에도 +B 가 7 이라는 결과를 얻습니다. 
+ +하지만, 만약 데이터 의존성 배리어가 C 의 로드와 *C (즉, B) 의 로드 사이에 +있었다면: + + CPU 1 CPU 2 + ======================= ======================= + { B = 7; X = 9; Y = 8; C = &Y } + STORE A = 1 + STORE B = 2 + <쓰기 배리어> + STORE C = &B LOAD X + STORE D = 4 LOAD C (gets &B) + <데이터 의존성 배리어> + LOAD *C (reads B) + +다음과 같이 됩니다: + + +-------+ : : : : + | | +------+ +-------+ + | |------>| B=2 |----- --->| Y->8 | + | | : +------+ \ +-------+ + | CPU 1 | : | A=1 | \ --->| C->&Y | + | | +------+ | +-------+ + | | wwwwwwwwwwwwwwww | : : + | | +------+ | : : + | | : | C=&B |--- | : : +-------+ + | | : +------+ \ | +-------+ | | + | |------>| D=4 | ----------->| C->&B |------>| | + | | +------+ | +-------+ | | + +-------+ : : | : : | | + | : : | | + | : : | CPU 2 | + | +-------+ | | + | | X->9 |------>| | + | +-------+ | | + C 로의 스토어 앞의 ---> \ ddddddddddddddddd | | + 모든 이벤트 결과가 \ +-------+ | | + 뒤의 로드에게 ----->| B->2 |------>| | + 보이게 강제한다 +-------+ | | + : : +-------+ + + +셋째, 읽기 배리어는 로드 오퍼레이션들에의 부분적 순서 세우기로 동작합니다. +아래의 일련의 이벤트를 봅시다: + + CPU 1 CPU 2 + ======================= ======================= + { A = 0, B = 9 } + STORE A=1 + <쓰기 배리어> + STORE B=2 + LOAD B + LOAD A + +CPU 1 은 쓰기 배리어를 쳤지만, 별다른 개입이 없다면 CPU 2 는 CPU 1 에서 행해진 +이벤트의 결과를 무작위적 순서로 인지하게 됩니다. + + +-------+ : : : : + | | +------+ +-------+ + | |------>| A=1 |------ --->| A->0 | + | | +------+ \ +-------+ + | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 | + | | +------+ | +-------+ + | |------>| B=2 |--- | : : + | | +------+ \ | : : +-------+ + +-------+ : : \ | +-------+ | | + ---------->| B->2 |------>| | + | +-------+ | CPU 2 | + | | A->0 |------>| | + | +-------+ | | + | : : +-------+ + \ : : + \ +-------+ + ---->| A->1 | + +-------+ + : : + + +하지만, 만약 읽기 배리어가 B 의 로드와 A 의 로드 사이에 존재한다면: + + CPU 1 CPU 2 + ======================= ======================= + { A = 0, B = 9 } + STORE A=1 + <쓰기 배리어> + STORE B=2 + LOAD B + <읽기 배리어> + LOAD A + +CPU 1 에 의해 만들어진 부분적 순서가 CPU 2 에도 그대로 인지됩니다: + + +-------+ : : : : + | | +------+ +-------+ + | |------>| A=1 |------ --->| A->0 | + | | +------+ \ +-------+ + | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 | + | | +------+ | +-------+ + | |------>| B=2 |--- | : : + | | +------+ \ | : : +-------+ + +-------+ : : \ | +-------+ | | + ---------->| B->2 |------>| | + | +-------+ | CPU 2 | + | : : | | + | : : | | + 여기서 읽기 배리어는 ----> \ rrrrrrrrrrrrrrrrr | | + B 로의 스토어 전의 \ +-------+ | | + 모든 결과를 CPU 2 에 ---->| A->1 |------>| | + 보이도록 한다 +-------+ | | + : : +-------+ + + +더 완벽한 설명을 위해, A 의 로드가 읽기 배리어 앞과 뒤에 있으면 어떻게 될지 +생각해 봅시다: + + CPU 1 CPU 2 + ======================= ======================= + { A = 0, B = 9 } + STORE A=1 + <쓰기 배리어> + STORE B=2 + LOAD B + LOAD A [first load of A] + <읽기 배리어> + LOAD A [second load of A] + +A 의 로드 두개가 모두 B 의 로드 뒤에 있지만, 서로 다른 값을 얻어올 수 +있습니다: + + +-------+ : : : : + | | +------+ +-------+ + | |------>| A=1 |------ --->| A->0 | + | | +------+ \ +-------+ + | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 | + | | +------+ | +-------+ + | |------>| B=2 |--- | : : + | | +------+ \ | : : +-------+ + +-------+ : : \ | +-------+ | | + ---------->| B->2 |------>| | + | +-------+ | CPU 2 | + | : : | | + | : : | | + | +-------+ | | + | | A->0 |------>| 1st | + | +-------+ | | + 여기서 읽기 배리어는 ----> \ rrrrrrrrrrrrrrrrr | | + B 로의 스토어 전의 \ +-------+ | | + 모든 결과를 CPU 2 에 ---->| A->1 |------>| 2nd | + 보이도록 한다 +-------+ | | + : : +-------+ + + +하지만 CPU 1 에서의 A 업데이트는 읽기 배리어가 완료되기 전에도 보일 수도 +있긴 합니다: + + +-------+ : : : : + | | +------+ +-------+ + | |------>| A=1 |------ --->| A->0 | + | | +------+ \ +-------+ + | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 | 
+ | | +------+ | +-------+ + | |------>| B=2 |--- | : : + | | +------+ \ | : : +-------+ + +-------+ : : \ | +-------+ | | + ---------->| B->2 |------>| | + | +-------+ | CPU 2 | + | : : | | + \ : : | | + \ +-------+ | | + ---->| A->1 |------>| 1st | + +-------+ | | + rrrrrrrrrrrrrrrrr | | + +-------+ | | + | A->1 |------>| 2nd | + +-------+ | | + : : +-------+ + + +여기서 보장되는 건, 만약 B 의 로드가 B == 2 라는 결과를 봤다면, A 에의 두번째 +로드는 항상 A == 1 을 보게 될 것이라는 겁니다. A 에의 첫번째 로드에는 그런 +보장이 없습니다; A == 0 이거나 A == 1 이거나 둘 중 하나의 결과를 보게 될겁니다. + + +읽기 메모리 배리어 VS 로드 예측 +------------------------------- + +많은 CPU들이 로드를 예측적으로 (speculatively) 합니다: 어떤 데이터를 메모리에서 +로드해야 하게 될지 예측을 했다면, 해당 데이터를 로드하는 인스트럭션을 실제로는 +아직 만나지 않았더라도 다른 로드 작업이 없어 버스 (bus) 가 아무 일도 하고 있지 +않다면, 그 데이터를 로드합니다. 이후에 실제 로드 인스트럭션이 실행되면 CPU 가 +이미 그 값을 가지고 있기 때문에 그 로드 인스트럭션은 즉시 완료됩니다. + +해당 CPU 는 실제로는 그 값이 필요치 않았다는 사실이 나중에 드러날 수도 있는데 - +해당 로드 인스트럭션이 브랜치로 우회되거나 했을 수 있겠죠 - , 그렇게 되면 앞서 +읽어둔 값을 버리거나 나중의 사용을 위해 캐시에 넣어둘 수 있습니다. + +다음을 생각해 봅시다: + + CPU 1 CPU 2 + ======================= ======================= + LOAD B + DIVIDE } 나누기 명령은 일반적으로 + DIVIDE } 긴 시간을 필요로 합니다 + LOAD A + +는 이렇게 될 수 있습니다: + + : : +-------+ + +-------+ | | + --->| B->2 |------>| | + +-------+ | CPU 2 | + : :DIVIDE | | + +-------+ | | + 나누기 하느라 바쁜 ---> --->| A->0 |~~~~ | | + CPU 는 A 의 LOAD 를 +-------+ ~ | | + 예측해서 수행한다 : : ~ | | + : :DIVIDE | | + : : ~ | | + 나누기가 끝나면 ---> ---> : : ~-->| | + CPU 는 해당 LOAD 를 : : | | + 즉각 완료한다 : : +-------+ + + +읽기 배리어나 데이터 의존성 배리어를 두번째 로드 직전에 놓는다면: + + CPU 1 CPU 2 + ======================= ======================= + LOAD B + DIVIDE + DIVIDE + <읽기 배리어> + LOAD A + +예측으로 얻어진 값은 사용된 배리어의 타입에 따라서 해당 값이 옳은지 검토되게 +됩니다. 만약 해당 메모리 영역에 변화가 없었다면, 예측으로 얻어두었던 값이 +사용됩니다: + + : : +-------+ + +-------+ | | + --->| B->2 |------>| | + +-------+ | CPU 2 | + : :DIVIDE | | + +-------+ | | + 나누기 하느라 바쁜 ---> --->| A->0 |~~~~ | | + CPU 는 A 의 LOAD 를 +-------+ ~ | | + 예측한다 : : ~ | | + : :DIVIDE | | + : : ~ | | + : : ~ | | + rrrrrrrrrrrrrrrr~ | | + : : ~ | | + : : ~-->| | + : : | | + : : +-------+ + + +하지만 다른 CPU 에서 업데이트나 무효화가 있었다면, 그 예측은 무효화되고 그 값은 +다시 읽혀집니다: + + : : +-------+ + +-------+ | | + --->| B->2 |------>| | + +-------+ | CPU 2 | + : :DIVIDE | | + +-------+ | | + 나누기 하느라 바쁜 ---> --->| A->0 |~~~~ | | + CPU 는 A 의 LOAD 를 +-------+ ~ | | + 예측한다 : : ~ | | + : :DIVIDE | | + : : ~ | | + : : ~ | | + rrrrrrrrrrrrrrrrr | | + +-------+ | | + 예측성 동작은 무효화 되고 ---> --->| A->1 |------>| | + 업데이트된 값이 다시 읽혀진다 +-------+ | | + : : +-------+ + + +이행성 +------ + +이행성(transitivity)은 실제의 컴퓨터 시스템에서 항상 제공되지는 않는, 순서 +맞추기에 대한 상당히 직관적인 개념입니다. 다음의 예가 이행성을 보여줍니다: + + CPU 1 CPU 2 CPU 3 + ======================= ======================= ======================= + { X = 0, Y = 0 } + STORE X=1 LOAD X STORE Y=1 + <범용 배리어> <범용 배리어> + LOAD Y LOAD X + +CPU 2 의 X 로드가 1을 리턴했고 Y 로드가 0을 리턴했다고 해봅시다. 이는 CPU 2 의 +X 로드가 CPU 1 의 X 스토어 뒤에 이루어졌고 CPU 2 의 Y 로드는 CPU 3 의 Y 스토어 +전에 이루어졌음을 의미합니다. 그럼 "CPU 3 의 X 로드는 0을 리턴할 수 있나요?" + +CPU 2 의 X 로드는 CPU 1 의 스토어 후에 이루어졌으니, CPU 3 의 X 로드는 1을 +리턴하는게 자연스럽습니다. 이런 생각이 이행성의 한 예입니다: CPU A 에서 실행된 +로드가 CPU B 에서의 같은 변수에 대한 로드를 뒤따른다면, CPU A 의 로드는 CPU B +의 로드가 내놓은 값과 같거나 그 후의 값을 내놓아야 합니다. + +리눅스 커널에서 범용 배리어의 사용은 이행성을 보장합니다. 따라서, 앞의 예에서 +CPU 2 의 X 로드가 1을, Y 로드는 0을 리턴했다면, CPU 3 의 X 로드는 반드시 1을 +리턴합니다. + +하지만, 읽기나 쓰기 배리어에 대해서는 이행성이 보장되지 -않습니다-. 
예를 들어, +앞의 예에서 CPU 2 의 범용 배리어가 아래처럼 읽기 배리어로 바뀐 경우를 생각해 +봅시다: + + CPU 1 CPU 2 CPU 3 + ======================= ======================= ======================= + { X = 0, Y = 0 } + STORE X=1 LOAD X STORE Y=1 + <읽기 배리어> <범용 배리어> + LOAD Y LOAD X + +이 코드는 이행성을 갖지 않습니다: 이 예에서는, CPU 2 의 X 로드가 1을 +리턴하고, Y 로드는 0을 리턴하지만 CPU 3 의 X 로드가 0을 리턴하는 것도 완전히 +합법적입니다. + +CPU 2 의 읽기 배리어가 자신의 읽기는 순서를 맞춰줘도, CPU 1 의 스토어와의 +순서를 맞춰준다고는 보장할 수 없다는게 핵심입니다. 따라서, CPU 1 과 CPU 2 가 +버퍼나 캐시를 공유하는 시스템에서 이 예제 코드가 실행된다면, CPU 2 는 CPU 1 이 +쓴 값에 좀 빨리 접근할 수 있을 것입니다. 따라서 CPU 1 과 CPU 2 의 접근으로 +조합된 순서를 모든 CPU 가 동의할 수 있도록 하기 위해 범용 배리어가 필요합니다. + +범용 배리어는 "글로벌 이행성"을 제공해서, 모든 CPU 들이 오퍼레이션들의 순서에 +동의하게 할 것입니다. 반면, release-acquire 조합은 "로컬 이행성" 만을 +제공해서, 해당 조합이 사용된 CPU 들만이 해당 액세스들의 조합된 순서에 동의함이 +보장됩니다. 예를 들어, 존경스런 Herman Hollerith 의 C 코드로 보면: + + int u, v, x, y, z; + + void cpu0(void) + { + r0 = smp_load_acquire(&x); + WRITE_ONCE(u, 1); + smp_store_release(&y, 1); + } + + void cpu1(void) + { + r1 = smp_load_acquire(&y); + r4 = READ_ONCE(v); + r5 = READ_ONCE(u); + smp_store_release(&z, 1); + } + + void cpu2(void) + { + r2 = smp_load_acquire(&z); + smp_store_release(&x, 1); + } + + void cpu3(void) + { + WRITE_ONCE(v, 1); + smp_mb(); + r3 = READ_ONCE(u); + } + +cpu0(), cpu1(), 그리고 cpu2() 는 smp_store_release()/smp_load_acquire() 쌍의 +연결을 통한 로컬 이행성에 동참하고 있으므로, 다음과 같은 결과는 나오지 않을 +겁니다: + + r0 == 1 && r1 == 1 && r2 == 1 + +더 나아가서, cpu0() 와 cpu1() 사이의 release-acquire 관계로 인해, cpu1() 은 +cpu0() 의 쓰기를 봐야만 하므로, 다음과 같은 결과도 없을 겁니다: + + r1 == 1 && r5 == 0 + +하지만, release-acquire 타동성은 동참한 CPU 들에만 적용되므로 cpu3() 에는 +적용되지 않습니다. 따라서, 다음과 같은 결과가 가능합니다: + + r0 == 0 && r1 == 1 && r2 == 1 && r3 == 0 && r4 == 0 + +비슷하게, 다음과 같은 결과도 가능합니다: + + r0 == 0 && r1 == 1 && r2 == 1 && r3 == 0 && r4 == 0 && r5 == 1 + +cpu0(), cpu1(), 그리고 cpu2() 는 그들의 읽기와 쓰기를 순서대로 보게 되지만, +release-acquire 체인에 관여되지 않은 CPU 들은 그 순서에 이견을 가질 수 +있습니다. 이런 이견은 smp_load_acquire() 와 smp_store_release() 의 구현에 +사용되는 완화된 메모리 배리어 인스트럭션들은 항상 배리어 앞의 스토어들을 뒤의 +로드들에 앞세울 필요는 없다는 사실에서 기인합니다. 이 말은 cpu3() 는 cpu0() 의 +u 로의 스토어를 cpu1() 의 v 로부터의 로드 뒤에 일어난 것으로 볼 수 있다는 +뜻입니다, cpu0() 와 cpu1() 은 이 두 오퍼레이션이 의도된 순서대로 일어났음에 +모두 동의하는데도 말입니다. + +하지만, smp_load_acquire() 는 마술이 아님을 명심하시기 바랍니다. 구체적으로, +이 함수는 단순히 순서 규칙을 지키며 인자로부터의 읽기를 수행합니다. 이것은 +어떤 특정한 값이 읽힐 것인지는 보장하지 -않습니다-. 따라서, 다음과 같은 결과도 +가능합니다: + + r0 == 0 && r1 == 0 && r2 == 0 && r5 == 0 + +이런 결과는 어떤 것도 재배치 되지 않는, 순차적 일관성을 가진 가상의 +시스템에서도 일어날 수 있음을 기억해 두시기 바랍니다. + +다시 말하지만, 당신의 코드가 글로벌 이행성을 필요로 한다면, 범용 배리어를 +사용하십시오. + + +================== +명시적 커널 배리어 +================== + +리눅스 커널은 서로 다른 단계에서 동작하는 다양한 배리어들을 가지고 있습니다: + + (*) 컴파일러 배리어. + + (*) CPU 메모리 배리어. + + (*) MMIO 쓰기 배리어. + + +컴파일러 배리어 +--------------- + +리눅스 커널은 컴파일러가 메모리 액세스를 재배치 하는 것을 막아주는 명시적인 +컴파일러 배리어를 가지고 있습니다: + + barrier(); + +이건 범용 배리어입니다 -- barrier() 의 읽기-읽기 나 쓰기-쓰기 변종은 없습니다. +하지만, READ_ONCE() 와 WRITE_ONCE() 는 특정 액세스들에 대해서만 동작하는 +barrier() 의 완화된 형태로 볼 수 있습니다. + +barrier() 함수는 다음과 같은 효과를 갖습니다: + + (*) 컴파일러가 barrier() 뒤의 액세스들이 barrier() 앞의 액세스보다 앞으로 + 재배치되지 못하게 합니다. 예를 들어, 인터럽트 핸들러 코드와 인터럽트 당한 + 코드 사이의 통신을 신중히 하기 위해 사용될 수 있습니다. + + (*) 루프에서, 컴파일러가 루프 조건에 사용된 변수를 매 이터레이션마다 + 메모리에서 로드하지 않아도 되도록 최적화 하는걸 방지합니다. + +READ_ONCE() 와 WRITE_ONCE() 함수는 싱글 쓰레드 코드에서는 문제 없지만 동시성이 +있는 코드에서는 문제가 될 수 있는 모든 최적화를 막습니다. 이런 류의 최적화에 +대한 예를 몇가지 들어보면 다음과 같습니다: + + (*) 컴파일러는 같은 변수에 대한 로드와 스토어를 재배치 할 수 있고, 어떤 + 경우에는 CPU가 같은 변수로부터의 로드들을 재배치할 수도 있습니다. 이는 + 다음의 코드가: + + a[0] = x; + a[1] = x; + + x 의 예전 값이 a[1] 에, 새 값이 a[0] 에 있게 할 수 있다는 뜻입니다. 
+ 컴파일러와 CPU가 이런 일을 못하게 하려면 다음과 같이 해야 합니다: + + a[0] = READ_ONCE(x); + a[1] = READ_ONCE(x); + + 즉, READ_ONCE() 와 WRITE_ONCE() 는 여러 CPU 에서 하나의 변수에 가해지는 + 액세스들에 캐시 일관성을 제공합니다. + + (*) 컴파일러는 같은 변수에 대한 연속적인 로드들을 병합할 수 있습니다. 그런 + 병합 작업으로 컴파일러는 다음의 코드를: + + while (tmp = a) + do_something_with(tmp); + + 다음과 같이, 싱글 쓰레드 코드에서는 말이 되지만 개발자의 의도와 전혀 맞지 + 않는 방향으로 "최적화" 할 수 있습니다: + + if (tmp = a) + for (;;) + do_something_with(tmp); + + 컴파일러가 이런 짓을 하지 못하게 하려면 READ_ONCE() 를 사용하세요: + + while (tmp = READ_ONCE(a)) + do_something_with(tmp); + + (*) 예컨대 레지스터 사용량이 많아 컴파일러가 모든 데이터를 레지스터에 담을 수 + 없는 경우, 컴파일러는 변수를 다시 로드할 수 있습니다. 따라서 컴파일러는 + 앞의 예에서 변수 'tmp' 사용을 최적화로 없애버릴 수 있습니다: + + while (tmp = a) + do_something_with(tmp); + + 이 코드는 다음과 같이 싱글 쓰레드에서는 완벽하지만 동시성이 존재하는 + 경우엔 치명적인 코드로 바뀔 수 있습니다: + + while (a) + do_something_with(a); + + 예를 들어, 최적화된 이 코드는 변수 a 가 다른 CPU 에 의해 "while" 문과 + do_something_with() 호출 사이에 바뀌어 do_something_with() 에 0을 넘길 + 수도 있습니다. + + 이번에도, 컴파일러가 그런 짓을 하는걸 막기 위해 READ_ONCE() 를 사용하세요: + + while (tmp = READ_ONCE(a)) + do_something_with(tmp); + + 레지스터가 부족한 상황을 겪는 경우, 컴파일러는 tmp 를 스택에 저장해둘 수도 + 있습니다. 컴파일러가 변수를 다시 읽어들이는건 이렇게 저장해두고 후에 다시 + 읽어들이는데 드는 오버헤드 때문입니다. 그렇게 하는게 싱글 쓰레드 + 코드에서는 안전하므로, 안전하지 않은 경우에는 컴파일러에게 직접 알려줘야 + 합니다. + + (*) 컴파일러는 그 값이 무엇일지 알고 있다면 로드를 아예 안할 수도 있습니다. + 예를 들어, 다음의 코드는 변수 'a' 의 값이 항상 0임을 증명할 수 있다면: + + while (tmp = a) + do_something_with(tmp); + + 이렇게 최적화 되어버릴 수 있습니다: + + do { } while (0); + + 이 변환은 싱글 쓰레드 코드에서는 도움이 되는데 로드와 브랜치를 제거했기 + 때문입니다. 문제는 컴파일러가 'a' 의 값을 업데이트 하는건 현재의 CPU 하나 + 뿐이라는 가정 위에서 증명을 했다는데 있습니다. 만약 변수 'a' 가 공유되어 + 있다면, 컴파일러의 증명은 틀린 것이 될겁니다. 컴파일러는 그 자신이 + 생각하는 것만큼 많은 것을 알고 있지 못함을 컴파일러에게 알리기 위해 + READ_ONCE() 를 사용하세요: + + while (tmp = READ_ONCE(a)) + do_something_with(tmp); + + 하지만 컴파일러는 READ_ONCE() 뒤에 나오는 값에 대해서도 눈길을 두고 있음을 + 기억하세요. 예를 들어, 다음의 코드에서 MAX 는 전처리기 매크로로, 1의 값을 + 갖는다고 해봅시다: + + while ((tmp = READ_ONCE(a)) % MAX) + do_something_with(tmp); + + 이렇게 되면 컴파일러는 MAX 를 가지고 수행되는 "%" 오퍼레이터의 결과가 항상 + 0이라는 것을 알게 되고, 컴파일러가 코드를 실질적으로는 존재하지 않는 + 것처럼 최적화 하는 것이 허용되어 버립니다. ('a' 변수의 로드는 여전히 + 행해질 겁니다.) + + (*) 비슷하게, 컴파일러는 변수가 저장하려 하는 값을 이미 가지고 있다는 것을 + 알면 스토어 자체를 제거할 수 있습니다. 이번에도, 컴파일러는 현재의 CPU + 만이 그 변수에 값을 쓰는 오로지 하나의 존재라고 생각하여 공유된 변수에 + 대해서는 잘못된 일을 하게 됩니다. 예를 들어, 다음과 같은 경우가 있을 수 + 있습니다: + + a = 0; + ... 변수 a 에 스토어를 하지 않는 코드 ... + a = 0; + + 컴파일러는 변수 'a' 의 값은 이미 0이라는 것을 알고, 따라서 두번째 스토어를 + 삭제할 겁니다. 만약 다른 CPU 가 그 사이 변수 'a' 에 다른 값을 썼다면 + 황당한 결과가 나올 겁니다. + + 컴파일러가 그런 잘못된 추측을 하지 않도록 WRITE_ONCE() 를 사용하세요: + + WRITE_ONCE(a, 0); + ... 변수 a 에 스토어를 하지 않는 코드 ... + WRITE_ONCE(a, 0); + + (*) 컴파일러는 하지 말라고 하지 않으면 메모리 액세스들을 재배치 할 수 + 있습니다. 예를 들어, 다음의 프로세스 레벨 코드와 인터럽트 핸들러 사이의 + 상호작용을 생각해 봅시다: + + void process_level(void) + { + msg = get_message(); + flag = true; + } + + void interrupt_handler(void) + { + if (flag) + process_message(msg); + } + + 이 코드에는 컴파일러가 process_level() 을 다음과 같이 변환하는 것을 막을 + 수단이 없고, 이런 변환은 싱글쓰레드에서라면 실제로 훌륭한 선택일 수 + 있습니다: + + void process_level(void) + { + flag = true; + msg = get_message(); + } + + 이 두개의 문장 사이에 인터럽트가 발생한다면, interrupt_handler() 는 의미를 + 알 수 없는 메세지를 받을 수도 있습니다. 이걸 막기 위해 다음과 같이 + WRITE_ONCE() 를 사용하세요: + + void process_level(void) + { + WRITE_ONCE(msg, get_message()); + WRITE_ONCE(flag, true); + } + + void interrupt_handler(void) + { + if (READ_ONCE(flag)) + process_message(READ_ONCE(msg)); + } + + interrupt_handler() 안에서도 중첩된 인터럽트나 NMI 와 같이 인터럽트 핸들러 + 역시 'flag' 와 'msg' 에 접근하는 또다른 무언가에 인터럽트 될 수 있다면 + READ_ONCE() 와 WRITE_ONCE() 를 사용해야 함을 기억해 두세요. 만약 그런 + 가능성이 없다면, interrupt_handler() 안에서는 문서화 목적이 아니라면 + READ_ONCE() 와 WRITE_ONCE() 는 필요치 않습니다. 
(근래의 리눅스 커널에서 + 중첩된 인터럽트는 보통 잘 일어나지 않음도 기억해 두세요, 실제로, 어떤 + 인터럽트 핸들러가 인터럽트가 활성화된 채로 리턴하면 WARN_ONCE() 가 + 실행됩니다.) + + 컴파일러는 READ_ONCE() 와 WRITE_ONCE() 뒤의 READ_ONCE() 나 WRITE_ONCE(), + barrier(), 또는 비슷한 것들을 담고 있지 않은 코드를 움직일 수 있을 것으로 + 가정되어야 합니다. + + 이 효과는 barrier() 를 통해서도 만들 수 있지만, READ_ONCE() 와 + WRITE_ONCE() 가 좀 더 안목 높은 선택입니다: READ_ONCE() 와 WRITE_ONCE()는 + 컴파일러에 주어진 메모리 영역에 대해서만 최적화 가능성을 포기하도록 + 하지만, barrier() 는 컴파일러가 지금까지 기계의 레지스터에 캐시해 놓은 + 모든 메모리 영역의 값을 버려야 하게 하기 때문입니다. 물론, 컴파일러는 + READ_ONCE() 와 WRITE_ONCE() 가 일어난 순서도 지켜줍니다, CPU 는 당연히 + 그 순서를 지킬 의무가 없지만요. + + (*) 컴파일러는 다음의 예에서와 같이 변수에의 스토어를 날조해낼 수도 있습니다: + + if (a) + b = a; + else + b = 42; + + 컴파일러는 아래와 같은 최적화로 브랜치를 줄일 겁니다: + + b = 42; + if (a) + b = a; + + 싱글 쓰레드 코드에서 이 최적화는 안전할 뿐 아니라 브랜치 갯수를 + 줄여줍니다. 하지만 안타깝게도, 동시성이 있는 코드에서는 이 최적화는 다른 + CPU 가 'b' 를 로드할 때, -- 'a' 가 0이 아닌데도 -- 가짜인 값, 42를 보게 + 되는 경우를 가능하게 합니다. 이걸 방지하기 위해 WRITE_ONCE() 를 + 사용하세요: + + if (a) + WRITE_ONCE(b, a); + else + WRITE_ONCE(b, 42); + + 컴파일러는 로드를 만들어낼 수도 있습니다. 일반적으로는 문제를 일으키지 + 않지만, 캐시 라인 바운싱을 일으켜 성능과 확장성을 떨어뜨릴 수 있습니다. + 날조된 로드를 막기 위해선 READ_ONCE() 를 사용하세요. + + (*) 정렬된 메모리 주소에 위치한, 한번의 메모리 참조 인스트럭션으로 액세스 + 가능한 크기의 데이터는 하나의 큰 액세스가 여러개의 작은 액세스들로 + 대체되는 "로드 티어링(load tearing)" 과 "스토어 티어링(store tearing)" 을 + 방지합니다. 예를 들어, 주어진 아키텍쳐가 7-bit imeediate field 를 갖는 + 16-bit 스토어 인스트럭션을 제공한다면, 컴파일러는 다음의 32-bit 스토어를 + 구현하는데에 두개의 16-bit store-immediate 명령을 사용하려 할겁니다: + + p = 0x00010002; + + 스토어 할 상수를 만들고 그 값을 스토어 하기 위해 두개가 넘는 인스트럭션을 + 사용하게 되는, 이런 종류의 최적화를 GCC 는 실제로 함을 부디 알아 두십시오. + 이 최적화는 싱글 쓰레드 코드에서는 성공적인 최적화 입니다. 실제로, 근래에 + 발생한 (그리고 고쳐진) 버그는 GCC 가 volatile 스토어에 비정상적으로 이 + 최적화를 사용하게 했습니다. 그런 버그가 없다면, 다음의 예에서 + WRITE_ONCE() 의 사용은 스토어 티어링을 방지합니다: + + WRITE_ONCE(p, 0x00010002); + + Packed 구조체의 사용 역시 다음의 예처럼 로드 / 스토어 티어링을 유발할 수 + 있습니다: + + struct __attribute__((__packed__)) foo { + short a; + int b; + short c; + }; + struct foo foo1, foo2; + ... + + foo2.a = foo1.a; + foo2.b = foo1.b; + foo2.c = foo1.c; + + READ_ONCE() 나 WRITE_ONCE() 도 없고 volatile 마킹도 없기 때문에, + 컴파일러는 이 세개의 대입문을 두개의 32-bit 로드와 두개의 32-bit 스토어로 + 변환할 수 있습니다. 이는 'foo1.b' 의 값의 로드 티어링과 'foo2.b' 의 + 스토어 티어링을 초래할 겁니다. 이 예에서도 READ_ONCE() 와 WRITE_ONCE() + 가 티어링을 막을 수 있습니다: + + foo2.a = foo1.a; + WRITE_ONCE(foo2.b, READ_ONCE(foo1.b)); + foo2.c = foo1.c; + +그렇지만, volatile 로 마크된 변수에 대해서는 READ_ONCE() 와 WRITE_ONCE() 가 +필요치 않습니다. 예를 들어, 'jiffies' 는 volatile 로 마크되어 있기 때문에, +READ_ONCE(jiffies) 라고 할 필요가 없습니다. READ_ONCE() 와 WRITE_ONCE() 가 +실은 volatile 캐스팅으로 구현되어 있어서 인자가 이미 volatile 로 마크되어 +있다면 또다른 효과를 내지는 않기 때문입니다. + +이 컴파일러 배리어들은 CPU 에는 직접적 효과를 전혀 만들지 않기 때문에, 결국은 +재배치가 일어날 수도 있음을 부디 기억해 두십시오. + + +CPU 메모리 배리어 +----------------- + +리눅스 커널은 다음의 여덟개 기본 CPU 메모리 배리어를 가지고 있습니다: + + TYPE MANDATORY SMP CONDITIONAL + =============== ======================= =========================== + 범용 mb() smp_mb() + 쓰기 wmb() smp_wmb() + 읽기 rmb() smp_rmb() + 데이터 의존성 read_barrier_depends() smp_read_barrier_depends() + + +데이터 의존성 배리어를 제외한 모든 메모리 배리어는 컴파일러 배리어를 +포함합니다. 데이터 의존성은 컴파일러에의 추가적인 순서 보장을 포함하지 +않습니다. + +방백: 데이터 의존성이 있는 경우, 컴파일러는 해당 로드를 올바른 순서로 일으킬 +것으로 (예: `a[b]` 는 a[b] 를 로드 하기 전에 b 의 값을 먼저 로드한다) +기대되지만, C 언어 사양에는 컴파일러가 b 의 값을 추측 (예: 1 과 같음) 해서 +b 로드 전에 a 로드를 하는 코드 (예: tmp = a[1]; if (b != 1) tmp = a[b]; ) 를 +만들지 않아야 한다는 내용 같은 건 없습니다. 또한 컴파일러는 a[b] 를 로드한 +후에 b 를 또다시 로드할 수도 있어서, a[b] 보다 최신 버전의 b 값을 가질 수도 +있습니다. 이런 문제들의 해결책에 대한 의견 일치는 아직 없습니다만, 일단 +READ_ONCE() 매크로부터 보기 시작하는게 좋은 시작이 될겁니다. + +SMP 메모리 배리어들은 유니프로세서로 컴파일된 시스템에서는 컴파일러 배리어로 +바뀌는데, 하나의 CPU 는 스스로 일관성을 유지하고, 겹치는 액세스들 역시 올바른 +순서로 행해질 것으로 생각되기 때문입니다. 하지만, 아래의 "Virtual Machine +Guests" 서브섹션을 참고하십시오. 
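+
+아래는 위 표의 SMP 조건적 배리어들의 전형적인 사용을 보이기 위해 작성해 본
+간단한 스케치입니다.  여기 사용된 변수 이름 (data, ready) 과 코드의 구조는 이
+문서에 있는 것이 아니라 설명을 위해 임의로 가정한 것입니다:
+
+	/* CPU 1: 데이터를 먼저 쓰고, 그 후에 준비됨 플래그를 씀 */
+	data = 42;
+	smp_wmb();	/* data 스토어가 ready 스토어보다 먼저 보이게 함 */
+	WRITE_ONCE(ready, 1);
+
+	/* CPU 2: 플래그를 먼저 확인하고, 그 후에 데이터를 읽음 */
+	while (!READ_ONCE(ready))
+		cpu_relax();
+	smp_rmb();	/* ready 로드 뒤에 data 로드가 일어나게 함 */
+	do_something_with(data);
+
+유니프로세서로 컴파일된 커널에서 위의 smp_wmb() 와 smp_rmb() 는 앞서 설명된
+것처럼 컴파일러 배리어로만 바뀝니다.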
+ +[!] SMP 시스템에서 공유메모리로의 접근들을 순서 세워야 할 때, SMP 메모리 +배리어는 _반드시_ 사용되어야 함을 기억하세요, 그대신 락을 사용하는 것으로도 +충분하긴 하지만 말이죠. + +Mandatory 배리어들은 SMP 시스템에서도 UP 시스템에서도 SMP 효과만 통제하기에는 +불필요한 오버헤드를 갖기 때문에 SMP 효과만 통제하면 되는 곳에는 사용되지 않아야 +합니다. 하지만, 느슨한 순서 규칙의 메모리 I/O 윈도우를 통한 MMIO 의 효과를 +통제할 때에는 mandatory 배리어들이 사용될 수 있습니다. 이 배리어들은 +컴파일러와 CPU 모두 재배치를 못하도록 함으로써 메모리 오퍼레이션들이 디바이스에 +보여지는 순서에도 영향을 주기 때문에, SMP 가 아닌 시스템이라 할지라도 필요할 수 +있습니다. + + +일부 고급 배리어 함수들도 있습니다: + + (*) smp_store_mb(var, value) + + 이 함수는 특정 변수에 특정 값을 대입하고 범용 메모리 배리어를 칩니다. + UP 컴파일에서는 컴파일러 배리어보다 더한 것을 친다고는 보장되지 않습니다. + + + (*) smp_mb__before_atomic(); + (*) smp_mb__after_atomic(); + + 이것들은 값을 리턴하지 않는 (더하기, 빼기, 증가, 감소와 같은) 어토믹 + 함수들을 위한, 특히 그것들이 레퍼런스 카운팅에 사용될 때를 위한 + 함수들입니다. 이 함수들은 메모리 배리어를 내포하고 있지는 않습니다. + + 이것들은 값을 리턴하지 않으며 어토믹한 (set_bit 과 clear_bit 같은) 비트 + 연산에도 사용될 수 있습니다. + + 한 예로, 객체 하나를 무효한 것으로 표시하고 그 객체의 레퍼런스 카운트를 + 감소시키는 다음 코드를 보세요: + + obj->dead = 1; + smp_mb__before_atomic(); + atomic_dec(&obj->ref_count); + + 이 코드는 객체의 업데이트된 death 마크가 레퍼런스 카운터 감소 동작 + *전에* 보일 것을 보장합니다. + + 더 많은 정보를 위해선 Documentation/atomic_ops.txt 문서를 참고하세요. + 어디서 이것들을 사용해야 할지 궁금하다면 "어토믹 오퍼레이션" 서브섹션을 + 참고하세요. + + + (*) lockless_dereference(); + + 이 함수는 smp_read_barrier_depends() 데이터 의존성 배리어를 사용하는 + 포인터 읽어오기 래퍼(wrapper) 함수로 생각될 수 있습니다. + + 객체의 라이프타임이 RCU 외의 메커니즘으로 관리된다는 점을 제외하면 + rcu_dereference() 와도 유사한데, 예를 들면 객체가 시스템이 꺼질 때에만 + 제거되는 경우 등입니다. 또한, lockless_dereference() 은 RCU 와 함께 + 사용될수도, RCU 없이 사용될 수도 있는 일부 데이터 구조에 사용되고 + 있습니다. + + + (*) dma_wmb(); + (*) dma_rmb(); + + 이것들은 CPU 와 DMA 가능한 디바이스에서 모두 액세스 가능한 공유 메모리의 + 읽기, 쓰기 작업들의 순서를 보장하기 위해 consistent memory 에서 사용하기 + 위한 것들입니다. + + 예를 들어, 디바이스와 메모리를 공유하며, 디스크립터 상태 값을 사용해 + 디스크립터가 디바이스에 속해 있는지 아니면 CPU 에 속해 있는지 표시하고, + 공지용 초인종(doorbell) 을 사용해 업데이트된 디스크립터가 디바이스에 사용 + 가능해졌음을 공지하는 디바이스 드라이버를 생각해 봅시다: + + if (desc->status != DEVICE_OWN) { + /* 디스크립터를 소유하기 전에는 데이터를 읽지 않음 */ + dma_rmb(); + + /* 데이터를 읽고 씀 */ + read_data = desc->data; + desc->data = write_data; + + /* 상태 업데이트 전 수정사항을 반영 */ + dma_wmb(); + + /* 소유권을 수정 */ + desc->status = DEVICE_OWN; + + /* MMIO 를 통해 디바이스에 공지를 하기 전에 메모리를 동기화 */ + wmb(); + + /* 업데이트된 디스크립터의 디바이스에 공지 */ + writel(DESC_NOTIFY, doorbell); + } + + dma_rmb() 는 디스크립터로부터 데이터를 읽어오기 전에 디바이스가 소유권을 + 내놓았음을 보장하게 하고, dma_wmb() 는 디바이스가 자신이 소유권을 다시 + 가졌음을 보기 전에 디스크립터에 데이터가 쓰였음을 보장합니다. wmb() 는 + 캐시 일관성이 없는 (cache incoherent) MMIO 영역에 쓰기를 시도하기 전에 + 캐시 일관성이 있는 메모리 (cache coherent memory) 쓰기가 완료되었음을 + 보장해주기 위해 필요합니다. + + consistent memory 에 대한 자세한 내용을 위해선 Documentation/DMA-API.txt + 문서를 참고하세요. + + +MMIO 쓰기 배리어 +---------------- + +리눅스 커널은 또한 memory-mapped I/O 쓰기를 위한 특별한 배리어도 가지고 +있습니다: + + mmiowb(); + +이것은 mandatory 쓰기 배리어의 변종으로, 완화된 순서 규칙의 I/O 영역에으로의 +쓰기가 부분적으로 순서를 맞추도록 해줍니다. 이 함수는 CPU->하드웨어 사이를 +넘어서 실제 하드웨어에까지 일부 수준의 영향을 끼칩니다. + +더 많은 정보를 위해선 "Acquire vs I/O 액세스" 서브섹션을 참고하세요. + + +========================= +암묵적 커널 메모리 배리어 +========================= + +리눅스 커널의 일부 함수들은 메모리 배리어를 내장하고 있는데, 락(lock)과 +스케쥴링 관련 함수들이 대부분입니다. + +여기선 _최소한의_ 보장을 설명합니다; 특정 아키텍쳐에서는 이 설명보다 더 많은 +보장을 제공할 수도 있습니다만 해당 아키텍쳐에 종속적인 코드 외의 부분에서는 +그런 보장을 기대해선 안될겁니다. + + +락 ACQUISITION 함수 +------------------- + +리눅스 커널은 다양한 락 구성체를 가지고 있습니다: + + (*) 스핀 락 + (*) R/W 스핀 락 + (*) 뮤텍스 + (*) 세마포어 + (*) R/W 세마포어 + +각 구성체마다 모든 경우에 "ACQUIRE" 오퍼레이션과 "RELEASE" 오퍼레이션의 변종이 +존재합니다. 이 오퍼레이션들은 모두 적절한 배리어를 내포하고 있습니다: + + (1) ACQUIRE 오퍼레이션의 영향: + + ACQUIRE 뒤에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 + 뒤에 완료됩니다. + + ACQUIRE 앞에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 후에 + 완료될 수 있습니다. smp_mb__before_spinlock() 뒤에 ACQUIRE 가 실행되는 + 코드 블록은 블록 앞의 스토어를 블록 뒤의 로드와 스토어에 대해 순서 + 맞춥니다. 
이건 smp_mb() 보다 완화된 것임을 기억하세요! 많은 아키텍쳐에서 + smp_mb__before_spinlock() 은 사실 아무일도 하지 않습니다. + + (2) RELEASE 오퍼레이션의 영향: + + RELEASE 앞에서 요청된 메모리 오퍼레이션은 RELEASE 오퍼레이션이 완료되기 + 전에 완료됩니다. + + RELEASE 뒤에서 요청된 메모리 오퍼레이션은 RELEASE 오퍼레이션 완료 전에 + 완료될 수 있습니다. + + (3) ACQUIRE vs ACQUIRE 영향: + + 어떤 ACQUIRE 오퍼레이션보다 앞에서 요청된 모든 ACQUIRE 오퍼레이션은 그 + ACQUIRE 오퍼레이션 전에 완료됩니다. + + (4) ACQUIRE vs RELEASE implication: + + 어떤 RELEASE 오퍼레이션보다 앞서 요청된 ACQUIRE 오퍼레이션은 그 RELEASE + 오퍼레이션보다 먼저 완료됩니다. + + (5) 실패한 조건적 ACQUIRE 영향: + + ACQUIRE 오퍼레이션의 일부 락(lock) 변종은 락이 곧바로 획득하기에는 + 불가능한 상태이거나 락이 획득 가능해지도록 기다리는 도중 시그널을 받거나 + 해서 실패할 수 있습니다. 실패한 락은 어떤 배리어도 내포하지 않습니다. + +[!] 참고: 락 ACQUIRE 와 RELEASE 가 단방향 배리어여서 나타나는 현상 중 하나는 +크리티컬 섹션 바깥의 인스트럭션의 영향이 크리티컬 섹션 내부로도 들어올 수 +있다는 것입니다. + +RELEASE 후에 요청되는 ACQUIRE 는 전체 메모리 배리어라 여겨지면 안되는데, +ACQUIRE 앞의 액세스가 ACQUIRE 후에 수행될 수 있고, RELEASE 후의 액세스가 +RELEASE 전에 수행될 수도 있으며, 그 두개의 액세스가 서로를 지나칠 수도 있기 +때문입니다: + + *A = a; + ACQUIRE M + RELEASE M + *B = b; + +는 다음과 같이 될 수도 있습니다: + + ACQUIRE M, STORE *B, STORE *A, RELEASE M + +ACQUIRE 와 RELEASE 가 락 획득과 해제라면, 그리고 락의 ACQUIRE 와 RELEASE 가 +같은 락 변수에 대한 것이라면, 해당 락을 쥐고 있지 않은 다른 CPU 의 시야에는 +이와 같은 재배치가 일어나는 것으로 보일 수 있습니다. 요약하자면, ACQUIRE 에 +이어 RELEASE 오퍼레이션을 순차적으로 실행하는 행위가 전체 메모리 배리어로 +생각되어선 -안됩니다-. + +비슷하게, 앞의 반대 케이스인 RELEASE 와 ACQUIRE 두개 오퍼레이션의 순차적 실행 +역시 전체 메모리 배리어를 내포하지 않습니다. 따라서, RELEASE, ACQUIRE 로 +규정되는 크리티컬 섹션의 CPU 수행은 RELEASE 와 ACQUIRE 를 가로지를 수 있으므로, +다음과 같은 코드는: + + *A = a; + RELEASE M + ACQUIRE N + *B = b; + +다음과 같이 수행될 수 있습니다: + + ACQUIRE N, STORE *B, STORE *A, RELEASE M + +이런 재배치는 데드락을 일으킬 수도 있을 것처럼 보일 수 있습니다. 하지만, 그런 +데드락의 조짐이 있다면 RELEASE 는 단순히 완료될 것이므로 데드락은 존재할 수 +없습니다. + + 이게 어떻게 올바른 동작을 할 수 있을까요? + + 우리가 이야기 하고 있는건 재배치를 하는 CPU 에 대한 이야기이지, + 컴파일러에 대한 것이 아니란 점이 핵심입니다. 컴파일러 (또는, 개발자) + 가 오퍼레이션들을 이렇게 재배치하면, 데드락이 일어날 수 -있습-니다. + + 하지만 CPU 가 오퍼레이션들을 재배치 했다는걸 생각해 보세요. 이 예에서, + 어셈블리 코드 상으로는 언락이 락을 앞서게 되어 있습니다. CPU 가 이를 + 재배치해서 뒤의 락 오퍼레이션을 먼저 실행하게 됩니다. 만약 데드락이 + 존재한다면, 이 락 오퍼레이션은 그저 스핀을 하며 계속해서 락을 + 시도합니다 (또는, 한참 후에겠지만, 잠듭니다). CPU 는 언젠가는 + (어셈블리 코드에서는 락을 앞서는) 언락 오퍼레이션을 실행하는데, 이 언락 + 오퍼레이션이 잠재적 데드락을 해결하고, 락 오퍼레이션도 뒤이어 성공하게 + 됩니다. + + 하지만 만약 락이 잠을 자는 타입이었다면요? 그런 경우에 코드는 + 스케쥴러로 들어가려 할 거고, 여기서 결국은 메모리 배리어를 만나게 + 되는데, 이 메모리 배리어는 앞의 언락 오퍼레이션이 완료되도록 만들고, + 데드락은 이번에도 해결됩니다. 잠을 자는 행위와 언락 사이의 경주 상황 + (race) 도 있을 수 있겠습니다만, 락 관련 기능들은 그런 경주 상황을 모든 + 경우에 제대로 해결할 수 있어야 합니다. + +락과 세마포어는 UP 컴파일된 시스템에서의 순서에 대해 보장을 하지 않기 때문에, +그런 상황에서 인터럽트 비활성화 오퍼레이션과 함께가 아니라면 어떤 일에도 - 특히 +I/O 액세스와 관련해서는 - 제대로 사용될 수 없을 겁니다. + +"CPU 간 ACQUIRING 배리어 효과" 섹션도 참고하시기 바랍니다. + + +예를 들어, 다음과 같은 코드를 생각해 봅시다: + + *A = a; + *B = b; + ACQUIRE + *C = c; + *D = d; + RELEASE + *E = e; + *F = f; + +여기선 다음의 이벤트 시퀀스가 생길 수 있습니다: + + ACQUIRE, {*F,*A}, *E, {*C,*D}, *B, RELEASE + + [+] {*F,*A} 는 조합된 액세스를 의미합니다. + +하지만 다음과 같은 건 불가능하죠: + + {*F,*A}, *B, ACQUIRE, *C, *D, RELEASE, *E + *A, *B, *C, ACQUIRE, *D, RELEASE, *E, *F + *A, *B, ACQUIRE, *C, RELEASE, *D, *E, *F + *B, ACQUIRE, *C, *D, RELEASE, {*F,*A}, *E + + + +인터럽트 비활성화 함수 +---------------------- + +인터럽트를 비활성화 하는 함수 (ACQUIRE 와 동일) 와 인터럽트를 활성화 하는 함수 +(RELEASE 와 동일) 는 컴파일러 배리어처럼만 동작합니다. 따라서, 별도의 메모리 +배리어나 I/O 배리어가 필요한 상황이라면 그 배리어들은 인터럽트 비활성화 함수 +외의 방법으로 제공되어야만 합니다. + + +슬립과 웨이크업 함수 +-------------------- + +글로벌 데이터에 표시된 이벤트에 의해 프로세스를 잠에 빠트리는 것과 깨우는 것은 +해당 이벤트를 기다리는 태스크의 태스크 상태와 그 이벤트를 알리기 위해 사용되는 +글로벌 데이터, 두 데이터간의 상호작용으로 볼 수 있습니다. 이것이 옳은 순서대로 +일어남을 분명히 하기 위해, 프로세스를 잠에 들게 하는 기능과 깨우는 기능은 +몇가지 배리어를 내포합니다. 
+ +먼저, 잠을 재우는 쪽은 일반적으로 다음과 같은 이벤트 시퀀스를 따릅니다: + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (event_indicated) + break; + schedule(); + } + +set_current_state() 에 의해, 태스크 상태가 바뀐 후 범용 메모리 배리어가 +자동으로 삽입됩니다: + + CPU 1 + =============================== + set_current_state(); + smp_store_mb(); + STORE current->state + <범용 배리어> + LOAD event_indicated + +set_current_state() 는 다음의 것들로 감싸질 수도 있습니다: + + prepare_to_wait(); + prepare_to_wait_exclusive(); + +이것들 역시 상태를 설정한 후 범용 메모리 배리어를 삽입합니다. +앞의 전체 시퀀스는 다음과 같은 함수들로 한번에 수행 가능한데, 이것들은 모두 +올바른 장소에 메모리 배리어를 삽입합니다: + + wait_event(); + wait_event_interruptible(); + wait_event_interruptible_exclusive(); + wait_event_interruptible_timeout(); + wait_event_killable(); + wait_event_timeout(); + wait_on_bit(); + wait_on_bit_lock(); + + +두번째로, 깨우기를 수행하는 코드는 일반적으로 다음과 같을 겁니다: + + event_indicated = 1; + wake_up(&event_wait_queue); + +또는: + + event_indicated = 1; + wake_up_process(event_daemon); + +wake_up() 류에 의해 쓰기 메모리 배리어가 내포됩니다. 만약 그것들이 뭔가를 +깨운다면요. 이 배리어는 태스크 상태가 지워지기 전에 수행되므로, 이벤트를 +알리기 위한 STORE 와 태스크 상태를 TASK_RUNNING 으로 설정하는 STORE 사이에 +위치하게 됩니다. + + CPU 1 CPU 2 + =============================== =============================== + set_current_state(); STORE event_indicated + smp_store_mb(); wake_up(); + STORE current->state <쓰기 배리어> + <범용 배리어> STORE current->state + LOAD event_indicated + +한번더 말합니다만, 이 쓰기 메모리 배리어는 이 코드가 정말로 뭔가를 깨울 때에만 +실행됩니다. 이걸 설명하기 위해, X 와 Y 는 모두 0 으로 초기화 되어 있다는 가정 +하에 아래의 이벤트 시퀀스를 생각해 봅시다: + + CPU 1 CPU 2 + =============================== =============================== + X = 1; STORE event_indicated + smp_mb(); wake_up(); + Y = 1; wait_event(wq, Y == 1); + wake_up(); load from Y sees 1, no memory barrier + load from X might see 0 + +위 예제에서의 경우와 달리 깨우기가 정말로 행해졌다면, CPU 2 의 X 로드는 1 을 +본다고 보장될 수 있을 겁니다. + +사용 가능한 깨우기류 함수들로 다음과 같은 것들이 있습니다: + + complete(); + wake_up(); + wake_up_all(); + wake_up_bit(); + wake_up_interruptible(); + wake_up_interruptible_all(); + wake_up_interruptible_nr(); + wake_up_interruptible_poll(); + wake_up_interruptible_sync(); + wake_up_interruptible_sync_poll(); + wake_up_locked(); + wake_up_locked_poll(); + wake_up_nr(); + wake_up_poll(); + wake_up_process(); + + +[!] 잠재우는 코드와 깨우는 코드에 내포되는 메모리 배리어들은 깨우기 전에 +이루어진 스토어를 잠재우는 코드가 set_current_state() 를 호출한 후에 행하는 +로드에 대해 순서를 맞추지 _않는다는_ 점을 기억하세요. 예를 들어, 잠재우는 +코드가 다음과 같고: + + set_current_state(TASK_INTERRUPTIBLE); + if (event_indicated) + break; + __set_current_state(TASK_RUNNING); + do_something(my_data); + +깨우는 코드는 다음과 같다면: + + my_data = value; + event_indicated = 1; + wake_up(&event_wait_queue); + +event_indecated 에의 변경이 잠재우는 코드에게 my_data 에의 변경 후에 이루어진 +것으로 인지될 것이라는 보장이 없습니다. 이런 경우에는 양쪽 코드 모두 각각의 +데이터 액세스 사이에 메모리 배리어를 직접 쳐야 합니다. 따라서 앞의 재우는 +코드는 다음과 같이: + + set_current_state(TASK_INTERRUPTIBLE); + if (event_indicated) { + smp_rmb(); + do_something(my_data); + } + +그리고 깨우는 코드는 다음과 같이 되어야 합니다: + + my_data = value; + smp_wmb(); + event_indicated = 1; + wake_up(&event_wait_queue); + + +그외의 함수들 +------------- + +그외의 배리어를 내포하는 함수들은 다음과 같습니다: + + (*) schedule() 과 그 유사한 것들이 완전한 메모리 배리어를 내포합니다. + + +============================== +CPU 간 ACQUIRING 배리어의 효과 +============================== + +SMP 시스템에서의 락 기능들은 더욱 강력한 형태의 배리어를 제공합니다: 이 +배리어는 동일한 락을 사용하는 다른 CPU 들의 메모리 액세스 순서에도 영향을 +끼칩니다. 
+ + +ACQUIRE VS 메모리 액세스 +------------------------ + +다음의 예를 생각해 봅시다: 시스템은 두개의 스핀락 (M) 과 (Q), 그리고 세개의 CPU +를 가지고 있습니다; 여기에 다음의 이벤트 시퀀스가 발생합니다: + + CPU 1 CPU 2 + =============================== =============================== + WRITE_ONCE(*A, a); WRITE_ONCE(*E, e); + ACQUIRE M ACQUIRE Q + WRITE_ONCE(*B, b); WRITE_ONCE(*F, f); + WRITE_ONCE(*C, c); WRITE_ONCE(*G, g); + RELEASE M RELEASE Q + WRITE_ONCE(*D, d); WRITE_ONCE(*H, h); + +*A 로의 액세스부터 *H 로의 액세스까지가 어떤 순서로 CPU 3 에게 보여질지에 +대해서는 각 CPU 에서의 락 사용에 의해 내포되어 있는 제약을 제외하고는 어떤 +보장도 존재하지 않습니다. 예를 들어, CPU 3 에게 다음과 같은 순서로 보여지는 +것이 가능합니다: + + *E, ACQUIRE M, ACQUIRE Q, *G, *C, *F, *A, *B, RELEASE Q, *D, *H, RELEASE M + +하지만 다음과 같이 보이지는 않을 겁니다: + + *B, *C or *D preceding ACQUIRE M + *A, *B or *C following RELEASE M + *F, *G or *H preceding ACQUIRE Q + *E, *F or *G following RELEASE Q + + + +ACQUIRE VS I/O 액세스 +---------------------- + +특정한 (특히 NUMA 가 관련된) 환경 하에서 두개의 CPU 에서 동일한 스핀락으로 +보호되는 두개의 크리티컬 섹션 안의 I/O 액세스는 PCI 브릿지에 겹쳐진 I/O +액세스로 보일 수 있는데, PCI 브릿지는 캐시 일관성 프로토콜과 합을 맞춰야 할 +의무가 없으므로, 필요한 읽기 메모리 배리어가 요청되지 않기 때문입니다. + +예를 들어서: + + CPU 1 CPU 2 + =============================== =============================== + spin_lock(Q) + writel(0, ADDR) + writel(1, DATA); + spin_unlock(Q); + spin_lock(Q); + writel(4, ADDR); + writel(5, DATA); + spin_unlock(Q); + +는 PCI 브릿지에 다음과 같이 보일 수 있습니다: + + STORE *ADDR = 0, STORE *ADDR = 4, STORE *DATA = 1, STORE *DATA = 5 + +이렇게 되면 하드웨어의 오동작을 일으킬 수 있습니다. + + +이런 경우엔 잡아둔 스핀락을 내려놓기 전에 mmiowb() 를 수행해야 하는데, 예를 +들면 다음과 같습니다: + + CPU 1 CPU 2 + =============================== =============================== + spin_lock(Q) + writel(0, ADDR) + writel(1, DATA); + mmiowb(); + spin_unlock(Q); + spin_lock(Q); + writel(4, ADDR); + writel(5, DATA); + mmiowb(); + spin_unlock(Q); + +이 코드는 CPU 1 에서 요청된 두개의 스토어가 PCI 브릿지에 CPU 2 에서 요청된 +스토어들보다 먼저 보여짐을 보장합니다. + + +또한, 같은 디바이스에서 스토어를 이어 로드가 수행되면 이 로드는 로드가 수행되기 +전에 스토어가 완료되기를 강제하므로 mmiowb() 의 필요가 없어집니다: + + CPU 1 CPU 2 + =============================== =============================== + spin_lock(Q) + writel(0, ADDR) + a = readl(DATA); + spin_unlock(Q); + spin_lock(Q); + writel(4, ADDR); + b = readl(DATA); + spin_unlock(Q); + + +더 많은 정보를 위해선 Documenataion/DocBook/deviceiobook.tmpl 을 참고하세요. + + +========================= +메모리 배리어가 필요한 곳 +========================= + +설령 SMP 커널을 사용하더라도 싱글 쓰레드로 동작하는 코드는 올바르게 동작하는 +것으로 보여질 것이기 때문에, 평범한 시스템 운영중에 메모리 오퍼레이션 재배치는 +일반적으로 문제가 되지 않습니다. 하지만, 재배치가 문제가 _될 수 있는_ 네가지 +환경이 있습니다: + + (*) 프로세서간 상호 작용. + + (*) 어토믹 오퍼레이션. + + (*) 디바이스 액세스. + + (*) 인터럽트. + + +프로세서간 상호 작용 +-------------------- + +두개 이상의 프로세서를 가진 시스템이 있다면, 시스템의 두개 이상의 CPU 는 동시에 +같은 데이터에 대한 작업을 할 수 있습니다. 이는 동기화 문제를 일으킬 수 있고, +이 문제를 해결하는 일반적 방법은 락을 사용하는 것입니다. 하지만, 락은 상당히 +비용이 비싸서 가능하면 락을 사용하지 않고 일을 처리하는 것이 낫습니다. 이런 +경우, 두 CPU 모두에 영향을 끼치는 오퍼레이션들은 오동작을 막기 위해 신중하게 +순서가 맞춰져야 합니다. + +예를 들어, R/W 세마포어의 느린 수행경로 (slow path) 를 생각해 봅시다. +세마포어를 위해 대기를 하는 하나의 프로세스가 자신의 스택 중 일부를 이 +세마포어의 대기 프로세스 리스트에 링크한 채로 있습니다: + + struct rw_semaphore { + ... + spinlock_t lock; + struct list_head waiters; + }; + + struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + }; + +특정 대기 상태 프로세스를 깨우기 위해, up_read() 나 up_write() 함수는 다음과 +같은 일을 합니다: + + (1) 다음 대기 상태 프로세스 레코드는 어디있는지 알기 위해 이 대기 상태 + 프로세스 레코드의 next 포인터를 읽습니다; + + (2) 이 대기 상태 프로세스의 task 구조체로의 포인터를 읽습니다; + + (3) 이 대기 상태 프로세스가 세마포어를 획득했음을 알리기 위해 task + 포인터를 초기화 합니다; + + (4) 해당 태스크에 대해 wake_up_process() 를 호출합니다; 그리고 + + (5) 해당 대기 상태 프로세스의 task 구조체를 잡고 있던 레퍼런스를 해제합니다. 
+ +달리 말하자면, 다음 이벤트 시퀀스를 수행해야 합니다: + + LOAD waiter->list.next; + LOAD waiter->task; + STORE waiter->task; + CALL wakeup + RELEASE task + +그리고 이 이벤트들이 다른 순서로 수행된다면, 오동작이 일어날 수 있습니다. + +한번 세마포어의 대기줄에 들어갔고 세마포어 락을 놓았다면, 해당 대기 프로세스는 +락을 다시는 잡지 않습니다; 대신 자신의 task 포인터가 초기화 되길 기다립니다. +그 레코드는 대기 프로세스의 스택에 있기 때문에, 리스트의 next 포인터가 읽혀지기 +_전에_ task 포인터가 지워진다면, 다른 CPU 는 해당 대기 프로세스를 시작해 버리고 +up*() 함수가 next 포인터를 읽기 전에 대기 프로세스의 스택을 마구 건드릴 수 +있습니다. + +그렇게 되면 위의 이벤트 시퀀스에 어떤 일이 일어나는지 생각해 보죠: + + CPU 1 CPU 2 + =============================== =============================== + down_xxx() + Queue waiter + Sleep + up_yyy() + LOAD waiter->task; + STORE waiter->task; + Woken up by other event + + Resume processing + down_xxx() returns + call foo() + foo() clobbers *waiter + + LOAD waiter->list.next; + --- OOPS --- + +이 문제는 세마포어 락의 사용으로 해결될 수도 있겠지만, 그렇게 되면 깨어난 후에 +down_xxx() 함수가 불필요하게 스핀락을 또다시 얻어야만 합니다. + +이 문제를 해결하는 방법은 범용 SMP 메모리 배리어를 추가하는 겁니다: + + LOAD waiter->list.next; + LOAD waiter->task; + smp_mb(); + STORE waiter->task; + CALL wakeup + RELEASE task + +이 경우에, 배리어는 시스템의 나머지 CPU 들에게 모든 배리어 앞의 메모리 액세스가 +배리어 뒤의 메모리 액세스보다 앞서 일어난 것으로 보이게 만듭니다. 배리어 앞의 +메모리 액세스들이 배리어 명령 자체가 완료되는 시점까지 완료된다고는 보장하지 +_않습니다_. + +(이게 문제가 되지 않을) 단일 프로세서 시스템에서 smp_mb() 는 실제로는 그저 +컴파일러가 CPU 안에서의 순서를 바꾸거나 하지 않고 주어진 순서대로 명령을 +내리도록 하는 컴파일러 배리어일 뿐입니다. 오직 하나의 CPU 만 있으니, CPU 의 +의존성 순서 로직이 그 외의 모든것을 알아서 처리할 겁니다. + + +어토믹 오퍼레이션 +----------------- + +어토믹 오퍼레이션은 기술적으로 프로세서간 상호작용으로 분류되며 그 중 일부는 +전체 메모리 배리어를 내포하고 또 일부는 내포하지 않지만, 커널에서 상당히 +의존적으로 사용하는 기능 중 하나입니다. + +메모리의 어떤 상태를 수정하고 해당 상태에 대한 (예전의 또는 최신의) 정보를 +리턴하는 어토믹 오퍼레이션은 모두 SMP-조건적 범용 메모리 배리어(smp_mb())를 +실제 오퍼레이션의 앞과 뒤에 내포합니다. 이런 오퍼레이션은 다음의 것들을 +포함합니다: + + xchg(); + atomic_xchg(); atomic_long_xchg(); + atomic_inc_return(); atomic_long_inc_return(); + atomic_dec_return(); atomic_long_dec_return(); + atomic_add_return(); atomic_long_add_return(); + atomic_sub_return(); atomic_long_sub_return(); + atomic_inc_and_test(); atomic_long_inc_and_test(); + atomic_dec_and_test(); atomic_long_dec_and_test(); + atomic_sub_and_test(); atomic_long_sub_and_test(); + atomic_add_negative(); atomic_long_add_negative(); + test_and_set_bit(); + test_and_clear_bit(); + test_and_change_bit(); + + /* exchange 조건이 성공할 때 */ + cmpxchg(); + atomic_cmpxchg(); atomic_long_cmpxchg(); + atomic_add_unless(); atomic_long_add_unless(); + +이것들은 메모리 배리어 효과가 필요한 ACQUIRE 부류와 RELEASE 부류 오퍼레이션들을 +구현할 때, 그리고 객체 해제를 위해 레퍼런스 카운터를 조정할 때, 암묵적 메모리 +배리어 효과가 필요한 곳 등에 사용됩니다. + + +다음의 오퍼레이션들은 메모리 배리어를 내포하지 _않기_ 때문에 문제가 될 수 +있지만, RELEASE 부류의 오퍼레이션들과 같은 것들을 구현할 때 사용될 수도 +있습니다: + + atomic_set(); + set_bit(); + clear_bit(); + change_bit(); + +이것들을 사용할 때에는 필요하다면 적절한 (예를 들면 smp_mb__before_atomic() +같은) 메모리 배리어가 명시적으로 함께 사용되어야 합니다. + + +아래의 것들도 메모리 배리어를 내포하지 _않기_ 때문에, 일부 환경에서는 (예를 +들면 smp_mb__before_atomic() 과 같은) 명시적인 메모리 배리어 사용이 필요합니다. + + atomic_add(); + atomic_sub(); + atomic_inc(); + atomic_dec(); + +이것들이 통계 생성을 위해 사용된다면, 그리고 통계 데이터 사이에 관계가 존재하지 +않는다면 메모리 배리어는 필요치 않을 겁니다. + +객체의 수명을 관리하기 위해 레퍼런스 카운팅 목적으로 사용된다면, 레퍼런스 +카운터는 락으로 보호되는 섹션에서만 조정되거나 호출하는 쪽이 이미 충분한 +레퍼런스를 잡고 있을 것이기 때문에 메모리 배리어는 아마 필요 없을 겁니다. + +만약 어떤 락을 구성하기 위해 사용된다면, 락 관련 동작은 일반적으로 작업을 특정 +순서대로 진행해야 하므로 메모리 배리어가 필요할 수 있습니다. + +기본적으로, 각 사용처에서는 메모리 배리어가 필요한지 아닌지 충분히 고려해야 +합니다. + +아래의 오퍼레이션들은 특별한 락 관련 동작들입니다: + + test_and_set_bit_lock(); + clear_bit_unlock(); + __clear_bit_unlock(); + +이것들은 ACQUIRE 류와 RELEASE 류의 오퍼레이션들을 구현합니다. 락 관련 도구를 +구현할 때에는 이것들을 좀 더 선호하는 편이 나은데, 이것들의 구현은 많은 +아키텍쳐에서 최적화 될 수 있기 때문입니다. + +[!] 
이런 상황에 사용할 수 있는 특수한 메모리 배리어 도구들이 있습니다만, 일부 +CPU 에서는 사용되는 어토믹 인스트럭션 자체에 메모리 배리어가 내포되어 있어서 +어토믹 오퍼레이션과 메모리 배리어를 함께 사용하는 게 불필요한 일이 될 수 +있는데, 그런 경우에 이 특수 메모리 배리어 도구들은 no-op 이 되어 실질적으로 +아무일도 하지 않습니다. + +더 많은 내용을 위해선 Documentation/atomic_ops.txt 를 참고하세요. + + +디바이스 액세스 +--------------- + +많은 디바이스가 메모리 매핑 기법으로 제어될 수 있는데, 그렇게 제어되는 +디바이스는 CPU 에는 단지 특정 메모리 영역의 집합처럼 보이게 됩니다. 드라이버는 +그런 디바이스를 제어하기 위해 정확히 올바른 순서로 올바른 메모리 액세스를 +만들어야 합니다. + +하지만, 액세스들을 재배치 하거나 조합하거나 병합하는게 더 효율적이라 판단하는 +영리한 CPU 나 컴파일러들을 사용하면 드라이버 코드의 조심스럽게 순서 맞춰진 +액세스들이 디바이스에는 요청된 순서대로 도착하지 못하게 할 수 있는 - 디바이스가 +오동작을 하게 할 - 잠재적 문제가 생길 수 있습니다. + +리눅스 커널 내부에서, I/O 는 어떻게 액세스들을 적절히 순차적이게 만들 수 있는지 +알고 있는, - inb() 나 writel() 과 같은 - 적절한 액세스 루틴을 통해 이루어져야만 +합니다. 이것들은 대부분의 경우에는 명시적 메모리 배리어 와 함께 사용될 필요가 +없습니다만, 다음의 두가지 상황에서는 명시적 메모리 배리어가 필요할 수 있습니다: + + (1) 일부 시스템에서 I/O 스토어는 모든 CPU 에 일관되게 순서 맞춰지지 않는데, + 따라서 _모든_ 일반적인 드라이버들에 락이 사용되어야만 하고 이 크리티컬 + 섹션을 빠져나오기 전에 mmiowb() 가 꼭 호출되어야 합니다. + + (2) 만약 액세스 함수들이 완화된 메모리 액세스 속성을 갖는 I/O 메모리 윈도우를 + 사용한다면, 순서를 강제하기 위해선 _mandatory_ 메모리 배리어가 필요합니다. + +더 많은 정보를 위해선 Documentation/DocBook/deviceiobook.tmpl 을 참고하십시오. + + +인터럽트 +-------- + +드라이버는 자신의 인터럽트 서비스 루틴에 의해 인터럽트 당할 수 있기 때문에 +드라이버의 이 두 부분은 서로의 디바이스 제어 또는 액세스 부분과 상호 간섭할 수 +있습니다. + +스스로에게 인터럽트 당하는 걸 불가능하게 하고, 드라이버의 크리티컬한 +오퍼레이션들을 모두 인터럽트가 불가능하게 된 영역에 집어넣거나 하는 방법 (락의 +한 형태) 으로 이런 상호 간섭을 - 최소한 부분적으로라도 - 줄일 수 있습니다. +드라이버의 인터럽트 루틴이 실행 중인 동안, 해당 드라이버의 코어는 같은 CPU 에서 +수행되지 않을 것이며, 현재의 인터럽트가 처리되는 중에는 또다시 인터럽트가 +일어나지 못하도록 되어 있으니 인터럽트 핸들러는 그에 대해서는 락을 잡지 않아도 +됩니다. + +하지만, 어드레스 레지스터와 데이터 레지스터를 갖는 이더넷 카드를 다루는 +드라이버를 생각해 봅시다. 만약 이 드라이버의 코어가 인터럽트를 비활성화시킨 +채로 이더넷 카드와 대화하고 드라이버의 인터럽트 핸들러가 호출되었다면: + + LOCAL IRQ DISABLE + writew(ADDR, 3); + writew(DATA, y); + LOCAL IRQ ENABLE + + writew(ADDR, 4); + q = readw(DATA); + + +만약 순서 규칙이 충분히 완화되어 있다면 데이터 레지스터에의 스토어는 어드레스 +레지스터에 두번째로 행해지는 스토어 뒤에 일어날 수도 있습니다: + + STORE *ADDR = 3, STORE *ADDR = 4, STORE *DATA = y, q = LOAD *DATA + + +만약 순서 규칙이 충분히 완화되어 있고 묵시적으로든 명시적으로든 배리어가 +사용되지 않았다면 인터럽트 비활성화 섹션에서 일어난 액세스가 바깥으로 새어서 +인터럽트 내에서 일어난 액세스와 섞일 수 있다고 - 그리고 그 반대도 - 가정해야만 +합니다. + +그런 영역 안에서 일어나는 I/O 액세스들은 엄격한 순서 규칙의 I/O 레지스터에 +묵시적 I/O 배리어를 형성하는 동기적 (synchronous) 로드 오퍼레이션을 포함하기 +때문에 일반적으로는 이런게 문제가 되지 않습니다. 만약 이걸로는 충분치 않다면 +mmiowb() 가 명시적으로 사용될 필요가 있습니다. + + +하나의 인터럽트 루틴과 별도의 CPU 에서 수행중이며 서로 통신을 하는 두 루틴 +사이에도 비슷한 상황이 일어날 수 있습니다. 만약 그런 경우가 발생할 가능성이 +있다면, 순서를 보장하기 위해 인터럽트 비활성화 락이 사용되어져야만 합니다. + + +====================== +커널 I/O 배리어의 효과 +====================== + +I/O 메모리에 액세스할 때, 드라이버는 적절한 액세스 함수를 사용해야 합니다: + + (*) inX(), outX(): + + 이것들은 메모리 공간보다는 I/O 공간에 이야기를 하려는 의도로 + 만들어졌습니다만, 그건 기본적으로 CPU 마다 다른 컨셉입니다. i386 과 + x86_64 프로세서들은 특별한 I/O 공간 액세스 사이클과 명령어를 실제로 가지고 + 있지만, 다른 많은 CPU 들에는 그런 컨셉이 존재하지 않습니다. + + 다른 것들 중에서도 PCI 버스가 I/O 공간 컨셉을 정의하는데, 이는 - i386 과 + x86_64 같은 CPU 에서 - CPU 의 I/O 공간 컨셉으로 쉽게 매치됩니다. 하지만, + 대체할 I/O 공간이 없는 CPU 에서는 CPU 의 메모리 맵의 가상 I/O 공간으로 + 매핑될 수도 있습니다. + + 이 공간으로의 액세스는 (i386 등에서는) 완전하게 동기화 됩니다만, 중간의 + (PCI 호스트 브리지와 같은) 브리지들은 이를 완전히 보장하진 않을수도 + 있습니다. + + 이것들의 상호간의 순서는 완전하게 보장됩니다. + + 다른 타입의 메모리 오퍼레이션, I/O 오퍼레이션에 대한 순서는 완전하게 + 보장되지는 않습니다. + + (*) readX(), writeX(): + + 이것들이 수행 요청되는 CPU 에서 서로에게 완전히 순서가 맞춰지고 독립적으로 + 수행되는지에 대한 보장 여부는 이들이 액세스 하는 메모리 윈도우에 정의된 + 특성에 의해 결정됩니다. 예를 들어, 최신의 i386 아키텍쳐 머신에서는 MTRR + 레지스터로 이 특성이 조정됩니다. + + 일반적으로는, 프리페치 (prefetch) 가능한 디바이스를 액세스 하는게 + 아니라면, 이것들은 완전히 순서가 맞춰지고 결합되지 않게 보장될 겁니다. + + 하지만, (PCI 브리지와 같은) 중간의 하드웨어는 자신이 원한다면 집행을 + 연기시킬 수 있습니다; 스토어 명령을 실제로 하드웨어로 내려보내기(flush) + 위해서는 같은 위치로부터 로드를 하는 방법이 있습니다만[*], PCI 의 경우는 + 같은 디바이스나 환경 구성 영역에서의 로드만으로도 충분할 겁니다. + + [*] 주의! 
쓰여진 것과 같은 위치로부터의 로드를 시도하는 것은 오동작을 + 일으킬 수도 있습니다 - 예로 16650 Rx/Tx 시리얼 레지스터를 생각해 + 보세요. + + 프리페치 가능한 I/O 메모리가 사용되면, 스토어 명령들이 순서를 지키도록 + 하기 위해 mmiowb() 배리어가 필요할 수 있습니다. + + PCI 트랜잭션 사이의 상호작용에 대해 더 많은 정보를 위해선 PCI 명세서를 + 참고하시기 바랍니다. + + (*) readX_relaxed(), writeX_relaxed() + + 이것들은 readX() 와 writeX() 랑 비슷하지만, 더 완화된 메모리 순서 보장을 + 제공합니다. 구체적으로, 이것들은 일반적 메모리 액세스 (예: DMA 버퍼) 에도 + LOCK 이나 UNLOCK 오퍼레이션들에도 순서를 보장하지 않습니다. LOCK 이나 + UNLOCK 오퍼레이션들에 맞춰지는 순서가 필요하다면, mmiowb() 배리어가 사용될 + 수 있습니다. 같은 주변 장치에의 완화된 액세스끼리는 순서가 지켜짐을 알아 + 두시기 바랍니다. + + (*) ioreadX(), iowriteX() + + 이것들은 inX()/outX() 나 readX()/writeX() 처럼 실제로 수행하는 액세스의 + 종류에 따라 적절하게 수행될 것입니다. + + +=================================== +가정되는 가장 완화된 실행 순서 모델 +=================================== + +컨셉적으로 CPU 는 주어진 프로그램에 대해 프로그램 그 자체에는 인과성 (program +causality) 을 지키는 것처럼 보이게 하지만 일반적으로는 순서를 거의 지켜주지 +않는다고 가정되어야만 합니다. (i386 이나 x86_64 같은) 일부 CPU 들은 코드 +재배치에 (powerpc 나 frv 와 같은) 다른 것들에 비해 강한 제약을 갖지만, 아키텍쳐 +종속적 코드 이외의 코드에서는 순서에 대한 제약이 가장 완화된 경우 (DEC Alpha) +를 가정해야 합니다. + +이 말은, CPU 에게 주어지는 인스트럭션 스트림 내의 한 인스트럭션이 앞의 +인스트럭션에 종속적이라면 앞의 인스트럭션은 뒤의 종속적 인스트럭션이 실행되기 +전에 완료[*]될 수 있어야 한다는 제약 (달리 말해서, 인과성이 지켜지는 것으로 +보이게 함) 외에는 자신이 원하는 순서대로 - 심지어 병렬적으로도 - 그 스트림을 +실행할 수 있음을 의미합니다 + + [*] 일부 인스트럭션은 하나 이상의 영향 - 조건 코드를 바꾼다던지, 레지스터나 + 메모리를 바꾼다던지 - 을 만들어내며, 다른 인스트럭션은 다른 효과에 + 종속적일 수 있습니다. + +CPU 는 최종적으로 아무 효과도 만들지 않는 인스트럭션 시퀀스는 없애버릴 수도 +있습니다. 예를 들어, 만약 두개의 연속되는 인스트럭션이 둘 다 같은 레지스터에 +직접적인 값 (immediate value) 을 집어넣는다면, 첫번째 인스트럭션은 버려질 수도 +있습니다. + + +비슷하게, 컴파일러 역시 프로그램의 인과성만 지켜준다면 인스트럭션 스트림을 +자신이 보기에 올바르다 생각되는대로 재배치 할 수 있습니다. + + +=============== +CPU 캐시의 영향 +=============== + +캐시된 메모리 오퍼레이션들이 시스템 전체에 어떻게 인지되는지는 CPU 와 메모리 +사이에 존재하는 캐시들, 그리고 시스템 상태의 일관성을 관리하는 메모리 일관성 +시스템에 상당 부분 영향을 받습니다. + +한 CPU 가 시스템의 다른 부분들과 캐시를 통해 상호작용한다면, 메모리 시스템은 +CPU 의 캐시들을 포함해야 하며, CPU 와 CPU 자신의 캐시 사이에서의 동작을 위한 +메모리 배리어를 가져야 합니다. (메모리 배리어는 논리적으로는 다음 그림의 +점선에서 동작합니다): + + <--- CPU ---> : <----------- Memory -----------> + : + +--------+ +--------+ : +--------+ +-----------+ + | | | | : | | | | +--------+ + | CPU | | Memory | : | CPU | | | | | + | Core |--->| Access |----->| Cache |<-->| | | | + | | | Queue | : | | | |--->| Memory | + | | | | : | | | | | | + +--------+ +--------+ : +--------+ | | | | + : | Cache | +--------+ + : | Coherency | + : | Mechanism | +--------+ + +--------+ +--------+ : +--------+ | | | | + | | | | : | | | | | | + | CPU | | Memory | : | CPU | | |--->| Device | + | Core |--->| Access |----->| Cache |<-->| | | | + | | | Queue | : | | | | | | + | | | | : | | | | +--------+ + +--------+ +--------+ : +--------+ +-----------+ + : + : + +특정 로드나 스토어는 해당 오퍼레이션을 요청한 CPU 의 캐시 내에서 동작을 완료할 +수도 있기 때문에 해당 CPU 의 바깥에는 보이지 않을 수 있지만, 다른 CPU 가 관심을 +갖는다면 캐시 일관성 메커니즘이 해당 캐시라인을 해당 CPU 에게 전달하고, 해당 +메모리 영역에 대한 오퍼레이션이 발생할 때마다 그 영향을 전파시키기 때문에, 해당 +오퍼레이션은 메모리에 실제로 액세스를 한것처럼 나타날 것입니다. + +CPU 코어는 프로그램의 인과성이 유지된다고만 여겨진다면 인스트럭션들을 어떤 +순서로든 재배치해서 수행할 수 있습니다. 일부 인스트럭션들은 로드나 스토어 +오퍼레이션을 만드는데 이 오퍼레이션들은 이후 수행될 메모리 액세스 큐에 들어가게 +됩니다. 코어는 이 오퍼레이션들을 해당 큐에 어떤 순서로든 원하는대로 넣을 수 +있고, 다른 인스트럭션의 완료를 기다리도록 강제되기 전까지는 수행을 계속합니다. + +메모리 배리어가 하는 일은 CPU 쪽에서 메모리 쪽으로 넘어가는 액세스들의 순서, +그리고 그 액세스의 결과가 시스템의 다른 관찰자들에게 인지되는 순서를 제어하는 +것입니다. + +[!] CPU 들은 항상 그들 자신의 로드와 스토어는 프로그램 순서대로 일어난 것으로 +보기 때문에, 주어진 CPU 내에서는 메모리 배리어를 사용할 필요가 _없습니다_. + +[!] MMIO 나 다른 디바이스 액세스들은 캐시 시스템을 우회할 수도 있습니다. 우회 +여부는 디바이스가 액세스 되는 메모리 윈도우의 특성에 의해 결정될 수도 있고, CPU +가 가지고 있을 수 있는 특수한 디바이스 통신 인스트럭션의 사용에 의해서 결정될 +수도 있습니다. + + +캐시 일관성 +----------- + +하지만 삶은 앞에서 이야기한 것처럼 단순하지 않습니다: 캐시들은 일관적일 것으로 +기대되지만, 그 일관성이 순서에도 적용될 거라는 보장은 없습니다. 
한 CPU 에서 +만들어진 변경 사항은 최종적으로는 시스템의 모든 CPU 에게 보여지게 되지만, 다른 +CPU 들에게도 같은 순서로 보이게 될 거라는 보장은 없다는 뜻입니다. + + +두개의 CPU (1 & 2) 가 달려 있고, 각 CPU 에 두개의 데이터 캐시(CPU 1 은 A/B 를, +CPU 2 는 C/D 를 갖습니다)가 병렬로 연결되어 있는 시스템을 다룬다고 생각해 +봅시다: + + : + : +--------+ + : +---------+ | | + +--------+ : +--->| Cache A |<------->| | + | | : | +---------+ | | + | CPU 1 |<---+ | | + | | : | +---------+ | | + +--------+ : +--->| Cache B |<------->| | + : +---------+ | | + : | Memory | + : +---------+ | System | + +--------+ : +--->| Cache C |<------->| | + | | : | +---------+ | | + | CPU 2 |<---+ | | + | | : | +---------+ | | + +--------+ : +--->| Cache D |<------->| | + : +---------+ | | + : +--------+ + : + +이 시스템이 다음과 같은 특성을 갖는다 생각해 봅시다: + + (*) 홀수번 캐시라인은 캐시 A, 캐시 C 또는 메모리에 위치할 수 있음; + + (*) 짝수번 캐시라인은 캐시 B, 캐시 D 또는 메모리에 위치할 수 있음; + + (*) CPU 코어가 한개의 캐시에 접근하는 동안, 다른 캐시는 - 더티 캐시라인을 + 메모리에 내리거나 추측성 로드를 하거나 하기 위해 - 시스템의 다른 부분에 + 액세스 하기 위해 버스를 사용할 수 있음; + + (*) 각 캐시는 시스템의 나머지 부분들과 일관성을 맞추기 위해 해당 캐시에 + 적용되어야 할 오퍼레이션들의 큐를 가짐; + + (*) 이 일관성 큐는 캐시에 이미 존재하는 라인에 가해지는 평범한 로드에 의해서는 + 비워지지 않는데, 큐의 오퍼레이션들이 이 로드의 결과에 영향을 끼칠 수 있다 + 할지라도 그러함. + +이제, 첫번째 CPU 에서 두개의 쓰기 오퍼레이션을 만드는데, 해당 CPU 의 캐시에 +요청된 순서로 오퍼레이션이 도달됨을 보장하기 위해 두 오퍼레이션 사이에 쓰기 +배리어를 사용하는 상황을 상상해 봅시다: + + CPU 1 CPU 2 COMMENT + =============== =============== ======================================= + u == 0, v == 1 and p == &u, q == &u + v = 2; + smp_wmb(); v 의 변경이 p 의 변경 전에 보일 것을 + 분명히 함 + v 는 이제 캐시 A 에 독점적으로 존재함 + p = &v; + p 는 이제 캐시 B 에 독점적으로 존재함 + +여기서의 쓰기 메모리 배리어는 CPU 1 의 캐시가 올바른 순서로 업데이트 된 것으로 +시스템의 다른 CPU 들이 인지하게 만듭니다. 하지만, 이제 두번째 CPU 가 그 값들을 +읽으려 하는 상황을 생각해 봅시다: + + CPU 1 CPU 2 COMMENT + =============== =============== ======================================= + ... + q = p; + x = *q; + +위의 두개의 읽기 오퍼레이션은 예상된 순서로 일어나지 못할 수 있는데, 두번째 CPU +의 한 캐시에 다른 캐시 이벤트가 발생해 v 를 담고 있는 캐시라인의 해당 캐시에의 +업데이트가 지연되는 사이, p 를 담고 있는 캐시라인은 두번째 CPU 의 다른 캐시에 +업데이트 되어버렸을 수 있기 때문입니다. + + CPU 1 CPU 2 COMMENT + =============== =============== ======================================= + u == 0, v == 1 and p == &u, q == &u + v = 2; + smp_wmb(); + + + p = &v; q = p; + + + + x = *q; + 캐시에 업데이트 되기 전의 v 를 읽음 + + + +기본적으로, 두개의 캐시라인 모두 CPU 2 에 최종적으로는 업데이트 될 것이지만, +별도의 개입 없이는, 업데이트의 순서가 CPU 1 에서 만들어진 순서와 동일할 +것이라는 보장이 없습니다. + + +여기에 개입하기 위해선, 데이터 의존성 배리어나 읽기 배리어를 로드 오퍼레이션들 +사이에 넣어야 합니다. 이렇게 함으로써 캐시가 다음 요청을 처리하기 전에 일관성 +큐를 처리하도록 강제하게 됩니다. + + CPU 1 CPU 2 COMMENT + =============== =============== ======================================= + u == 0, v == 1 and p == &u, q == &u + v = 2; + smp_wmb(); + + + p = &v; q = p; + + + + smp_read_barrier_depends() + + + x = *q; + 캐시에 업데이트 된 v 를 읽음 + + +이런 부류의 문제는 DEC Alpha 계열 프로세서들에서 발견될 수 있는데, 이들은 +데이터 버스를 좀 더 잘 사용해 성능을 개선할 수 있는, 분할된 캐시를 가지고 있기 +때문입니다. 대부분의 CPU 는 하나의 읽기 오퍼레이션의 메모리 액세스가 다른 읽기 +오퍼레이션에 의존적이라면 데이터 의존성 배리어를 내포시킵니다만, 모두가 그런건 +아니기 때문에 이점에 의존해선 안됩니다. + +다른 CPU 들도 분할된 캐시를 가지고 있을 수 있지만, 그런 CPU 들은 평범한 메모리 +액세스를 위해서도 이 분할된 캐시들 사이의 조정을 해야만 합니다. Alpha 는 가장 +약한 메모리 순서 시맨틱 (semantic) 을 선택함으로써 메모리 배리어가 명시적으로 +사용되지 않았을 때에는 그런 조정이 필요하지 않게 했습니다. + + +캐시 일관성 VS DMA +------------------ + +모든 시스템이 DMA 를 하는 디바이스에 대해서까지 캐시 일관성을 유지하지는 +않습니다. 그런 경우, DMA 를 시도하는 디바이스는 RAM 으로부터 잘못된 데이터를 +읽을 수 있는데, 더티 캐시 라인이 CPU 의 캐시에 머무르고 있고, 바뀐 값이 아직 +RAM 에 써지지 않았을 수 있기 때문입니다. 이 문제를 해결하기 위해선, 커널의 +적절한 부분에서 각 CPU 캐시의 문제되는 비트들을 플러시 (flush) 시켜야만 합니다 +(그리고 그것들을 무효화 - invalidation - 시킬 수도 있겠죠). + +또한, 디바이스에 의해 RAM 에 DMA 로 쓰여진 값은 디바이스가 쓰기를 완료한 후에 +CPU 의 캐시에서 RAM 으로 쓰여지는 더티 캐시 라인에 의해 덮어써질 수도 있고, CPU +의 캐시에 존재하는 캐시 라인이 해당 캐시에서 삭제되고 다시 값을 읽어들이기 +전까지는 RAM 이 업데이트 되었다는 사실 자체가 숨겨져 버릴 수도 있습니다. 
이 +문제를 해결하기 위해선, 커널의 적절한 부분에서 각 CPU 의 캐시 안의 문제가 되는 +비트들을 무효화 시켜야 합니다. + +캐시 관리에 대한 더 많은 정보를 위해선 Documentation/cachetlb.txt 를 +참고하세요. + + +캐시 일관성 VS MMIO +------------------- + +Memory mapped I/O 는 일반적으로 CPU 의 메모리 공간 내의 한 윈도우의 특정 부분 +내의 메모리 지역에 이루어지는데, 이 윈도우는 일반적인, RAM 으로 향하는 +윈도우와는 다른 특성을 갖습니다. + +그런 특성 가운데 하나는, 일반적으로 그런 액세스는 캐시를 완전히 우회하고 +디바이스 버스로 곧바로 향한다는 것입니다. 이 말은 MMIO 액세스는 먼저 +시작되어서 캐시에서 완료된 메모리 액세스를 추월할 수 있다는 뜻입니다. 이런 +경우엔 메모리 배리어만으로는 충분치 않고, 만약 캐시된 메모리 쓰기 오퍼레이션과 +MMIO 액세스가 어떤 방식으로든 의존적이라면 해당 캐시는 두 오퍼레이션 사이에 +비워져(flush)야만 합니다. + + +====================== +CPU 들이 저지르는 일들 +====================== + +프로그래머는 CPU 가 메모리 오퍼레이션들을 정확히 요청한대로 수행해 줄 것이라고 +생각하는데, 예를 들어 다음과 같은 코드를 CPU 에게 넘긴다면: + + a = READ_ONCE(*A); + WRITE_ONCE(*B, b); + c = READ_ONCE(*C); + d = READ_ONCE(*D); + WRITE_ONCE(*E, e); + +CPU 는 다음 인스트럭션을 처리하기 전에 현재의 인스트럭션을 위한 메모리 +오퍼레이션을 완료할 것이라 생각하고, 따라서 시스템 외부에서 관찰하기에도 정해진 +순서대로 오퍼레이션이 수행될 것으로 예상합니다: + + LOAD *A, STORE *B, LOAD *C, LOAD *D, STORE *E. + + +당연하지만, 실제로는 훨씬 엉망입니다. 많은 CPU 와 컴파일러에서 앞의 가정은 +성립하지 못하는데 그 이유는 다음과 같습니다: + + (*) 로드 오퍼레이션들은 실행을 계속 해나가기 위해 곧바로 완료될 필요가 있는 + 경우가 많은 반면, 스토어 오퍼레이션들은 종종 별다른 문제 없이 유예될 수 + 있습니다; + + (*) 로드 오퍼레이션들은 예측적으로 수행될 수 있으며, 필요없는 로드였다고 + 증명된 예측적 로드의 결과는 버려집니다; + + (*) 로드 오퍼레이션들은 예측적으로 수행될 수 있으므로, 예상된 이벤트의 + 시퀀스와 다른 시간에 로드가 이뤄질 수 있습니다; + + (*) 메모리 액세스 순서는 CPU 버스와 캐시를 좀 더 잘 사용할 수 있도록 재배치 + 될 수 있습니다; + + (*) 로드와 스토어는 인접한 위치에의 액세스들을 일괄적으로 처리할 수 있는 + 메모리나 I/O 하드웨어 (메모리와 PCI 디바이스 둘 다 이게 가능할 수 + 있습니다) 에 대해 요청되는 경우, 개별 오퍼레이션을 위한 트랜잭션 설정 + 비용을 아끼기 위해 조합되어 실행될 수 있습니다; 그리고 + + (*) 해당 CPU 의 데이터 캐시가 순서에 영향을 끼칠 수도 있고, 캐시 일관성 + 메커니즘이 - 스토어가 실제로 캐시에 도달한다면 - 이 문제를 완화시킬 수는 + 있지만 이 일관성 관리가 다른 CPU 들에도 같은 순서로 전달된다는 보장은 + 없습니다. + +따라서, 앞의 코드에 대해 다른 CPU 가 보는 결과는 다음과 같을 수 있습니다: + + LOAD *A, ..., LOAD {*C,*D}, STORE *E, STORE *B + + ("LOAD {*C,*D}" 는 조합된 로드입니다) + + +하지만, CPU 는 스스로는 일관적일 것을 보장합니다: CPU _자신_ 의 액세스들은 +자신에게는 메모리 배리어가 없음에도 불구하고 정확히 순서 세워진 것으로 보여질 +것입니다. 예를 들어 다음의 코드가 주어졌다면: + + U = READ_ONCE(*A); + WRITE_ONCE(*A, V); + WRITE_ONCE(*A, W); + X = READ_ONCE(*A); + WRITE_ONCE(*A, Y); + Z = READ_ONCE(*A); + +그리고 외부의 영향에 의한 간섭이 없다고 가정하면, 최종 결과는 다음과 같이 +나타날 것이라고 예상될 수 있습니다: + + U == *A 의 최초 값 + X == W + Z == Y + *A == Y + +앞의 코드는 CPU 가 다음의 메모리 액세스 시퀀스를 만들도록 할겁니다: + + U=LOAD *A, STORE *A=V, STORE *A=W, X=LOAD *A, STORE *A=Y, Z=LOAD *A + +하지만, 별다른 개입이 없고 프로그램의 시야에 이 세상이 여전히 일관적이라고 +보인다는 보장만 지켜진다면 이 시퀀스는 어떤 조합으로든 재구성될 수 있으며, 각 +액세스들은 합쳐지거나 버려질 수 있습니다. 일부 아키텍쳐에서 CPU 는 같은 위치에 +대한 연속적인 로드 오퍼레이션들을 재배치 할 수 있기 때문에 앞의 예에서의 +READ_ONCE() 와 WRITE_ONCE() 는 반드시 존재해야 함을 알아두세요. 그런 종류의 +아키텍쳐에서 READ_ONCE() 와 WRITE_ONCE() 는 이 문제를 막기 위해 필요한 일을 +뭐가 됐든지 하게 되는데, 예를 들어 Itanium 에서는 READ_ONCE() 와 WRITE_ONCE() +가 사용하는 volatile 캐스팅은 GCC 가 그런 재배치를 방지하는 특수 인스트럭션인 +ld.acq 와 stl.rel 인스트럭션을 각각 만들어 내도록 합니다. + +컴파일러 역시 이 시퀀스의 액세스들을 CPU 가 보기도 전에 합치거나 버리거나 뒤로 +미뤄버릴 수 있습니다. + +예를 들어: + + *A = V; + *A = W; + +는 다음과 같이 변형될 수 있습니다: + + *A = W; + +따라서, 쓰기 배리어나 WRITE_ONCE() 가 없다면 *A 로의 V 값의 저장의 효과는 +사라진다고 가정될 수 있습니다. 비슷하게: + + *A = Y; + Z = *A; + +는, 메모리 배리어나 READ_ONCE() 와 WRITE_ONCE() 없이는 다음과 같이 변형될 수 +있습니다: + + *A = Y; + Z = Y; + +그리고 이 LOAD 오퍼레이션은 CPU 바깥에는 아예 보이지 않습니다. + + +그리고, ALPHA 가 있다 +--------------------- + +DEC Alpha CPU 는 가장 완화된 메모리 순서의 CPU 중 하나입니다. 뿐만 아니라, +Alpha CPU 의 일부 버전은 분할된 데이터 캐시를 가지고 있어서, 의미적으로 +관계되어 있는 두개의 캐시 라인이 서로 다른 시간에 업데이트 되는게 가능합니다. +이게 데이터 의존성 배리어가 정말 필요해지는 부분인데, 데이터 의존성 배리어는 +메모리 일관성 시스템과 함께 두개의 캐시를 동기화 시켜서, 포인터 변경과 새로운 +데이터의 발견을 올바른 순서로 일어나게 하기 때문입니다. + +리눅스 커널의 메모리 배리어 모델은 Alpha 에 기초해서 정의되었습니다. + +위의 "캐시 일관성" 서브섹션을 참고하세요. 
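+
+아래는 Alpha 의 분할된 캐시에서 데이터 의존성 순서가 왜 중요한지 보이기 위해,
+이 문서에서 설명된 포인터 공개 패턴을 가정하고 작성해 본 간단한 스케치입니다.
+구조체와 변수 이름 (struct foo, gp) 은 설명을 위해 임의로 정한 것입니다:
+
+	struct foo *newp = kmalloc(sizeof(*newp), GFP_KERNEL);
+
+	/* CPU 1: 새 데이터를 초기화 한 후에 포인터를 공개함 */
+	newp->val = 42;
+	smp_wmb();		/* 초기화가 포인터 공개보다 먼저 보이게 함 */
+	WRITE_ONCE(gp, newp);
+
+	/* CPU 2: 포인터를 읽고, 그 포인터를 통해 데이터를 읽음 */
+	struct foo *q = lockless_dereference(gp);
+
+	if (q)
+		do_something_with(q->val);
+
+lockless_dereference() 는 앞서 설명된 것처럼 데이터 의존성 배리어를 내포하므로,
+Alpha 에서도 q 를 통한 로드는 업데이트 된 데이터를 보게 됩니다.  Alpha 이외의
+대부분의 CPU 에서 이 데이터 의존성 배리어는 실질적으로 아무 일도 하지 않습니다.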
+ + +가상 머신 게스트 +---------------- + +가상 머신에서 동작하는 게스트들은 게스트 자체는 SMP 지원 없이 컴파일 되었다 +해도 SMP 영향을 받을 수 있습니다. 이건 UP 커널을 사용하면서 SMP 호스트와 +결부되어 발생하는 부작용입니다. 이 경우에는 mandatory 배리어를 사용해서 문제를 +해결할 수 있겠지만 그런 해결은 대부분의 경우 최적의 해결책이 아닙니다. + +이 문제를 완벽하게 해결하기 위해, 로우 레벨의 virt_mb() 등의 매크로를 사용할 수 +있습니다. 이것들은 SMP 가 활성화 되어 있다면 smp_mb() 등과 동일한 효과를 +갖습니다만, SMP 와 SMP 아닌 시스템 모두에 대해 동일한 코드를 만들어냅니다. +예를 들어, 가상 머신 게스트들은 (SMP 일 수 있는) 호스트와 동기화를 할 때에는 +smp_mb() 가 아니라 virt_mb() 를 사용해야 합니다. + +이것들은 smp_mb() 류의 것들과 모든 부분에서 동일하며, 특히, MMIO 의 영향에 +대해서는 간여하지 않습니다: MMIO 의 영향을 제어하려면, mandatory 배리어를 +사용하시기 바랍니다. + + +======= +사용 예 +======= + +순환식 버퍼 +----------- + +메모리 배리어는 순환식 버퍼를 생성자(producer)와 소비자(consumer) 사이의 +동기화에 락을 사용하지 않고 구현하는데에 사용될 수 있습니다. 더 자세한 내용을 +위해선 다음을 참고하세요: + + Documentation/circular-buffers.txt + + +========= +참고 문헌 +========= + +Alpha AXP Architecture Reference Manual, Second Edition (Sites & Witek, +Digital Press) + Chapter 5.2: Physical Address Space Characteristics + Chapter 5.4: Caches and Write Buffers + Chapter 5.5: Data Sharing + Chapter 5.6: Read/Write Ordering + +AMD64 Architecture Programmer's Manual Volume 2: System Programming + Chapter 7.1: Memory-Access Ordering + Chapter 7.4: Buffering and Combining Memory Writes + +IA-32 Intel Architecture Software Developer's Manual, Volume 3: +System Programming Guide + Chapter 7.1: Locked Atomic Operations + Chapter 7.2: Memory Ordering + Chapter 7.4: Serializing Instructions + +The SPARC Architecture Manual, Version 9 + Chapter 8: Memory Models + Appendix D: Formal Specification of the Memory Models + Appendix J: Programming with the Memory Models + +UltraSPARC Programmer Reference Manual + Chapter 5: Memory Accesses and Cacheability + Chapter 15: Sparc-V9 Memory Models + +UltraSPARC III Cu User's Manual + Chapter 9: Memory Models + +UltraSPARC IIIi Processor User's Manual + Chapter 8: Memory Models + +UltraSPARC Architecture 2005 + Chapter 9: Memory + Appendix D: Formal Specifications of the Memory Models + +UltraSPARC T1 Supplement to the UltraSPARC Architecture 2005 + Chapter 8: Memory Models + Appendix F: Caches and Cache Coherency + +Solaris Internals, Core Kernel Architecture, p63-68: + Chapter 3.3: Hardware Considerations for Locks and + Synchronization + +Unix Systems for Modern Architectures, Symmetric Multiprocessing and Caching +for Kernel Programmers: + Chapter 13: Other Memory Models + +Intel Itanium Architecture Software Developer's Manual: Volume 1: + Section 2.6: Speculation + Section 4.4: Memory Access From a91bf718dbc993ea582cd53c0cb711a0839b4603 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 12 Aug 2016 14:57:12 +0800 Subject: [PATCH 024/538] x86/mm/numa: Open code function early_get_boot_cpu_id() Previously early_acpi_boot_init() was called in early_get_boot_cpu_id() to get the value for boot_cpu_physical_apicid. Now early_acpi_boot_init() has been taken out and moved to setup_arch(), the name of early_get_boot_cpu_id() doesn't match its implementation anymore, and only the getting boot-time SMP configuration code was left. So in this patch we open code it. Also move the smp_found_config check into default_get_smp_config to simplify code, because both early_get_smp_config() and get_smp_config() call x86_init.mpparse.get_smp_config(). Also remove the redundent CONFIG_X86_MPPARSE #ifdef check when we call early_get_smp_config(). Signed-off-by: Baoquan He Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-acpi@vger.kernel.org Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1470985033-22493-1-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 3 +++ arch/x86/kernel/setup.c | 3 +-- arch/x86/mm/amdtopology.c | 22 +++++----------------- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 068c4a929de6..0f8d20497383 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -499,6 +499,9 @@ void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; + if (!smp_found_config) + return; + if (!mpf) return; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa60f5f5a16..cbf56344a0f6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1221,8 +1221,7 @@ void __init setup_arch(char **cmdline_p) /* * get boot-time SMP configuration: */ - if (smp_found_config) - get_smp_config(); + get_smp_config(); prefill_possible_map(); diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index ba47524f56e8..d1c7de095808 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -52,21 +52,6 @@ static __init int find_northbridge(void) return -ENOENT; } -static __init void early_get_boot_cpu_id(void) -{ - /* - * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in amd_scan_nodes() - */ -#ifdef CONFIG_X86_MPPARSE - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - early_get_smp_config(); -#endif -} - int __init amd_numa_init(void) { u64 start = PFN_PHYS(0); @@ -180,8 +165,11 @@ int __init amd_numa_init(void) cores = 1 << bits; apicid_base = 0; - /* get the APIC ID of the BSP early for systems with apicid lifting */ - early_get_boot_cpu_id(); + /* + * get boot-time SMP configuration: + */ + early_get_smp_config(); + if (boot_cpu_physical_apicid > 0) { pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); apicid_base = boot_cpu_physical_apicid; From 6de421198c75d95088331e6a480e952292b0e121 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 12 Aug 2016 14:57:13 +0800 Subject: [PATCH 025/538] x86/apic, ACPI: Remove the repeated lapic address override entry parsing The ACPI MADT has a 32-bit field providing lapic address at which each processor can access its lapic information. MADT also contains an optional entry to provide a 64-bit address to override the 32-bit one. However the current code does the lapic address override entry parsing twice. One is in early_acpi_boot_init() because AMD NUMA need get boot_cpu_id earlier. The other is in acpi_boot_init() which parses all MADT entries. So in this patch we remove the repeated code in the 2nd part. Meanwhile print lapic override entry information like other MADT entry, this will be added to boot log. This patch is not supposed to change any runtime behavior, other than improving kernel messages. Signed-off-by: Baoquan He Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-acpi@vger.kernel.org Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1470985033-22493-2-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 17 ++--------------- arch/x86/kernel/apic/apic.c | 2 +- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 90d84c3eee53..2087bea6b461 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -282,6 +282,8 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) return -EINVAL; + acpi_table_print_madt_entry(header); + acpi_lapic_addr = lapic_addr_ovr->address; return 0; @@ -998,21 +1000,6 @@ static int __init acpi_parse_madt_lapic_entries(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; - /* - * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). - */ - - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, - acpi_parse_lapic_addr_ovr, 0); - if (count < 0) { - printk(KERN_ERR PREFIX - "Error parsing LAPIC address override entry\n"); - return count; - } - - register_lapic_address(acpi_lapic_addr); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, acpi_parse_sapic, MAX_LOCAL_APIC); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index cea4fc19e844..63b748444880 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1825,7 +1825,7 @@ void __init register_lapic_address(unsigned long address) if (!x2apic_mode) { set_fixmap_nocache(FIX_APIC_BASE, address); apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, mp_lapic_addr); + APIC_BASE, address); } if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = read_apic_id(); From 31b02dd718712f4c45afbeea7fbd187ecb1b202c Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 12 Aug 2016 15:21:47 +0800 Subject: [PATCH 026/538] x86/apic, ACPI: Fix incorrect assignment when handling apic/x2apic entries By pure accident the bug makes no functional difference, because the only expression where we are using these values is (!count && !x2count), in which the variables are interchangeable, but it makes sense to fix the bug nevertheless. Signed-off-by: Baoquan He Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-acpi@vger.kernel.org Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1470986507-24191-1-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2087bea6b461..1ad5fe213043 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1018,8 +1018,8 @@ static int __init acpi_parse_madt_lapic_entries(void) return ret; } - x2count = madt_proc[0].count; - count = madt_proc[1].count; + count = madt_proc[0].count; + x2count = madt_proc[1].count; } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); From 33a6c324a7266462f933ab25a92383c882e4b4f1 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 5 Aug 2016 16:55:18 +0200 Subject: [PATCH 027/538] dt-bindings: interrupt-controller: add DT binding for Marvell 7K/8K PIC This commit adds the Device Tree binding description for the PIC interrupt controller available in the ARM64 Marvell Armada 7K/8K SoCs. Signed-off-by: Thomas Petazzoni Acked-by: Rob Herring Link: https://lkml.kernel.org/r/1470408921-447-2-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper --- .../marvell,armada-8k-pic.txt | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 Documentation/devicetree/bindings/interrupt-controller/marvell,armada-8k-pic.txt diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,armada-8k-pic.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,armada-8k-pic.txt new file mode 100644 index 000000000000..86a7b4cd03f5 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/marvell,armada-8k-pic.txt @@ -0,0 +1,25 @@ +Marvell Armada 7K/8K PIC Interrupt controller +--------------------------------------------- + +This is the Device Tree binding for the PIC, a secondary interrupt +controller available on the Marvell Armada 7K/8K ARM64 SoCs, and +typically connected to the GIC as the primary interrupt controller. + +Required properties: +- compatible: should be "marvell,armada-8k-pic" +- interrupt-controller: identifies the node as an interrupt controller +- #interrupt-cells: the number of cells to define interrupts on this + controller. 
Should be 1 +- reg: the register area for the PIC interrupt controller +- interrupts: the interrupt to the primary interrupt controller, + typically the GIC + +Example: + + pic: interrupt-controller@3f0100 { + compatible = "marvell,armada-8k-pic"; + reg = <0x3f0100 0x10>; + #interrupt-cells = <1>; + interrupt-controller; + interrupts = ; + }; From 1fc770d5899c995db8e22d35eb918a2cb79559d9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 15 Aug 2016 12:14:10 -0400 Subject: [PATCH 028/538] sched: Remove struct rq::nohz_stamp The nohz_stamp member of struct rq has been unused since 2010, when this commit removed the code that referenced it: 396e894d289d ("sched: Revert nohz_ratelimit() for now") Signed-off-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160815121410.5ea1c98f@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc5114004..afe76d04e916 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -597,7 +597,6 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_load_update_tick; #endif /* CONFIG_SMP */ - u64 nohz_stamp; unsigned long nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL From 94f438c84e850570f28dd36588a0d7f73b991e44 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Aug 2016 12:54:59 +0200 Subject: [PATCH 029/538] sched/core: Clarify SD_flags comment The SD_flags comment is very terse and doesn't explain why PACKING is odd. IIRC the distinction is that the 'normal' ones only describe topology, while the ASYM_PACKING one also prescribes behaviour. It is odd in the way that it doesn't only describe things. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/20160815105459.GS6879@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b6b23c57418..54fff8109922 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6355,13 +6355,19 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain + * These flags are purely descriptive of the topology and do not prescribe + * behaviour. 
Behaviour is artificial and mapped in the below sd_init() + * function: * - * Odd one out: - * SD_ASYM_PACKING - describes SMT quirks + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: + * + * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ From 0e6d2a67a41321b3ef650b780a279a37855de08e Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:21 +0100 Subject: [PATCH 030/538] sched/core: Remove unnecessary NULL-pointer check Checking if the sched_domain pointer returned by sd_init() is NULL seems pointless as sd_init() neither checks if it is valid to begin with nor set it to NULL. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 54fff8109922..1b2dd5220170 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6854,8 +6854,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, struct sched_domain *child, int cpu) { struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { From 1f6e6c7cb9bcd58abb5ee11243e0eefe6b36fc8e Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:22 +0100 Subject: [PATCH 031/538] sched/core: Introduce SD_ASYM_CPUCAPACITY sched_domain topology flag Add a topology flag to the sched_domain hierarchy indicating the lowest domain level where the full range of CPU capacities is represented by the domain members for asymmetric capacity topologies (e.g. ARM big.LITTLE). The flag is intended to indicate that extra care should be taken when placing tasks on CPUs and this level spans all the different types of CPUs found in the system (no need to look further up the domain hierarchy). This information is currently only available through iterating through the capacities of all the CPUs at parent levels in the sched_domain hierarchy. SD 2 [ 0 1 2 3] SD_ASYM_CPUCAPACITY SD 1 [ 0 1] [ 2 3] !SD_ASYM_CPUCAPACITY CPU: 0 1 2 3 capacity: 756 756 1024 1024 If the topology in the example above is duplicated to create an eight CPU example with third sched_domain level on top (SD 3), this level should not have the flag set (!SD_ASYM_CPUCAPACITY) as its two group would both have all CPU capacities represented within them. 
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-6-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 7f64e89a5873..d75024053e9b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1022,6 +1022,7 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ +#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ #define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1b2dd5220170..46bfb90aec00 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5716,6 +5716,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) @@ -5746,6 +5747,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | @@ -6363,6 +6365,7 @@ static int sched_domains_curr_level; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -6374,6 +6377,7 @@ static int sched_domains_curr_level; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * From 3676b13e8524c576825fe1e731e347dba0083888 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:23 +0100 Subject: [PATCH 032/538] sched/core: Pass child domain into sd_init() If behavioural sched_domain flags depend on topology flags set at higher domain levels we need a way to update the child domain flags. Moving the child pointer assignment inside sd_init() should make that possible. 
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46bfb90aec00..57394650c6ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6381,7 +6381,8 @@ static int sched_domains_curr_level; SD_SHARE_POWERDOMAIN) static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, + struct sched_domain *child, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); int sd_weight, sd_flags = 0; @@ -6433,6 +6434,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, + .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, #endif @@ -6857,14 +6859,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu); + struct sched_domain *sd = sd_init(tl, child, cpu); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; - sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { From 9ee1cda5ee25c7dd82acf25892e0d229e818f8c7 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:24 +0100 Subject: [PATCH 033/538] sched/core: Enable SD_BALANCE_WAKE for asymmetric capacity systems A domain with the SD_ASYM_CPUCAPACITY flag set indicate that sched_groups at this level and below do not include CPUs of all capacities available (e.g. group containing little-only or big-only CPUs in big.LITTLE systems). It is therefore necessary to put in more effort in finding an appropriate CPU at task wake-up by enabling balancing at wake-up (SD_BALANCE_WAKE) on all lower (child) levels. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-8-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 57394650c6ab..4695df6ed752 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6444,6 +6444,13 @@ sd_init(struct sched_domain_topology_level *tl, * Convert topological properties into behaviour. 
*/ + if (sd->flags & SD_ASYM_CPUCAPACITY) { + struct sched_domain *t = sd; + + for_each_lower_domain(t) + t->flags |= SD_BALANCE_WAKE; + } + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; From cd92bfd3b8cb0ec2ee825e55a3aee704cd55aea9 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 1 Aug 2016 19:53:35 +0100 Subject: [PATCH 034/538] sched/core: Store maximum per-CPU capacity in root domain To be able to compare the capacity of the target CPU with the highest available CPU capacity, store the maximum per-CPU capacity in the root domain. The max per-CPU capacity should be 1024 for all systems except SMT, where the capacity is currently based on smt_gain and the number of hardware threads and is <1024. If SMT can be brought to work with a per-thread capacity of 1024, this patch can be dropped and replaced by a hard-coded max capacity of 1024 (=SCHED_CAPACITY_SCALE). Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/26c69258-9947-f830-a53e-0c54e7750646@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ++++++++++++ kernel/sched/sched.h | 2 ++ 2 files changed, 14 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4695df6ed752..69243142cad1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6903,6 +6903,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; + struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -6953,11 +6954,22 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); + + /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ + if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) + WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); + cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); + if (rq) { + pr_info("span: %*pbl (max cpu_capacity = %lu)\n", + cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); + } + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index afe76d04e916..420c05d099c3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -565,6 +565,8 @@ struct root_domain { */ cpumask_var_t rto_mask; struct cpupri cpupri; + + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain; From 3273163c6775c4c21823985304c2364b08ca6ea2 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:26 +0100 Subject: [PATCH 035/538] sched/fair: Let asymmetric CPU configurations balance at wake-up Currently, SD_WAKE_AFFINE always takes priority over wakeup balancing if SD_BALANCE_WAKE is set on the sched_domains. For asymmetric configurations SD_WAKE_AFFINE is only desirable if the waking task's compute demand (utilization) is suitable for the waking CPU and the previous CPU, and all CPUs within their respective SD_SHARE_PKG_RESOURCES domains (sd_llc). If not, let wakeup balancing take over (find_idlest_{group, cpu}()). 
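In numbers, with the ~20% capacity margin introduced further down in this patch (capacity_margin = 1280 against the 1024 capacity scale), the decision behaves roughly as in this stand-alone sketch (plain C, example values only, not the kernel code):

  #include <stdbool.h>
  #include <stdio.h>

  /* Mirrors the patch: utilization and capacity are scaled by 1024. */
  #define CAPACITY_MARGIN 1280    /* util * 1024 < capacity * margin, ~20% headroom */

  /* Stand-alone version of the wake_cap() decision, example numbers only. */
  static bool wake_cap(long task_util, long cap_prev, long cap_cur, long max_cap)
  {
          long min_cap = cap_prev < cap_cur ? cap_prev : cap_cur;

          /* All candidate CPUs are close to the biggest one: keep wake_affine. */
          if (max_cap - min_cap < max_cap >> 3)
                  return false;

          /* Task does not comfortably fit the smaller CPU: use the slow path. */
          return min_cap * 1024 < task_util * CAPACITY_MARGIN;
  }

  int main(void)
  {
          /* big.LITTLE-style example: little CPUs 756, big CPUs 1024. */
          printf("util 300, little<->little: %d\n", wake_cap(300,  756,  756, 1024)); /* 0 */
          printf("util 700, little<->little: %d\n", wake_cap(700,  756,  756, 1024)); /* 1 */
          printf("util 700, symmetric SMP:   %d\n", wake_cap(700, 1024, 1024, 1024)); /* 0 */
          return 0;
  }

A non-zero result means the affine fast path is skipped and find_idlest_{group,cpu}() gets to place the task.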
This patch makes affine wake-ups conditional on whether both the waker CPU and the previous CPU has sufficient capacity for the waking task, or not, assuming that the CPU capacities within an SD_SHARE_PKG_RESOURCES domain (sd_llc) are homogeneous. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-10-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index acdc351d2386..61d485421bed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * 1024 < capacity * margin + */ +unsigned int capacity_margin = 1280; /* ~20% */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -5376,6 +5382,32 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } +static inline int task_util(struct task_struct *p) +{ + return p->se.avg.util_avg; +} + +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. + */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ + long min_cap, max_cap; + + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; + + /* Minimum capacity is close to max, no need to abort wake_affine */ + if (max_cap - min_cap < max_cap >> 3) + return 0; + + return min_cap * 1024 < task_util(p) * capacity_margin; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5399,7 +5431,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); } rcu_read_lock(); From fe7bd58f5d25d5d655b1da4a084cc4ef6f085fee Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 17 Aug 2016 16:00:33 +0800 Subject: [PATCH 036/538] x86/ioapic: Change prototype of acpi_ioapic_add() Change the argument of acpi_ioapic_add() to a generic ACPI handle, and move its prototype from drivers/acpi/internal.h to include/linux/acpi.h so that it can be called from outside the pci_root driver. 
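With a plain handle in the prototype, a caller outside the pci_root driver only needs the ACPI handle of a device; roughly (illustrative fragment, not part of this patch):

  #include <linux/acpi.h>
  #include <linux/device.h>

  /* Illustrative only: any code holding a struct device could now do this. */
  static void example_add_ioapics(struct device *dev)
  {
          acpi_handle handle = ACPI_HANDLE(dev); /* NULL if not an ACPI device */

          if (handle)
                  acpi_ioapic_add(handle);       /* stub returns 0 when not configured */
  }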
Signed-off-by: Rui Wang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhelgaas@google.com Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1471420837-31003-2-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/acpi/internal.h | 2 -- drivers/acpi/ioapic.c | 6 +++--- drivers/acpi/pci_root.c | 2 +- include/linux/acpi.h | 6 ++++++ 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 940218ff0193..f26fc1d7cfea 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -40,10 +40,8 @@ int acpi_sysfs_init(void); void acpi_container_init(void); void acpi_memory_hotplug_init(void); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC -int acpi_ioapic_add(struct acpi_pci_root *root); int acpi_ioapic_remove(struct acpi_pci_root *root); #else -static inline int acpi_ioapic_add(struct acpi_pci_root *root) { return 0; } static inline int acpi_ioapic_remove(struct acpi_pci_root *root) { return 0; } #endif #ifdef CONFIG_ACPI_DOCK diff --git a/drivers/acpi/ioapic.c b/drivers/acpi/ioapic.c index ccdc8db16bb8..2449377a6e7c 100644 --- a/drivers/acpi/ioapic.c +++ b/drivers/acpi/ioapic.c @@ -189,13 +189,13 @@ static acpi_status handle_ioapic_add(acpi_handle handle, u32 lvl, return AE_OK; } -int acpi_ioapic_add(struct acpi_pci_root *root) +int acpi_ioapic_add(acpi_handle root_handle) { acpi_status status, retval = AE_OK; - status = acpi_walk_namespace(ACPI_TYPE_DEVICE, root->device->handle, + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, root_handle, UINT_MAX, handle_ioapic_add, NULL, - root->device->handle, (void **)&retval); + root_handle, (void **)&retval); return ACPI_SUCCESS(status) && ACPI_SUCCESS(retval) ? 0 : -ENODEV; } diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index d144168d4ef9..b07eda1e7b05 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -614,7 +614,7 @@ static int acpi_pci_root_add(struct acpi_device *device, if (hotadd) { pcibios_resource_survey_bus(root->bus); pci_assign_unassigned_root_bus_resources(root->bus); - acpi_ioapic_add(root); + acpi_ioapic_add(root->device->handle); } pci_lock_rescan_remove(); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4d8452c2384b..c9a596b9535c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -751,6 +751,12 @@ static inline int acpi_reconfig_notifier_unregister(struct notifier_block *nb) #endif /* !CONFIG_ACPI */ +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC +int acpi_ioapic_add(acpi_handle root); +#else +static inline int acpi_ioapic_add(acpi_handle root) { return 0; } +#endif + #ifdef CONFIG_ACPI void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, u32 pm1a_ctrl, u32 pm1b_ctrl)); From 584c5c422f6c749ced1e0bc3c6837f650f64e1e1 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 17 Aug 2016 16:00:34 +0800 Subject: [PATCH 037/538] x86/ioapic: Support hot-removal of IOAPICs present during boot IOAPICs present during system boot aren't added to ioapic_list, thus are unable to be hot-removed. Fix it by calling acpi_ioapic_add() during root bus enumeration. 
Signed-off-by: Rui Wang Acked-by: Bjorn Helgaas Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1471420837-31003-3-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/acpi/pci_root.c | 10 ++++++++++ drivers/pci/setup-bus.c | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index b07eda1e7b05..bf601d4df8cf 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -614,6 +614,16 @@ static int acpi_pci_root_add(struct acpi_device *device, if (hotadd) { pcibios_resource_survey_bus(root->bus); pci_assign_unassigned_root_bus_resources(root->bus); + /* + * This is only called for the hotadd case. For the boot-time + * case, we need to wait until after PCI initialization in + * order to deal with IOAPICs mapped in on a PCI BAR. + * + * This is currently x86-specific, because acpi_ioapic_add() + * is an empty function without CONFIG_ACPI_HOTPLUG_IOAPIC. + * And CONFIG_ACPI_HOTPLUG_IOAPIC depends on CONFIG_X86_IO_APIC + * (see drivers/acpi/Kconfig). + */ acpi_ioapic_add(root->device->handle); } diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index c74059e10a6d..ec538d3d2bd5 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "pci.h" unsigned int pci_flags; @@ -1852,8 +1853,10 @@ void __init pci_assign_unassigned_resources(void) { struct pci_bus *root_bus; - list_for_each_entry(root_bus, &pci_root_buses, node) + list_for_each_entry(root_bus, &pci_root_buses, node) { pci_assign_unassigned_root_bus_resources(root_bus); + acpi_ioapic_add(ACPI_HANDLE(root_bus->bridge)); + } } void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge) From 6ab7eba5db93c11d61f6f7fbe21edbc875b26c1a Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 17 Aug 2016 16:00:35 +0800 Subject: [PATCH 038/538] x86/ioapic: Fix setup_res() failing to get resource acpi_dev_filter_resource_type() returns 0 on success, and 1 on failure. A return value of zero means there's a matching resource, so we should continue within setup_res() to get the resource. 
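The convention is easy to get backwards; a minimal stand-alone illustration (hypothetical filter with the same 0-on-match return value, not the ACPI implementation):

  #include <stdbool.h>
  #include <stdio.h>

  /* Hypothetical filter following the same convention: 0 on match, 1 otherwise. */
  static int filter_resource_type(bool res_is_mem, bool want_mem)
  {
          return res_is_mem == want_mem ? 0 : 1;
  }

  int main(void)
  {
          bool res_is_mem = true;         /* the resource really is a memory range */

          /* Buggy form ("== 0"): returns early exactly when the type matches. */
          if (filter_resource_type(res_is_mem, true) == 0)
                  printf("buggy check: matching memory resource skipped\n");

          /* Fixed form: bail out only when the resource is NOT the wanted type. */
          if (filter_resource_type(res_is_mem, true))
                  printf("fixed check: non-memory resource skipped\n");
          else
                  printf("fixed check: memory resource processed\n");

          return 0;
  }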
Signed-off-by: Rui Wang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhelgaas@google.com Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1471420837-31003-4-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/acpi/ioapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/ioapic.c b/drivers/acpi/ioapic.c index 2449377a6e7c..8ab6d426c178 100644 --- a/drivers/acpi/ioapic.c +++ b/drivers/acpi/ioapic.c @@ -46,7 +46,7 @@ static acpi_status setup_res(struct acpi_resource *acpi_res, void *data) struct resource_win win; res->flags = 0; - if (acpi_dev_filter_resource_type(acpi_res, IORESOURCE_MEM) == 0) + if (acpi_dev_filter_resource_type(acpi_res, IORESOURCE_MEM)) return AE_OK; if (!acpi_dev_resource_memory(acpi_res, res)) { From 162b83bd5f1d7124e21da78bcf2685b9824d9ef0 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 17 Aug 2016 16:00:36 +0800 Subject: [PATCH 039/538] x86/ioapic: Fix lost IOAPIC resource after hot-removal and hotadd IOAPIC resource at 0xfecxxxxx gets lost from /proc/iomem after hot-removing and then hot-adding the IOAPIC device. After system boot, in /proc/iomem: fec00000-fecfffff : PNP0003:00 fec00000-fec003ff : IOAPIC 0 fec01000-fec013ff : IOAPIC 1 fec40000-fec403ff : IOAPIC 2 fec80000-fec803ff : IOAPIC 3 fecc0000-fecc03ff : IOAPIC 4 Then hot-remove IOAPIC 2 and hot-add it again: fec00000-fecfffff : PNP0003:00 fec00000-fec003ff : IOAPIC 0 fec01000-fec013ff : IOAPIC 1 fec80000-fec803ff : IOAPIC 3 fecc0000-fecc03ff : IOAPIC 4 The range at 0xfec40000 is lost from /proc/iomem - which is a bug. This bug happens because handle_ioapic_add() requests resources from either PCI config BAR or ACPI "_CRS", not both. But Intel platforms map the IOxAPIC registers both at the PCI config BAR (called MBAR, dynamic), and at the ACPI "_CRS" (called ABAR, static). The 0xfecX_YZ00 to 0xfecX_YZFF range appears in "_CRS" of each IOAPIC device. Both ranges should be claimed from /proc/iomem for exclusive use. 
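Condensed, the flow after this change is roughly the following (pseudo-kernel C, error paths and cleanup trimmed, see the real diff below): both windows are claimed, and the IOAPIC is registered at the PCI BAR when one was obtained, falling back to the "_CRS" range otherwise.

  /* Rough shape of handle_ioapic_add() after this patch, details trimmed. */
  if (dev && !pci_request_region(dev, 0, type))
          pci_res = &dev->resource[0];                    /* claim MBAR */

  acpi_walk_resources(handle, METHOD_NAME__CRS, setup_res, crs_res);
  if (!crs_res->flags || request_resource(&iomem_resource, crs_res))
          goto exit_release;                              /* ABAR must be claimed too */

  res = (pci_res && pci_res->flags) ? pci_res : crs_res;  /* prefer the PCI BAR */
  if (acpi_register_ioapic(handle, res->start, (u32)gsi_base))
          goto exit_release;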
Signed-off-by: Rui Wang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhelgaas@google.com Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1471420837-31003-5-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/acpi/ioapic.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/drivers/acpi/ioapic.c b/drivers/acpi/ioapic.c index 8ab6d426c178..ee201111e063 100644 --- a/drivers/acpi/ioapic.c +++ b/drivers/acpi/ioapic.c @@ -97,7 +97,7 @@ static acpi_status handle_ioapic_add(acpi_handle handle, u32 lvl, unsigned long long gsi_base; struct acpi_pci_ioapic *ioapic; struct pci_dev *dev = NULL; - struct resource *res = NULL; + struct resource *res = NULL, *pci_res = NULL, *crs_res; char *type = NULL; if (!acpi_is_ioapic(handle, &type)) @@ -137,23 +137,28 @@ static acpi_status handle_ioapic_add(acpi_handle handle, u32 lvl, pci_set_master(dev); if (pci_request_region(dev, 0, type)) goto exit_disable; - res = &dev->resource[0]; + pci_res = &dev->resource[0]; ioapic->pdev = dev; } else { pci_dev_put(dev); dev = NULL; + } - res = &ioapic->res; - acpi_walk_resources(handle, METHOD_NAME__CRS, setup_res, res); - if (res->flags == 0) { - acpi_handle_warn(handle, "failed to get resource\n"); - goto exit_free; - } else if (request_resource(&iomem_resource, res)) { - acpi_handle_warn(handle, "failed to insert resource\n"); - goto exit_free; - } + crs_res = &ioapic->res; + acpi_walk_resources(handle, METHOD_NAME__CRS, setup_res, crs_res); + if (crs_res->flags == 0) { + acpi_handle_warn(handle, "failed to get resource\n"); + goto exit_release; + } else if (request_resource(&iomem_resource, crs_res)) { + acpi_handle_warn(handle, "failed to insert resource\n"); + goto exit_release; } + /* try pci resource first, then "_CRS" resource */ + res = pci_res; + if (!res || !res->flags) + res = crs_res; + if (acpi_register_ioapic(handle, res->start, (u32)gsi_base)) { acpi_handle_warn(handle, "failed to register IOAPIC\n"); goto exit_release; @@ -174,14 +179,13 @@ static acpi_status handle_ioapic_add(acpi_handle handle, u32 lvl, exit_release: if (dev) pci_release_region(dev, 0); - else - release_resource(res); + if (ioapic->res.flags && ioapic->res.parent) + release_resource(&ioapic->res); exit_disable: if (dev) pci_disable_device(dev); exit_put: pci_dev_put(dev); -exit_free: kfree(ioapic); exit: mutex_unlock(&ioapic_list_lock); @@ -217,9 +221,9 @@ int acpi_ioapic_remove(struct acpi_pci_root *root) pci_release_region(ioapic->pdev, 0); pci_disable_device(ioapic->pdev); pci_dev_put(ioapic->pdev); - } else if (ioapic->res.flags && ioapic->res.parent) { - release_resource(&ioapic->res); } + if (ioapic->res.flags && ioapic->res.parent) + release_resource(&ioapic->res); list_del(&ioapic->list); kfree(ioapic); } From 624cad9d2907a0788b56e3ca664c5d7d02645ed4 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 17 Aug 2016 16:00:37 +0800 Subject: [PATCH 040/538] x86/ioapic: Fix IOAPIC failing to request resource handle_ioapic_add() uses request_resource() to request ACPI "_CRS" resources. 
This can fail with the following error message: [ 247.325693] ACPI: \_SB_.IIO1.AID1: failed to insert resource This happens when there are multiple IOAPICs and DSDT groups their "_CRS" resources as the children of a parent resource, as seen from /proc/iomem: fec00000-fecfffff : PNP0003:00 fec00000-fec003ff : IOAPIC 0 fec01000-fec013ff : IOAPIC 1 fec40000-fec403ff : IOAPIC 2 In this case request_resource() fails because there's a conflicting resource which is the parent (fec0000-fecfffff). Fix it by using insert_resource() which can request resources by taking the conflicting resource as the parent. Signed-off-by: Rui Wang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhelgaas@google.com Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1471420837-31003-6-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/acpi/ioapic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/ioapic.c b/drivers/acpi/ioapic.c index ee201111e063..6d7ce6e12aaa 100644 --- a/drivers/acpi/ioapic.c +++ b/drivers/acpi/ioapic.c @@ -146,10 +146,12 @@ static acpi_status handle_ioapic_add(acpi_handle handle, u32 lvl, crs_res = &ioapic->res; acpi_walk_resources(handle, METHOD_NAME__CRS, setup_res, crs_res); + crs_res->name = type; + crs_res->flags |= IORESOURCE_BUSY; if (crs_res->flags == 0) { acpi_handle_warn(handle, "failed to get resource\n"); goto exit_release; - } else if (request_resource(&iomem_resource, crs_res)) { + } else if (insert_resource(&iomem_resource, crs_res)) { acpi_handle_warn(handle, "failed to insert resource\n"); goto exit_release; } From a1eb1411b4e4251db02179e39d234c2ee5192c72 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 17 Aug 2016 11:30:44 +0200 Subject: [PATCH 041/538] sched/cputime: Improve scalability by not accounting thread group tasks pending runtime Commit: d670ec13178d0 ("posix-cpu-timers: Cure SMP wobbles") started accounting thread group tasks pending runtime in thread_group_cputime(). Another commit: 6e998916dfe32 ("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency") updated scheduler runtime statistics (call update_curr()) when reading task pending runtime. Those changes cause bad performance of SYS_times() and SYS_clock_gettimes(CLOCK_PROCESS_CPUTIME_ID) syscalls, especially on larger systems with many CPUs. While we would like to have cpuclock monotonicity kept i.e. have problems fixed by above commits stay fixed, we also would like to have good performance. However when we notice that change from commit d670ec13178d0 is not longer needed to solve problem addressed by that commit, because of change from the second commit 6e998916dfe32, we can get room for optimization. Since we update task while reading it's pending runtime in task_sched_runtime(), clock_gettime(CLOCK_PROCESS_CPUTIME_ID) will see updated values and on testcase from d670ec13178d0 process cpuclock will not be smaller than thread cpuclock. I tested the patch on testcases from commits d670ec13178d0, 6e998916dfe32 and some other cpuclock/cputimers testcases and did not found cpuclock monotonicity problems or other malfunction. This patch has the drawback that we will not provide thread group cputime up-to-date to the last moment. For example when arming cputime timer, we will arm it with possibly a bit outdated values and that timer will trigger earlier compared to behaviour without the patch. 
However that was the behaviour before d670ec13178d0 commit (kernel v3.1) so it's unlikely to affect applications. Patch improves related syscall performance, as measured by Giovanni's benchmarks described in commit: 6075620b0590e ("sched/cputime: Mitigate performance regression in times()/clock_gettime()") The benchmark results are: SYS_clock_gettime(): threads 4.7-rc7 3.18-rc3 4.7-rc7 + prefetch 4.7-rc7 + patch (pre-6e998916dfe3) 2 3.48 2.23 ( 35.68%) 3.06 ( 11.83%) 1.08 ( 68.81%) 5 3.33 2.83 ( 14.84%) 3.25 ( 2.40%) 0.71 ( 78.55%) 8 3.37 2.84 ( 15.80%) 3.26 ( 3.30%) 0.56 ( 83.49%) 12 3.32 3.09 ( 6.69%) 3.37 ( -1.60%) 0.42 ( 87.28%) 21 4.01 3.14 ( 21.70%) 3.90 ( 2.74%) 0.35 ( 91.35%) 30 3.63 3.28 ( 9.75%) 3.36 ( 7.41%) 0.28 ( 92.23%) 48 3.71 3.02 ( 18.69%) 3.11 ( 16.27%) 0.39 ( 89.39%) 79 3.75 2.88 ( 23.23%) 3.16 ( 15.74%) 0.46 ( 87.76%) 110 3.81 2.95 ( 22.62%) 3.25 ( 14.80%) 0.56 ( 85.41%) 128 3.88 3.05 ( 21.28%) 3.31 ( 14.76%) 0.62 ( 84.10%) SYS_times(): threads 4.7-rc7 3.18-rc3 4.7-rc7 + prefetch 4.7-rc7 + patch (pre-6e998916dfe3) 2 3.65 2.27 ( 37.94%) 3.25 ( 11.03%) 1.62 ( 55.71%) 5 3.45 2.78 ( 19.34%) 3.17 ( 7.92%) 2.33 ( 32.28%) 8 3.52 2.79 ( 20.66%) 3.22 ( 8.69%) 2.06 ( 41.44%) 12 3.29 3.02 ( 8.33%) 3.36 ( -2.04%) 2.00 ( 39.18%) 21 4.07 3.10 ( 23.86%) 3.92 ( 3.78%) 2.07 ( 49.18%) 30 3.87 3.33 ( 13.80%) 3.40 ( 12.17%) 1.89 ( 51.12%) 48 3.79 2.96 ( 21.94%) 3.16 ( 16.61%) 1.69 ( 55.46%) 79 3.88 2.88 ( 25.82%) 3.28 ( 15.42%) 1.60 ( 58.81%) 110 3.90 2.98 ( 23.73%) 3.38 ( 13.35%) 1.73 ( 55.61%) 128 4.00 3.10 ( 22.40%) 3.38 ( 15.45%) 1.66 ( 58.52%) Reported-and-tested-by: Giovanni Gherdovich Signed-off-by: Stanislaw Gruszka Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/20160817093043.GA25206@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a846cf89eb96..b93c72d5f64f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -306,6 +306,26 @@ static inline cputime_t account_other_time(cputime_t max) return accounted; } +#ifdef CONFIG_64BIT +static inline u64 read_sum_exec_runtime(struct task_struct *t) +{ + return t->se.sum_exec_runtime; +} +#else +static u64 read_sum_exec_runtime(struct task_struct *t) +{ + u64 ns; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(t, &rf); + ns = t->se.sum_exec_runtime; + task_rq_unlock(rq, t, &rf); + + return ns; +} +#endif + /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live * tasks (sum on group iteration) belonging to @tsk's group. @@ -318,6 +338,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) unsigned int seq, nextseq; unsigned long flags; + /* + * Update current task runtime to account pending time since last + * scheduler action or thread_group_cputime() call. This thread group + * might have other running tasks on different CPUs, but updating + * their runtime can affect syscall performance, so we skip account + * those pending times and rely only on values updated on tick or + * other scheduler action. + */ + if (same_thread_group(current, tsk)) + (void) task_sched_runtime(current); + rcu_read_lock(); /* Attempt a lockless read on the first round. 
*/ nextseq = 0; @@ -332,7 +363,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); + times->sum_exec_runtime += read_sum_exec_runtime(t); } /* If lockless access failed, take the lock. */ nextseq = 1; From 3942a9bd7b5842a924e99ee6ec1350b8006c94ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2016 18:54:13 +0200 Subject: [PATCH 042/538] locking, rcu, cgroup: Avoid synchronize_sched() in __cgroup_procs_write() The current percpu-rwsem read side is entirely free of serializing insns at the cost of having a synchronize_sched() in the write path. The latency of the synchronize_sched() is too high for cgroups. The commit 1ed1328792ff talks about the write path being a fairly cold path but this is not the case for Android which moves task to the foreground cgroup and back around binder IPC calls from foreground processes to background processes, so it is significantly hotter than human initiated operations. Switch cgroup_threadgroup_rwsem into the slow mode for now to avoid the problem, hopefully it should not be that slow after another commit: 80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce global impact"). We could just add rcu_sync_enter() into cgroup_init() but we do not want another synchronize_sched() at boot time, so this patch adds the new helper which doesn't block but currently can only be called before the first use. Reported-by: John Stultz Reported-by: Dmitry Shmidt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Colin Cross Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rom Lemarchand Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Link: http://lkml.kernel.org/r/20160811165413.GA22807@redhat.com Signed-off-by: Ingo Molnar --- include/linux/rcu_sync.h | 1 + kernel/cgroup.c | 6 ++++++ kernel/rcu/sync.c | 12 ++++++++++++ 3 files changed, 19 insertions(+) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index a63a33e6196e..ece7ed9a4a70 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) } extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); +extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..9f51cdf58f5a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5606,6 +5606,12 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + /* + * The latency of the synchronize_sched() is too high for cgroups, + * avoid it at the cost of forcing all readers into the slow path. + */ + rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + get_user_ns(init_cgroup_ns.user_ns); mutex_lock(&cgroup_mutex); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 198473d90f81..50d1861f7759 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -84,6 +84,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) rsp->gp_type = type; } +/** + * Must be called after rcu_sync_init() and before first use. 
+ * + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() + * pairs turn into NO-OPs. + */ +void rcu_sync_enter_start(struct rcu_sync *rsp) +{ + rsp->gp_count++; + rsp->gp_state = GP_PASSED; +} + /** * rcu_sync_enter() - Force readers onto slowpath * @rsp: Pointer to rcu_sync structure to use for synchronization From 84b23f9b58687a11ced66cc4be9b0219e8ecab84 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 5 Aug 2016 01:04:43 -0700 Subject: [PATCH 043/538] locking/rwsem: Return void in __rwsem_mark_wake() We currently return a rw_semaphore structure, which is the same lock we passed to the function's argument in the first place. While there are several functions that choose this return value, the callers use it, for example, for things like ERR_PTR. This is not the case for __rwsem_mark_wake(), and in addition this function is really about the lock waiters (which we know there are at this point), so its somewhat odd to be returning the sem structure. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: dave@stgolabs.net Cc: jason.low2@hpe.com Cc: wanpeng.li@hotmail.com Link: http://lkml.kernel.org/r/1470384285-32163-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 447e08de1fab..b03623172277 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -121,16 +121,17 @@ enum rwsem_wake_type { * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false */ -static struct rw_semaphore * -__rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) +static void __rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) { struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - long oldcount, woken, loop, adjustment; + long loop, oldcount, woken = 0, adjustment = 0; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { /* @@ -142,19 +143,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem, */ wake_q_add(wake_q, waiter->task); } - goto out; + + return; } - /* Writers might steal the lock before we grant it to the next reader. + /* + * Writers might steal the lock before we grant it to the next reader. * We prefer to do the first reader grant before counting readers * so we can bail out early if a writer stole the lock. */ - adjustment = 0; if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { /* * If the count is still less than RWSEM_WAITING_BIAS @@ -164,7 +165,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem, */ if (atomic_long_add_return(-adjustment, &sem->count) < RWSEM_WAITING_BIAS) - goto out; + return; + /* Last active locker left. Retry waking readers. 
*/ goto try_reader_grant; } @@ -176,11 +178,11 @@ __rwsem_mark_wake(struct rw_semaphore *sem, rwsem_set_reader_owned(sem); } - /* Grant an infinite number of read locks to the readers at the front + /* + * Grant an infinite number of read locks to the readers at the front * of the queue. Note we increment the 'active part' of the count by * the number of readers before waking any processes up. */ - woken = 0; do { woken++; @@ -219,9 +221,6 @@ __rwsem_mark_wake(struct rw_semaphore *sem, sem->wait_list.next = next; next->prev = &sem->wait_list; - - out: - return sem; } /* @@ -255,7 +254,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) if (count == RWSEM_WAITING_BIAS || (count > RWSEM_WAITING_BIAS && adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); @@ -505,7 +504,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) if (count > RWSEM_WAITING_BIAS) { WAKE_Q(wake_q); - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock * is released, but given that we are proactively waking @@ -616,7 +615,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); @@ -640,7 +639,7 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); From c2867bbaf5d8f1534cae15175a389c5cbf58fec1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 5 Aug 2016 01:04:44 -0700 Subject: [PATCH 044/538] locking/rwsem: Remove a few useless comments Our rwsem code (xadd, at least) is rather well documented, but there are a few really annoying comments in there that serve no purpose and we shouldn't bother with them. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: dave@stgolabs.net Cc: jason.low2@hpe.com Cc: wanpeng.li@hotmail.com Link: http://lkml.kernel.org/r/1470384285-32163-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b03623172277..e02fe3289b5a 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -234,7 +234,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) struct task_struct *tsk = current; WAKE_Q(wake_q); - /* set up my own style of waitqueue */ waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; @@ -613,7 +612,6 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); locked: - /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); @@ -637,7 +635,6 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); From 70800c3c0cc525baa38fd0fe4660f2c27f1bfeeb Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 5 Aug 2016 01:04:45 -0700 Subject: [PATCH 045/538] locking/rwsem: Scan the wait_list for readers only once When wanting to wakeup readers, __rwsem_mark_wakeup() currently iterates the wait_list twice while looking to wakeup the first N queued reader-tasks. While this can be quite inefficient, it was there such that a awoken reader would be first and foremost acknowledged by the lock counter. Keeping the same logic, we can further benefit from the use of wake_qs and avoid entirely the first wait_list iteration that sets the counter as wake_up_process() isn't going to occur right away, and therefore we maintain the counter->list order of going about things. Other than saving cycles with O(n) "scanning", this change also nicely cleans up a good chunk of __rwsem_mark_wakeup(); both visually and less tedious to read. 
For example, the following improvements where seen on some will it scale microbenchmarks, on a 48-core Haswell: v4.7 v4.7-rwsem-v1 Hmean signal1-processes-8 5792691.42 ( 0.00%) 5771971.04 ( -0.36%) Hmean signal1-processes-12 6081199.96 ( 0.00%) 6072174.38 ( -0.15%) Hmean signal1-processes-21 3071137.71 ( 0.00%) 3041336.72 ( -0.97%) Hmean signal1-processes-48 3712039.98 ( 0.00%) 3708113.59 ( -0.11%) Hmean signal1-processes-79 4464573.45 ( 0.00%) 4682798.66 ( 4.89%) Hmean signal1-processes-110 4486842.01 ( 0.00%) 4633781.71 ( 3.27%) Hmean signal1-processes-141 4611816.83 ( 0.00%) 4692725.38 ( 1.75%) Hmean signal1-processes-172 4638157.05 ( 0.00%) 4714387.86 ( 1.64%) Hmean signal1-processes-203 4465077.80 ( 0.00%) 4690348.07 ( 5.05%) Hmean signal1-processes-224 4410433.74 ( 0.00%) 4687534.43 ( 6.28%) Stddev signal1-processes-8 6360.47 ( 0.00%) 8455.31 ( 32.94%) Stddev signal1-processes-12 4004.98 ( 0.00%) 9156.13 (128.62%) Stddev signal1-processes-21 3273.14 ( 0.00%) 5016.80 ( 53.27%) Stddev signal1-processes-48 28420.25 ( 0.00%) 26576.22 ( -6.49%) Stddev signal1-processes-79 22038.34 ( 0.00%) 18992.70 (-13.82%) Stddev signal1-processes-110 23226.93 ( 0.00%) 17245.79 (-25.75%) Stddev signal1-processes-141 6358.98 ( 0.00%) 7636.14 ( 20.08%) Stddev signal1-processes-172 9523.70 ( 0.00%) 4824.75 (-49.34%) Stddev signal1-processes-203 13915.33 ( 0.00%) 9326.33 (-32.98%) Stddev signal1-processes-224 15573.94 ( 0.00%) 10613.82 (-31.85%) Other runs that saw improvements include context_switch and pipe; and as expected, this is particularly highlighted on larger thread counts as it becomes more expensive to walk the list twice. No change in wakeup ordering or semantics. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: dave@stgolabs.net Cc: jason.low2@hpe.com Cc: wanpeng.li@hotmail.com Link: http://lkml.kernel.org/r/1470384285-32163-4-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 58 +++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index e02fe3289b5a..2337b4bb2366 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -125,12 +125,14 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { - struct rwsem_waiter *waiter; - struct task_struct *tsk; - struct list_head *next; - long loop, oldcount, woken = 0, adjustment = 0; + struct rwsem_waiter *waiter, *tmp; + long oldcount, woken = 0, adjustment = 0; - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + /* + * Take a peek at the queue head waiter such that we can determine + * the wakeup(s) to perform. + */ + waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { @@ -180,36 +182,21 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, /* * Grant an infinite number of read locks to the readers at the front - * of the queue. Note we increment the 'active part' of the count by - * the number of readers before waking any processes up. + * of the queue. We know that woken will be at least 1 as we accounted + * for above. Note we increment the 'active part' of the count by the + * number of readers before waking any processes up. 
*/ - do { - woken++; + list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + struct task_struct *tsk; - if (waiter->list.next == &sem->wait_list) + if (waiter->type == RWSEM_WAITING_FOR_WRITE) break; - waiter = list_entry(waiter->list.next, - struct rwsem_waiter, list); - - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - - if (adjustment) - atomic_long_add(adjustment, &sem->count); - - next = sem->wait_list.next; - loop = woken; - do { - waiter = list_entry(next, struct rwsem_waiter, list); - next = waiter->list.next; + woken++; tsk = waiter->task; wake_q_add(wake_q, tsk); + list_del(&waiter->list); /* * Ensure that the last operation is setting the reader * waiter to nil such that rwsem_down_read_failed() cannot @@ -217,10 +204,16 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * to the task to wakeup. */ smp_store_release(&waiter->task, NULL); - } while (--loop); + } - sem->wait_list.next = next; - next->prev = &sem->wait_list; + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + } + + if (adjustment) + atomic_long_add(adjustment, &sem->count); } /* @@ -245,7 +238,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); - /* If there are no active locks, wake the front queued process(es). + /* + * If there are no active locks, wake the front queued process(es). * * If there are no writers and we are first in the queue, * wake our own waiter to join the existing active readers ! From bf255bdaada6d497536aadee5406f6ded318978b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:01 -0500 Subject: [PATCH 046/538] x86/dumpstack: Remove show_trace() There are a bewildering array of options for dumping the stack. Simplify things a little by removing show_trace(), which is unused. Signed-off-by: Josh Poimboeuf Reviewed-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/fe02292eac9d409001ec0cf6d06f90ced242570d.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kdebug.h | 2 -- arch/x86/kernel/dumpstack.c | 6 ------ 2 files changed, 8 deletions(-) diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 1ef9d581b5d9..d31881188431 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -24,8 +24,6 @@ enum die_val { extern void printk_address(unsigned long address); extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); -extern void show_trace(struct task_struct *t, struct pt_regs *regs, - unsigned long *sp, unsigned long bp); extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, int all); extern unsigned long oops_begin(void); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 92e8f0a7159c..5f49c043500a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -182,12 +182,6 @@ show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); } -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - void show_stack(struct task_struct *task, unsigned long *sp) { unsigned long bp = 0; From 32541b47bd34940d836fbdf713d16c7ac70d51be Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:02 -0500 Subject: [PATCH 047/538] x86/asm/head: Remove unused init_rsp variable extern There is no init_rsp variable. Remove its extern. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c183bbecd5730d84e8c6aff3824537c1c1bf3591.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/realmode.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index b2988c0ed829..3327ffb38926 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -44,7 +44,6 @@ struct trampoline_header { extern struct real_mode_header *real_mode_header; extern unsigned char real_mode_blob_end[]; -extern unsigned long init_rsp; extern unsigned long initial_code; extern unsigned long initial_gs; From b32f96c75d0dcbb9bf9cc7994e8022c8ce20a668 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:03 -0500 Subject: [PATCH 048/538] x86/asm/head: Rename 'stack_start' -> 'initial_stack' The 'stack_start' variable is similar in usage to 'initial_code' and 'initial_gs': they're all stored in head_64.S and they're all updated by SMP and ACPI suspend before starting a CPU. Rename it to 'initial_stack' to be consistent with the others. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/87063d773a3212051b77e17b0ee427f6582a5050.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/realmode.h | 1 + arch/x86/include/asm/smp.h | 3 --- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/head_32.S | 8 ++++---- arch/x86/kernel/head_64.S | 11 +++++------ arch/x86/kernel/smpboot.c | 2 +- 6 files changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 3327ffb38926..230e1903acf0 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -46,6 +46,7 @@ extern unsigned char real_mode_blob_end[]; extern unsigned long initial_code; extern unsigned long initial_gs; +extern unsigned long initial_stack; extern unsigned char real_mode_blob[]; extern unsigned char real_mode_relocs[]; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ebd0c164cd4e..19980b36f394 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -39,9 +39,6 @@ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid); #endif -/* Static state in head.S used to set up a CPU */ -extern unsigned long stack_start; /* Initial stack pointer address */ - struct task_struct; struct smp_ops { diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index adb3eaf8fe2a..48587335ede8 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -99,7 +99,7 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x12345678; #else /* CONFIG_64BIT */ #ifdef CONFIG_SMP - stack_start = (unsigned long)temp_stack + sizeof(temp_stack); + initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6f8902b0d151..5f401262f12d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -94,7 +94,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) */ __HEAD ENTRY(startup_32) - movl pa(stack_start),%ecx + movl pa(initial_stack),%ecx /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ @@ -286,7 +286,7 @@ num_subarch_entries = (. - subarch_entries) / 4 * start_secondary(). */ ENTRY(start_cpu0) - movl stack_start, %ecx + movl initial_stack, %ecx movl %ecx, %esp jmp *(initial_code) ENDPROC(start_cpu0) @@ -307,7 +307,7 @@ ENTRY(startup_32_smp) movl %eax,%es movl %eax,%fs movl %eax,%gs - movl pa(stack_start),%ecx + movl pa(initial_stack),%ecx movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp @@ -703,7 +703,7 @@ ENTRY(initial_page_table) .data .balign 4 -ENTRY(stack_start) +ENTRY(initial_stack) .long init_thread_union+THREAD_SIZE __INITRODATA diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 9f8efc9f0075..e04814215f15 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -66,7 +66,7 @@ startup_64: */ /* - * Setup stack for verify_cpu(). "-8" because stack_start is defined + * Setup stack for verify_cpu(). "-8" because initial_stack is defined * this way, see below. Our best guess is a NULL ptr for stack * termination heuristics and we don't want to break anything which * might depend on it (kgdb, ...). 
@@ -226,7 +226,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip), %rsp + movq initial_stack(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -310,7 +310,7 @@ ENDPROC(secondary_startup_64) * start_secondary(). */ ENTRY(start_cpu0) - movq stack_start(%rip),%rsp + movq initial_stack(%rip),%rsp movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder pushq $__KERNEL_CS # set correct cs @@ -319,15 +319,14 @@ ENTRY(start_cpu0) ENDPROC(start_cpu0) #endif - /* SMP bootup changes these two */ + /* Both SMP bootup and ACPI suspend change these variables */ __REFDATA .balign 8 GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - - GLOBAL(stack_start) + GLOBAL(initial_stack) .quad init_thread_union+THREAD_SIZE-8 .word 0 __FINITDATA diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4296beb8fdd3..c85d2c636092 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -969,7 +969,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; - stack_start = idle->thread.sp; + initial_stack = idle->thread.sp; /* * Enable the espfix hack for this CPU From 6225f3232a04a54786f817f3648a1f8cc5920272 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:04 -0500 Subject: [PATCH 049/538] x86/dumpstack: Remove extra brackets around "" When starting the dump of an exception stack, it shows "<>" instead of "". print_trace_stack() already adds brackets, no need to add them again. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/77f185fd5b81845869b400aa619415458df6b6cc.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 9ee4520ce83c..daf9f6321856 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -202,7 +202,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, bp = ops->walk_stack(task, stack, bp, ops, data, stack_end, &graph); - ops->stack(data, ""); + ops->stack(data, "EOE"); /* * We link to the next stack via the * second-to-last pointer (index -2 to end) in the From ae952ffdfdf986ecd1452d552a69b82cae7b5e58 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:05 -0500 Subject: [PATCH 050/538] x86/head: Remove useless zeroed word This zeroed word has no apparent purpose, so remove it. Brian Gerst says: "FYI the word used to be the SS segment selector for the LSS instruction, which isn't needed in 64-bit mode." Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b056855c295bbb3825b97c1e9f7958539a4d6cf2.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e04814215f15..c98a559c346e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -328,7 +328,6 @@ ENDPROC(start_cpu0) .quad INIT_PER_CPU_VAR(irq_stack_union) GLOBAL(initial_stack) .quad init_thread_union+THREAD_SIZE-8 - .word 0 __FINITDATA bad_address: From 72b4f6a5e903b071f2a7c4eb1418cbe4eefdc344 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:06 -0500 Subject: [PATCH 051/538] x86/dumpstack: Fix x86_32 kernel_stack_pointer() previous stack access On x86_32, when an interrupt happens from kernel space, SS and SP aren't pushed and the existing stack is used. So pt_regs is effectively two words shorter, and the previous stack pointer is normally the memory after the shortened pt_regs, aka '®s->sp'. But in the rare case where the interrupt hits right after the stack pointer has been changed to point to an empty stack, like for example when call_on_stack() is used, the address immediately after the shortened pt_regs is no longer on the stack. In that case, instead of '®s->sp', the previous stack pointer should be retrieved from the beginning of the current stack page. kernel_stack_pointer() wants to do that, but it forgets to dereference the pointer. So instead of returning a pointer to the previous stack, it returns a pointer to the beginning of the current stack. Note that it's probably outside of kernel_stack_pointer()'s scope to be switching stacks at all. The x86_64 version of this function doesn't do it, and it would be better for the caller to do it if necessary. But that's a patch for another day. This just fixes the original intent. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Fixes: 0788aa6a23cb ("x86: Prepare removal of previous_esp from i386 thread_info structure") Link: http://lkml.kernel.org/r/472453d6e9f6a2d4ab16aaed4935f43117111566.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f79576a541ff..a1606eadd9ce 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -173,8 +173,8 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs) return sp; prev_esp = (u32 *)(context); - if (prev_esp) - return (unsigned long)prev_esp; + if (*prev_esp) + return (unsigned long)*prev_esp; return (unsigned long)regs; } From 8b927d734122f3021c5999aaeffaa2a36ab224c2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:07 -0500 Subject: [PATCH 052/538] proc: Fix return address printk conversion specifer in /proc//stack When printing call return addresses found on a stack, /proc//stack can sometimes give a confusing result. 
If the call instruction was the last instruction in the function (which can happen when calling a noreturn function), '%pS' will incorrectly display the name of the function which happens to be next in the object code, rather than the name of the actual calling function. Use '%pB' instead, which was created for this exact purpose. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/47ad2821e5ebdbed1fbf83fb85424ae4fbdf8b6e.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 54e270262979..e9ff186c723f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -483,7 +483,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { - seq_printf(m, "[<%pK>] %pS\n", + seq_printf(m, "[<%pK>] %pB\n", (void *)entries[i], (void *)entries[i]); } unlock_trace(task); From 4950d6d48a0c43cc61d0bbb76fb10e0214b79c66 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:08 -0500 Subject: [PATCH 053/538] x86/dumpstack: Remove 64-byte gap at end of irq stack There has been a 64-byte gap at the end of the irq stack for at least 12 years. It predates git history, and I can't find any good reason for it. Remove it. What's the worst that could happen? Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/14f9281c5475cc44af95945ea7546bff2e3836db.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/dumpstack_64.c | 9 +++------ arch/x86/kernel/setup_percpu.c | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 809eda03c527..6ef55e83fb8a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1281,7 +1281,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index daf9f6321856..066eb5c77fd6 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -103,9 +103,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack, return (stack >= irq_stack && stack < irq_stack_end); } -static const unsigned long irq_stack_size = - (IRQ_STACK_SIZE - 64) / sizeof(unsigned long); - enum stack_type { STACK_IS_UNKNOWN, STACK_IS_NORMAL, @@ -133,7 +130,7 @@ analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, return STACK_IS_NORMAL; *stack_end = irq_stack; - irq_stack = irq_stack - irq_stack_size; + irq_stack -= (IRQ_STACK_SIZE / sizeof(long)); if (in_irq_stack(stack, irq_stack, *stack_end)) return STACK_IS_IRQ; @@ -256,8 +253,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, preempt_disable(); cpu = smp_processor_id(); - irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); - irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); + irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); + irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); /* * Debugging aid: "show_stack(NULL, NULL);" prints the diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7a40e068302d..d182799c4264 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -246,7 +246,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + - IRQ_STACK_SIZE - 64; + IRQ_STACK_SIZE; #endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = From 70b5b18f716a2d7ab20c2cfaea21919b9fdfb805 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 18 Aug 2016 17:29:40 +0300 Subject: [PATCH 054/538] x86/platform/intel-mid: Run PWRMU command immediately On some firmwares we have to tell how exactly we want the command to be proceeded. The default case, based on the official BSP code, is to run it immediately. This appears to be a safer approach based on the documentation. 
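For illustration only (not part of the patch), a minimal sketch of how the command number and the new command-mode bits compose into a single PM_CMD write; the bit definitions mirror the ones added in the hunk below, and the helper name is invented:

    #include <linux/types.h>

    #define PM_CMD_CMD(x)           ((x) << 0)  /* bits 0-7: command number */
    #define PM_CMD_CM_IMMEDIATE     (1 << 9)    /* bits 9-10: processing mode */

    /* Hypothetical helper: build a PM_CMD word requesting immediate processing. */
    static inline u32 mid_pwr_cmd_word(u8 cmd)
    {
            return PM_CMD_CMD(cmd) | PM_CMD_CM_IMMEDIATE;
    }

With this encoding, writel(mid_pwr_cmd_word(cmd), pwr->regs + PM_CMD) is equivalent to the writel() change in the diff below. 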
Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471530580-94247-1-git-send-email-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/pwr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index c901a3423772..0548741b6894 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -44,6 +44,10 @@ /* Bits in PM_CMD */ #define PM_CMD_CMD(x) ((x) << 0) #define PM_CMD_IOC (1 << 8) +#define PM_CMD_CM_NOP (0 << 9) +#define PM_CMD_CM_IMMEDIATE (1 << 9) +#define PM_CMD_CM_DELAY (2 << 9) +#define PM_CMD_CM_TRIGGER (3 << 9) #define PM_CMD_D3cold (1 << 21) /* List of commands */ @@ -137,7 +141,7 @@ static int mid_pwr_wait(struct mid_pwr *pwr) static int mid_pwr_wait_for_cmd(struct mid_pwr *pwr, u8 cmd) { - writel(PM_CMD_CMD(cmd), pwr->regs + PM_CMD); + writel(PM_CMD_CMD(cmd) | PM_CMD_CM_IMMEDIATE, pwr->regs + PM_CMD); return mid_pwr_wait(pwr); } From bedc1969150d480c462cdac320fa944b694a7162 Mon Sep 17 00:00:00 2001 From: Ding Tianhong Date: Wed, 15 Jun 2016 15:27:36 +0800 Subject: [PATCH 055/538] rcu: Fix soft lockup for rcu_nocb_kthread Carrying out the following steps results in a softlockup in the RCU callback-offload (rcuo) kthreads: 1. Connect to ixgbevf, and set the speed to 10Gb/s. 2. Use ifconfig to bring the nic up and down repeatedly. [ 317.005148] IPv6: ADDRCONF(NETDEV_CHANGE): eth2: link becomes ready [ 368.106005] BUG: soft lockup - CPU#1 stuck for 22s! [rcuos/1:15] [ 368.106005] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 368.106005] task: ffff88057dd8a220 ti: ffff88057dd9c000 task.ti: ffff88057dd9c000 [ 368.106005] RIP: 0010:[] [] fib_table_lookup+0x14/0x390 [ 368.106005] RSP: 0018:ffff88061fc83ce8 EFLAGS: 00000286 [ 368.106005] RAX: 0000000000000001 RBX: 00000000020155c0 RCX: 0000000000000001 [ 368.106005] RDX: ffff88061fc83d50 RSI: ffff88061fc83d70 RDI: ffff880036d11a00 [ 368.106005] RBP: ffff88061fc83d08 R08: 0000000000000001 R09: 0000000000000000 [ 368.106005] R10: ffff880036d11a00 R11: ffffffff819e0900 R12: ffff88061fc83c58 [ 368.106005] R13: ffffffff816154dd R14: ffff88061fc83d08 R15: 00000000020155c0 [ 368.106005] FS: 0000000000000000(0000) GS:ffff88061fc80000(0000) knlGS:0000000000000000 [ 368.106005] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 368.106005] CR2: 00007f8c2aee9c40 CR3: 000000057b222000 CR4: 00000000000407e0 [ 368.106005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 368.106005] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 368.106005] Stack: [ 368.106005] 00000000010000c0 ffff88057b766000 ffff8802e380b000 ffff88057af03e00 [ 368.106005] ffff88061fc83dc0 ffffffff815349a6 ffff88061fc83d40 ffffffff814ee146 [ 368.106005] ffff8802e380af00 00000000e380af00 ffffffff819e0900 020155c0010000c0 [ 368.106005] Call Trace: [ 368.106005] [ 368.106005] [ 368.106005] [] ip_route_input_noref+0x516/0xbd0 [ 368.106005] [] ? skb_release_data+0xd6/0x110 [ 368.106005] [] ? 
kfree_skb+0x3a/0xa0 [ 368.106005] [] ip_rcv_finish+0x29f/0x350 [ 368.106005] [] ip_rcv+0x234/0x380 [ 368.106005] [] __netif_receive_skb_core+0x676/0x870 [ 368.106005] [] __netif_receive_skb+0x18/0x60 [ 368.106005] [] process_backlog+0xae/0x180 [ 368.106005] [] net_rx_action+0x152/0x240 [ 368.106005] [] __do_softirq+0xef/0x280 [ 368.106005] [] call_softirq+0x1c/0x30 [ 368.106005] [ 368.106005] [ 368.106005] [] do_softirq+0x65/0xa0 [ 368.106005] [] local_bh_enable+0x94/0xa0 [ 368.106005] [] rcu_nocb_kthread+0x232/0x370 [ 368.106005] [] ? wake_up_bit+0x30/0x30 [ 368.106005] [] ? rcu_start_gp+0x40/0x40 [ 368.106005] [] kthread+0xcf/0xe0 [ 368.106005] [] ? kthread_create_on_node+0x140/0x140 [ 368.106005] [] ret_from_fork+0x58/0x90 [ 368.106005] [] ? kthread_create_on_node+0x140/0x140 ==================================cut here============================== It turns out that the rcuos callback-offload kthread is busy processing a very large quantity of RCU callbacks, and it is not reliquishing the CPU while doing so. This commit therefore adds an cond_resched_rcu_qs() within the loop to allow other tasks to run. Signed-off-by: Ding Tianhong [ paulmck: Substituted cond_resched_rcu_qs for cond_resched. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0082fce402a0..85c5a883c6e3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2173,6 +2173,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); + cond_resched_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); From 62148f0930a8e9bd5c5614f8387222f0220d7d47 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 2 Aug 2016 08:11:00 -0300 Subject: [PATCH 056/538] [media] cec: rename cec_devnode fhs_lock to just lock This lock will be used to protect more than just the fhs list. So rename it to just 'lock'. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/cec/cec-adap.c | 12 ++++++------ drivers/staging/media/cec/cec-api.c | 8 ++++---- drivers/staging/media/cec/cec-core.c | 6 +++--- include/media/cec.h | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/staging/media/cec/cec-adap.c b/drivers/staging/media/cec/cec-adap.c index b2393bbacb26..9dcb784b8d6a 100644 --- a/drivers/staging/media/cec/cec-adap.c +++ b/drivers/staging/media/cec/cec-adap.c @@ -124,10 +124,10 @@ static void cec_queue_event(struct cec_adapter *adap, u64 ts = ktime_get_ns(); struct cec_fh *fh; - mutex_lock(&adap->devnode.fhs_lock); + mutex_lock(&adap->devnode.lock); list_for_each_entry(fh, &adap->devnode.fhs, list) cec_queue_event_fh(fh, ev, ts); - mutex_unlock(&adap->devnode.fhs_lock); + mutex_unlock(&adap->devnode.lock); } /* @@ -191,12 +191,12 @@ static void cec_queue_msg_monitor(struct cec_adapter *adap, u32 monitor_mode = valid_la ? 
CEC_MODE_MONITOR : CEC_MODE_MONITOR_ALL; - mutex_lock(&adap->devnode.fhs_lock); + mutex_lock(&adap->devnode.lock); list_for_each_entry(fh, &adap->devnode.fhs, list) { if (fh->mode_follower >= monitor_mode) cec_queue_msg_fh(fh, msg); } - mutex_unlock(&adap->devnode.fhs_lock); + mutex_unlock(&adap->devnode.lock); } /* @@ -207,12 +207,12 @@ static void cec_queue_msg_followers(struct cec_adapter *adap, { struct cec_fh *fh; - mutex_lock(&adap->devnode.fhs_lock); + mutex_lock(&adap->devnode.lock); list_for_each_entry(fh, &adap->devnode.fhs, list) { if (fh->mode_follower == CEC_MODE_FOLLOWER) cec_queue_msg_fh(fh, msg); } - mutex_unlock(&adap->devnode.fhs_lock); + mutex_unlock(&adap->devnode.lock); } /* Notify userspace of an adapter state change. */ diff --git a/drivers/staging/media/cec/cec-api.c b/drivers/staging/media/cec/cec-api.c index 7be7615a0fdf..4e2696a34ddb 100644 --- a/drivers/staging/media/cec/cec-api.c +++ b/drivers/staging/media/cec/cec-api.c @@ -508,14 +508,14 @@ static int cec_open(struct inode *inode, struct file *filp) filp->private_data = fh; - mutex_lock(&devnode->fhs_lock); + mutex_lock(&devnode->lock); /* Queue up initial state events */ ev_state.state_change.phys_addr = adap->phys_addr; ev_state.state_change.log_addr_mask = adap->log_addrs.log_addr_mask; cec_queue_event_fh(fh, &ev_state, 0); list_add(&fh->list, &devnode->fhs); - mutex_unlock(&devnode->fhs_lock); + mutex_unlock(&devnode->lock); return 0; } @@ -540,9 +540,9 @@ static int cec_release(struct inode *inode, struct file *filp) cec_monitor_all_cnt_dec(adap); mutex_unlock(&adap->lock); - mutex_lock(&devnode->fhs_lock); + mutex_lock(&devnode->lock); list_del(&fh->list); - mutex_unlock(&devnode->fhs_lock); + mutex_unlock(&devnode->lock); /* Unhook pending transmits from this filehandle. */ mutex_lock(&adap->lock); diff --git a/drivers/staging/media/cec/cec-core.c b/drivers/staging/media/cec/cec-core.c index 112a5fae12f5..73792d078462 100644 --- a/drivers/staging/media/cec/cec-core.c +++ b/drivers/staging/media/cec/cec-core.c @@ -117,7 +117,7 @@ static int __must_check cec_devnode_register(struct cec_devnode *devnode, /* Initialization */ INIT_LIST_HEAD(&devnode->fhs); - mutex_init(&devnode->fhs_lock); + mutex_init(&devnode->lock); /* Part 1: Find a free minor number */ mutex_lock(&cec_devnode_lock); @@ -181,10 +181,10 @@ static void cec_devnode_unregister(struct cec_devnode *devnode) if (!devnode->registered || devnode->unregistered) return; - mutex_lock(&devnode->fhs_lock); + mutex_lock(&devnode->lock); list_for_each_entry(fh, &devnode->fhs, list) wake_up_interruptible(&fh->wait); - mutex_unlock(&devnode->fhs_lock); + mutex_unlock(&devnode->lock); devnode->registered = false; devnode->unregistered = true; diff --git a/include/media/cec.h b/include/media/cec.h index dc7854b855f3..fdb5d600e4bb 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -57,8 +57,8 @@ struct cec_devnode { int minor; bool registered; bool unregistered; - struct mutex fhs_lock; struct list_head fhs; + struct mutex lock; }; struct cec_adapter; From 2ab25d35a91098ef0f42d478cc37f6a5591a4ab0 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 2 Aug 2016 08:13:57 -0300 Subject: [PATCH 057/538] [media] cec: improve locking - The global lock was used in cec_get_device when it should have used the devnode lock. - cec_put_device also took the global lock, but since the release function takes that lock as well this could lead to a deadlock. Just don't take the lock here since there is no reason for it. 
- cec_devnode_register() should take the global lock when clearing the bit in the global bitmap. - In cec_devnode_unregister() place the devnode->(un)register tests and assignments under the devnode lock as well: this has to be in a critical block. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/cec/cec-core.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/staging/media/cec/cec-core.c b/drivers/staging/media/cec/cec-core.c index 73792d078462..3b1e4d2b190d 100644 --- a/drivers/staging/media/cec/cec-core.c +++ b/drivers/staging/media/cec/cec-core.c @@ -51,31 +51,29 @@ int cec_get_device(struct cec_devnode *devnode) { /* * Check if the cec device is available. This needs to be done with - * the cec_devnode_lock held to prevent an open/unregister race: + * the devnode->lock held to prevent an open/unregister race: * without the lock, the device could be unregistered and freed between * the devnode->registered check and get_device() calls, leading to * a crash. */ - mutex_lock(&cec_devnode_lock); + mutex_lock(&devnode->lock); /* * return ENXIO if the cec device has been removed * already or if it is not registered anymore. */ if (!devnode->registered) { - mutex_unlock(&cec_devnode_lock); + mutex_unlock(&devnode->lock); return -ENXIO; } /* and increase the device refcount */ get_device(&devnode->dev); - mutex_unlock(&cec_devnode_lock); + mutex_unlock(&devnode->lock); return 0; } void cec_put_device(struct cec_devnode *devnode) { - mutex_lock(&cec_devnode_lock); put_device(&devnode->dev); - mutex_unlock(&cec_devnode_lock); } /* Called when the last user of the cec device exits. */ @@ -84,11 +82,10 @@ static void cec_devnode_release(struct device *cd) struct cec_devnode *devnode = to_cec_devnode(cd); mutex_lock(&cec_devnode_lock); - /* Mark device node number as free */ clear_bit(devnode->minor, cec_devnode_nums); - mutex_unlock(&cec_devnode_lock); + cec_delete_adapter(to_cec_adapter(devnode)); } @@ -160,7 +157,9 @@ static int __must_check cec_devnode_register(struct cec_devnode *devnode, cdev_del: cdev_del(&devnode->cdev); clr_bit: + mutex_lock(&cec_devnode_lock); clear_bit(devnode->minor, cec_devnode_nums); + mutex_unlock(&cec_devnode_lock); return ret; } @@ -177,17 +176,21 @@ static void cec_devnode_unregister(struct cec_devnode *devnode) { struct cec_fh *fh; + mutex_lock(&devnode->lock); + /* Check if devnode was never registered or already unregistered */ - if (!devnode->registered || devnode->unregistered) + if (!devnode->registered || devnode->unregistered) { + mutex_unlock(&devnode->lock); return; + } - mutex_lock(&devnode->lock); list_for_each_entry(fh, &devnode->fhs, list) wake_up_interruptible(&fh->wait); - mutex_unlock(&devnode->lock); devnode->registered = false; devnode->unregistered = true; + mutex_unlock(&devnode->lock); + device_del(&devnode->dev); cdev_del(&devnode->cdev); put_device(&devnode->dev); From 9ebf1945d757433a089ab3ee940673503e3e11ec Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 1 Aug 2016 07:29:34 -0300 Subject: [PATCH 058/538] [media] cec-funcs.h: fix typo: && should be & Fix typo where logical AND was used instead of bitwise AND. 
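To see why the distinction matters, a small standalone C program (not from the patch) comparing the two operators on an example channel number:

    #include <stdio.h>

    int main(void)
    {
            unsigned int major = 0x1234;    /* example channel number */

            /* Logical AND reduces both operands to a truth value: prints 1 */
            printf("major && 0xff = %u\n", major && 0xff);

            /* Bitwise AND keeps only the low byte, as intended: prints 0x34 */
            printf("major & 0xff  = 0x%02x\n", major & 0xff);

            return 0;
    }

The buggy expressions therefore stored 0 or 1 instead of the low byte of the channel number. 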
Reported-by: David Binderman Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/cec-funcs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/cec-funcs.h b/include/linux/cec-funcs.h index 82c3d3b7269d..9e054aa168f3 100644 --- a/include/linux/cec-funcs.h +++ b/include/linux/cec-funcs.h @@ -227,7 +227,7 @@ static inline void cec_set_digital_service_id(__u8 *msg, if (digital->service_id_method == CEC_OP_SERVICE_ID_METHOD_BY_CHANNEL) { *msg++ = (digital->channel.channel_number_fmt << 2) | (digital->channel.major >> 8); - *msg++ = digital->channel.major && 0xff; + *msg++ = digital->channel.major & 0xff; *msg++ = digital->channel.minor >> 8; *msg++ = digital->channel.minor & 0xff; *msg++ = 0; @@ -1277,7 +1277,7 @@ static inline void cec_msg_user_control_pressed(struct cec_msg *msg, msg->len += 4; msg->msg[3] = (ui_cmd->channel_identifier.channel_number_fmt << 2) | (ui_cmd->channel_identifier.major >> 8); - msg->msg[4] = ui_cmd->channel_identifier.major && 0xff; + msg->msg[4] = ui_cmd->channel_identifier.major & 0xff; msg->msg[5] = ui_cmd->channel_identifier.minor >> 8; msg->msg[6] = ui_cmd->channel_identifier.minor & 0xff; break; From 31dc8b7302f1e48952ec8e90cd49dca843146cd0 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 10 Aug 2016 08:01:38 -0300 Subject: [PATCH 059/538] [media] cec-funcs.h: add reply argument for Record On/Off A reply parameter is added to the cec_msg_record_on/off functions in cec-funcs.h. The standard mandates that Record Status shall be replied to Record On, and it may be replied to Record Off. Signed-off-by: Johan Fjeldtvedt Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/cec-funcs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/cec-funcs.h b/include/linux/cec-funcs.h index 9e054aa168f3..8af613e67633 100644 --- a/include/linux/cec-funcs.h +++ b/include/linux/cec-funcs.h @@ -162,10 +162,11 @@ static inline void cec_msg_standby(struct cec_msg *msg) /* One Touch Record Feature */ -static inline void cec_msg_record_off(struct cec_msg *msg) +static inline void cec_msg_record_off(struct cec_msg *msg, bool reply) { msg->len = 2; msg->msg[1] = CEC_MSG_RECORD_OFF; + msg->reply = reply ? CEC_MSG_RECORD_STATUS : 0; } struct cec_op_arib_data { @@ -323,6 +324,7 @@ static inline void cec_msg_record_on_phys_addr(struct cec_msg *msg, } static inline void cec_msg_record_on(struct cec_msg *msg, + bool reply, const struct cec_op_record_src *rec_src) { switch (rec_src->type) { @@ -346,6 +348,7 @@ static inline void cec_msg_record_on(struct cec_msg *msg, rec_src->ext_phys_addr.phys_addr); break; } + msg->reply = reply ? CEC_MSG_RECORD_STATUS : 0; } static inline void cec_ops_record_on(const struct cec_msg *msg, From e1ef69217f68b8407245e9e353cf88cc2f9ebc18 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 20 Jun 2016 07:51:22 +0900 Subject: [PATCH 060/538] rcutorture: Remove outdated config option description CONFIG_RCU_TORTURE_TEST_RUNNABLE was removed by commit 4e9a073f60367 ("torture: Remove CONFIG_RCU_TORTURE_TEST_RUNNABLE, simplify code"), but the documentation was not updated accordingly. This commit therefore updates the documentation to reflect CONFIG_RCU_TORTURE_TEST_RUNNABLE's removal and to add a description for the alternative module parameter. Signed-off-by: SeongJae Park Signed-off-by: Paul E. 
McKenney --- Documentation/RCU/torture.txt | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 118e7c176ce7..278f6a9383b6 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -10,21 +10,6 @@ status messages via printk(), which can be examined via the dmesg command (perhaps grepping for "torture"). The test is started when the module is loaded, and stops when the module is unloaded. -CONFIG_RCU_TORTURE_TEST_RUNNABLE - -It is also possible to specify CONFIG_RCU_TORTURE_TEST=y, which will -result in the tests being loaded into the base kernel. In this case, -the CONFIG_RCU_TORTURE_TEST_RUNNABLE config option is used to specify -whether the RCU torture tests are to be started immediately during -boot or whether the /proc/sys/kernel/rcutorture_runnable file is used -to enable them. This /proc file can be used to repeatedly pause and -restart the tests, regardless of the initial state specified by the -CONFIG_RCU_TORTURE_TEST_RUNNABLE config option. - -You will normally -not- want to start the RCU torture tests during boot -(and thus the default is CONFIG_RCU_TORTURE_TEST_RUNNABLE=n), but doing -this can sometimes be useful in finding boot-time bugs. - MODULE PARAMETERS From ed2bec07fd1aa47f1c06be92c164c13c70fb7a45 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Aug 2016 21:15:15 -0700 Subject: [PATCH 061/538] documentation: Record reason for rcu_head two-byte alignment There is an assertion in __call_rcu() that checks only the bottom bit of the rcu_head pointer, rather than the bottom two (as might be expected for 32-bit systems) or the bottom three (as might be expected for 64-bit systems). This choice might be a bit surprising in these days of ubiquitous 32-bit and 64-bit systems. This commit therefore records the reason for this odd alignment check, namely that m68k guarantees only two-byte alignment despite being a 32-bit architectures. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index ece410f40436..a4d3838130e4 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2493,6 +2493,28 @@

Memory Efficiency

variant of call_rcu() that might one day be created for energy-efficiency purposes. +

+That said, there are limits. +RCU requires that the rcu_head structure be aligned to a +two-byte boundary, and passing a misaligned rcu_head +structure to one of the call_rcu() family of functions +will result in a splat. +It is therefore necessary to exercise caution when packing +structures containing fields of type rcu_head. +Why not a four-byte or even eight-byte alignment requirement? +Because the m68k architecture provides only two-byte alignment, +and thus acts as alignment's least common denominator. + +

+The reason for reserving the bottom bit of pointers to +rcu_head structures is to leave the door open to +“lazy” callbacks whose invocations can safely be deferred. +Deferring invocation could potentially have energy-efficiency +benefits, but only if the rate of non-lazy callbacks decreases +significantly for some important workload. +In the meantime, reserving the bottom bit keeps this option open +in case it one day becomes useful. +

Performance, Scalability, Response Time, and Reliability

From 277f963cea4ec87144c6713377322fe3bf172a5e Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 10 Aug 2016 06:00:53 -0300 Subject: [PATCH 062/538] [media] cec: improve dqevent documentation The documentation for the cec_event_state_change struct was incomplete. This patch documents what happens in the corner cases. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/cec/cec-ioc-dqevent.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/media/uapi/cec/cec-ioc-dqevent.rst b/Documentation/media/uapi/cec/cec-ioc-dqevent.rst index 7a6d6d00ce19..2e1e73928396 100644 --- a/Documentation/media/uapi/cec/cec-ioc-dqevent.rst +++ b/Documentation/media/uapi/cec/cec-ioc-dqevent.rst @@ -64,7 +64,8 @@ it is guaranteed that the state did change in between the two events. - ``phys_addr`` - - The current physical address. + - The current physical address. This is ``CEC_PHYS_ADDR_INVALID`` if no + valid physical address is set. - .. row 2 @@ -72,7 +73,10 @@ it is guaranteed that the state did change in between the two events. - ``log_addr_mask`` - - The current set of claimed logical addresses. + - The current set of claimed logical addresses. This is 0 if no logical + addresses are claimed or if ``phys_addr`` is ``CEC_PHYS_ADDR_INVALID``. + If bit 15 is set (``1 << CEC_LOG_ADDR_UNREGISTERED``) then this device + has the unregistered logical address. In that case all other bits are 0. From f7b8eb847e35b18d3ec333774691a905bf16017f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Jun 2016 11:30:32 -0700 Subject: [PATCH 063/538] rcu: Consolidate expedited grace period machinery The functions synchronize_rcu_expedited() and synchronize_sched_expedited() have nearly identical code. This commit therefore consolidates this code into a new _synchronize_rcu_expedited() function. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 62 ++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6d86ab6ec2c9..1549f456fb7b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -516,6 +516,33 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_wake_mutex); } +/* + * Given an rcu_state pointer and a smp_call_function() handler, kick + * off the specified flavor of expedited grace period. + */ +static void _synchronize_rcu_expedited(struct rcu_state *rsp, + smp_call_func_t func) +{ + unsigned long s; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(rsp->call); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, func); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -534,29 +561,13 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) */ void synchronize_sched_expedited(void) { - unsigned long s; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ if (rcu_blocking_is_gp()) return; - /* If expedited grace periods are prohibited, fall back to normal. 
*/ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_sched); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); + _synchronize_rcu_expedited(rsp, sync_sched_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -620,23 +631,8 @@ static void sync_rcu_exp_handler(void *info) void synchronize_rcu_expedited(void) { struct rcu_state *rsp = rcu_state_p; - unsigned long s; - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - - /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ - rcu_exp_wait_wake(rsp, s); + _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); From 8b355e3bc1408be238ae4695fb6318ae502cae8e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jun 2016 13:46:25 -0700 Subject: [PATCH 064/538] rcu: Drive expedited grace periods from workqueue The current implementation of expedited grace periods has the user task drive the grace period. This works, but has downsides: (1) The user task must awaken tasks piggybacking on this grace period, which can result in latencies rivaling that of the grace period itself, and (2) User tasks can receive signals, which interfere with RCU CPU stall warnings. This commit therefore uses workqueues to drive the grace periods, so that the user task need not do the awakening. A subsequent commit will remove the now-unnecessary code allowing for signals. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_exp.h | 48 +++++++++++++++++++++++++++++++++++------ kernel/rcu/tree_trace.c | 7 +++--- 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f714f873bf9d..e99a5234d9ed 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -400,6 +400,7 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + atomic_long_t exp_workdone0; /* # done by workqueue. */ atomic_long_t exp_workdone1; /* # done by others #1. */ atomic_long_t exp_workdone2; /* # done by others #2. */ atomic_long_t exp_workdone3; /* # done by others #3. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1549f456fb7b..97f5ffe42b58 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -500,7 +500,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) * next GP, to proceed. */ mutex_lock(&rsp->exp_wake_mutex); - mutex_unlock(&rsp->exp_mutex); rcu_for_each_node_breadth_first(rsp, rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { @@ -516,6 +515,29 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_wake_mutex); } +/* Let the workqueue handler know what it is supposed to do. 
*/ +struct rcu_exp_work { + smp_call_func_t rew_func; + struct rcu_state *rew_rsp; + unsigned long rew_s; + struct work_struct rew_work; +}; + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct work_struct *wp) +{ + struct rcu_exp_work *rewp; + + /* Initialize the rcu_node tree in preparation for the wait. */ + rewp = container_of(wp, struct rcu_exp_work, rew_work); + sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s); +} + /* * Given an rcu_state pointer and a smp_call_function() handler, kick * off the specified flavor of expedited grace period. @@ -523,6 +545,9 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) static void _synchronize_rcu_expedited(struct rcu_state *rsp, smp_call_func_t func) { + struct rcu_data *rdp; + struct rcu_exp_work rew; + struct rcu_node *rnp; unsigned long s; /* If expedited grace periods are prohibited, fall back to normal. */ @@ -536,11 +561,22 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, func); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); + /* Marshall arguments and schedule the expedited grace period. */ + rew.rew_func = func; + rew.rew_rsp = rsp; + rew.rew_s = s; + INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); + schedule_work(&rew.rew_work); + + /* Wait for expedited grace period to complete. */ + rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + rnp = rcu_get_root(rsp); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone0, s)); + + /* Let the next expedited grace period start. */ + mutex_unlock(&rsp->exp_mutex); } /** diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 86782f9a4604..b1f28972872c 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,16 +185,17 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s1 = 0, s2 = 0, s3 = 0; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->exp_workdone0); s1 += atomic_long_read(&rdp->exp_workdone1); s2 += atomic_long_read(&rdp->exp_workdone2); s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s1, s2, s3, + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); From 908d2c1fd156d414008e1b7e1fb5a7716e013231 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jun 2016 14:34:59 -0700 Subject: [PATCH 065/538] rcu: Stop disabling expedited RCU CPU stall warnings Now that RCU expedited grace periods are always driven by a workqueue, there is no need to account for signal reception, and thus no need to disable expedited RCU CPU stall warnings due to signal reception. This commit therefore removes the signal-reception checks, leaving a WARN_ON() to catch possible future bugs. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_exp.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 97f5ffe42b58..3a647eb96f23 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -427,12 +427,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_stall); if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; - if (ret < 0) { - /* Hit a signal, disable CPU stall warnings. */ - swait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root)); - return; - } + WARN_ON(ret < 0); /* workqueues should not be signaled. */ pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); ndetected = 0; From 24a6cff286030b98149ff10b968cba31280fcb7a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jun 2016 14:49:29 -0700 Subject: [PATCH 066/538] rcu: Make expedited RCU CPU stall warnings respond to controls The expedited RCU CPU stall warnings currently responds to neither the panic_on_rcu_stall sysctl setting nor the rcupdate.rcu_cpu_stall_suppress kernel boot parameter. This commit therefore updates the expedited code to respond to these two controls. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3a647eb96f23..f316683b18f1 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -428,6 +428,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; WARN_ON(ret < 0); /* workqueues should not be signaled. */ + if (rcu_cpu_stall_suppress) + continue; + panic_on_rcu_stall(); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); ndetected = 0; From 98834b83785e1388fa8672cf4f8de09974d15e86 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jun 2016 17:04:19 -0700 Subject: [PATCH 067/538] rcu: Exclude RCU-offline CPUs from expedited grace periods The expedited RCU grace periods currently rely on a failure indication from smp_call_function_single() to determine that a given CPU is offline. This works after a fashion, but is more contorted and less precise than relying on RCU's internal state. This commit therefore takes a first step towards relying on internal state. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f316683b18f1..3bc4b3dda801 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -359,7 +359,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + !(atomic_add_return(0, &rdtp->dynticks) & 0x1) || + !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; From 385c859f678e8ee6b0b122086f34e72a0e861cef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jun 2016 12:16:11 -0700 Subject: [PATCH 068/538] rcu: Use RCU's online-CPU state for expedited IPI retry This commit improves the accuracy of the interaction between CPU hotplug operations and RCU's expedited grace periods by using RCU's online-CPU state to determine when failed IPIs should be retried. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_exp.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3bc4b3dda801..24343eb87b58 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -385,17 +385,16 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, mask_ofl_ipi &= ~mask; continue; } - /* Failed, raced with offline. */ + /* Failed, raced with CPU hotplug operation. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && + if ((rnp->qsmaskinitnext & mask) && (rnp->expmask & mask)) { + /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + goto retry_ipi; } + /* CPU really is offline, so we can ignore it. */ if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); From 94d44776737266eccafee32b985fe31fd5e021ca Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 22 Jun 2016 17:19:27 +0800 Subject: [PATCH 069/538] rcu: Use rcu_gp_kthread_wake() to wake up grace period kthreads Commit abedf8e2419f ("rcu: Use simple wait queues where possible in rcutree") converts Tree RCU's wait queues to simple wait queues, but it incorrectly reverts the commit 2aa792e6faf1 ("rcu: Use rcu_gp_kthread_wake() to wake up grace period kthreads"). This can result in redundant self-wakeups. This commit therefore replaces the simple wait-queue wakeups with rcu_gp_kthread_wake(), thus avoiding the redundant wakeups. Signed-off-by: Jisheng Zhang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..cc1779a7ec5f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2344,7 +2344,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* @@ -2970,7 +2970,7 @@ static void force_quiescent_state(struct rcu_state *rsp) } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); - swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* From 379d9ecb3cc9d5d043216185904c00e54c736a96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jun 2016 10:37:20 -0700 Subject: [PATCH 070/538] sched: Make wake_up_nohz_cpu() handle CPUs going offline Both timers and hrtimers are maintained on the outgoing CPU until CPU_DEAD time, at which point they are migrated to a surviving CPU. 
If a mod_timer() executes between CPU_DYING and CPU_DEAD time, x86 systems will splat in native_smp_send_reschedule() when attempting to wake up the just-now-offlined CPU, as shown below from a NO_HZ_FULL kernel: [ 7976.741556] WARNING: CPU: 0 PID: 661 at /home/paulmck/public_git/linux-rcu/arch/x86/kernel/smp.c:125 native_smp_send_reschedule+0x39/0x40 [ 7976.741595] Modules linked in: [ 7976.741595] CPU: 0 PID: 661 Comm: rcu_torture_rea Not tainted 4.7.0-rc2+ #1 [ 7976.741595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 [ 7976.741595] 0000000000000000 ffff88000002fcc8 ffffffff8138ab2e 0000000000000000 [ 7976.741595] 0000000000000000 ffff88000002fd08 ffffffff8105cabc 0000007d1fd0ee18 [ 7976.741595] 0000000000000001 ffff88001fd16d40 ffff88001fd0ee00 ffff88001fd0ee00 [ 7976.741595] Call Trace: [ 7976.741595] [] dump_stack+0x67/0x99 [ 7976.741595] [] __warn+0xcc/0xf0 [ 7976.741595] [] warn_slowpath_null+0x18/0x20 [ 7976.741595] [] native_smp_send_reschedule+0x39/0x40 [ 7976.741595] [] wake_up_nohz_cpu+0x82/0x190 [ 7976.741595] [] internal_add_timer+0x7a/0x80 [ 7976.741595] [] mod_timer+0x187/0x2b0 [ 7976.741595] [] rcu_torture_reader+0x33d/0x380 [ 7976.741595] [] ? sched_torture_read_unlock+0x30/0x30 [ 7976.741595] [] ? rcu_bh_torture_read_lock+0x80/0x80 [ 7976.741595] [] kthread+0xdf/0x100 [ 7976.741595] [] ret_from_fork+0x1f/0x40 [ 7976.741595] [] ? kthread_create_on_node+0x200/0x200 However, in this case, the wakeup is redundant, because the timer migration will reprogram timer hardware as needed. Note that the fact that preemption is disabled does not avoid the splat, as the offline operation has already passed both the synchronize_sched() and the stop_machine() that would be blocked by disabled preemption. This commit therefore modifies wake_up_nohz_cpu() to avoid attempting to wake up offline CPUs. It also adds a comment stating that the caller must tolerate lost wakeups when the target CPU is going offline, and suggesting the CPU_DEAD notifier as a recovery mechanism. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Thomas Gleixner --- kernel/sched/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c883fe8e440..2a18856f00ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -580,6 +580,8 @@ static bool wake_up_full_nohz_cpu(int cpu) * If needed we can still optimize that later with an * empty IRQ. */ + if (cpu_is_offline(cpu)) + return true; /* Don't try to wake offline CPUs. */ if (tick_nohz_full_cpu(cpu)) { if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) @@ -590,6 +592,11 @@ static bool wake_up_full_nohz_cpu(int cpu) return false; } +/* + * Wake up the specified CPU. If the CPU is going offline, it is the + * caller's responsibility to deal with the lost wakeup, for example, + * by hooking into the CPU_DEAD notifier like timers and hrtimers do. + */ void wake_up_nohz_cpu(int cpu) { if (!wake_up_full_nohz_cpu(cpu)) From e77b7041258e11ba198951553d3acf1e371a9053 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 15 Jul 2016 12:19:41 -0400 Subject: [PATCH 071/538] rcu: Don't use modular infrastructure in non-modular code The Kconfig currently controlling compilation of tree.c is: init/Kconfig:config TREE_RCU init/Kconfig: bool ...and update.c and sync.c are "obj-y" meaning that none are ever built as a module by anyone. Since MODULE_ALIAS is a no-op for non-modular code, we can remove them from these files. 
We leave moduleparam.h behind since the files instantiate some boot time configuration parameters with module_param() still. Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 -- kernel/rcu/update.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cc1779a7ec5f..e83446062f65 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -60,7 +59,6 @@ #include "tree.h" #include "rcu.h" -MODULE_ALIAS("rcutree"); #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f0d8322bc3ec..f19271dce0a9 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include @@ -54,7 +54,6 @@ #include "rcu.h" -MODULE_ALIAS("rcupdate"); #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif From 3563a438f124cb0b8cfd350c86de2f26c63d8837 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jul 2016 09:39:11 -0700 Subject: [PATCH 072/538] rcu: Avoid redundant quiescent-state chasing Currently, __note_gp_changes() checks to see if the CPU has slept through multiple grace periods. If it has, it resynchronizes that CPU's view of the grace-period state, which includes whether or not the current grace period needs a quiescent state from this CPU. The fact of this need (or lack thereof) needs to be in two places, rdp->cpu_no_qs.b.norm and rdp->core_needs_qs. The former tells RCU's context-switch code to go get a quiescent state and the latter says that it needs to be reported. The current code unconditionally sets the former to true, but correctly sets the latter. This does not result in failures, but it does unnecessarily increase the amount of work done on average at context-switch time. This commit therefore correctly sets both fields. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e83446062f65..733902c33dd2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1846,6 +1846,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { bool ret; + bool need_gp; /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && @@ -1872,9 +1873,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->cpu_no_qs.b.norm = true; + need_gp = !!(rnp->qsmask & rdp->grpmask); + rdp->cpu_no_qs.b.norm = need_gp; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); + rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); } From 7ec99de36f402618ae44147ac7fa9a07e4757a5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jun 2016 13:58:26 -0700 Subject: [PATCH 073/538] rcu: Provide exact CPU-online tracking for RCU Up to now, RCU has assumed that the CPU-online process makes it from CPU_UP_PREPARE to set_cpu_online() within one jiffy. Given the recent rise of virtualized environments, this assumption is very clearly obsolete. 
Failing to meet this deadline can result in RCU paying attention to an incoming CPU for one jiffy, then ignoring it until the grace period following the one in which that CPU sets itself online. This situation might prove to be fatally disappointing to any RCU read-side critical sections that had the misfortune to execute during the time in which RCU was ignoring the slow-to-come-online CPU. This commit therefore updates RCU's internal CPU state-tracking information at notify_cpu_starting() time, thus providing RCU with an exact transition of the CPU's state from offline to online. Note that this means that incoming CPUs must not use RCU read-side critical section (other than those of SRCU) until notify_cpu_starting() time. Note also that the CPU_STARTING notifiers -are- allowed to use RCU read-side critical sections. (Of course, CPU-hotplug notifiers are rapidly becoming obsolete, so you need to act fast!) If a given architecture or CPU family needs to use RCU read-side critical sections earlier, the call to rcu_cpu_starting() from notify_cpu_starting() will need to be architecture-specific, with architectures that need early use being required to hand-place the call to rcu_cpu_starting() at some point preceding the call to notify_cpu_starting(). Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 1 + kernel/cpu.c | 1 + kernel/rcu/tree.c | 32 +++++++++++++++++++++++++++++--- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1aa62e1a761b..321f9ed552a9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -334,6 +334,7 @@ void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); void rcu_report_dead(unsigned int cpu); +void rcu_cpu_starting(unsigned int cpu); #ifndef CONFIG_TINY_RCU void rcu_end_inkernel_boot(void); diff --git a/kernel/cpu.c b/kernel/cpu.c index 341bf80f80bd..9482ceb928e0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -889,6 +889,7 @@ void notify_cpu_starting(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + rcu_cpu_starting(cpu); /* All CPU_STARTING notifiers can use RCU. */ while (st->state < target) { struct cpuhp_step *step; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..d2973fb85e8c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3792,8 +3792,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rnp->qsmaskinitnext |= mask; - rnp->expmaskinitnext |= mask; if (!rdp->beenonline) WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); rdp->beenonline = true; /* We have now been online. */ @@ -3860,6 +3858,32 @@ int rcutree_dead_cpu(unsigned int cpu) return 0; } +/* + * Mark the specified CPU as being online so that subsequent grace periods + * (both expedited and normal) will wait on it. Note that this means that + * incoming CPUs are not allowed to use RCU read-side critical sections + * until this function is called. Failing to observe this restriction + * will result in lockdep splats. 
+ */ +void rcu_cpu_starting(unsigned int cpu) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->qsmaskinitnext |= mask; + rnp->expmaskinitnext |= mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + #ifdef CONFIG_HOTPLUG_CPU /* * The CPU is exiting the idle loop into the arch_cpu_idle_dead() @@ -4209,8 +4233,10 @@ void __init rcu_init(void) * or the scheduler are operational. */ pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); + rcu_cpu_starting(cpu); + } } #include "tree_exp.h" From 0c6d4576c45736f829dc3390ac95181b2ed21bc7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 17 Aug 2016 14:21:04 +0200 Subject: [PATCH 074/538] cpu/hotplug: Get rid of CPU_STARTING reference CPU_STARTING is scheduled for removal. There is no use of it in drivers and core code uses it only for compatibility with old-style CPU-hotplug notifiers. This patch removes therefore removes CPU_STARTING from an RCU-related comment. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 9482ceb928e0..ff8bc3817dde 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -889,7 +889,7 @@ void notify_cpu_starting(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - rcu_cpu_starting(cpu); /* All CPU_STARTING notifiers can use RCU. */ + rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { struct cpuhp_step *step; From 0ffd374b2207a1a0cba9f2dbcc799198482391d5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:22 +0200 Subject: [PATCH 075/538] rcutorture: Convert to hotplug state machine Install the callbacks via the state machine and let the core invoke the callbacks on the already online CPUs. Cc: Josh Triplett Cc: "Paul E. McKenney" Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 52 +++++++++++------------------------------ 1 file changed, 14 insertions(+), 38 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e2b138063..dc9814860645 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1362,12 +1362,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) onoff_interval, onoff_holdoff); } -static void rcutorture_booster_cleanup(int cpu) +static int rcutorture_booster_cleanup(unsigned int cpu) { struct task_struct *t; if (boost_tasks[cpu] == NULL) - return; + return 0; mutex_lock(&boost_mutex); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; @@ -1375,9 +1375,10 @@ static void rcutorture_booster_cleanup(int cpu) /* This must be outside of the mutex, otherwise deadlock! 
*/ torture_stop_kthread(rcu_torture_boost, t); + return 0; } -static int rcutorture_booster_init(int cpu) +static int rcutorture_booster_init(unsigned int cpu) { int retval; @@ -1577,28 +1578,7 @@ static void rcu_torture_barrier_cleanup(void) } } -static int rcutorture_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - (void)rcutorture_booster_init(cpu); - break; - case CPU_DOWN_PREPARE: - rcutorture_booster_cleanup(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block rcutorture_cpu_nb = { - .notifier_call = rcutorture_cpu_notify, -}; +static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) @@ -1638,11 +1618,8 @@ rcu_torture_cleanup(void) for (i = 0; i < ncbflooders; i++) torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - unregister_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) - rcutorture_booster_cleanup(i); - } + test_boost == 2) + cpuhp_remove_state(rcutor_hp); /* * Wait for all RCU callbacks to fire, then do flavor-specific @@ -1869,14 +1846,13 @@ rcu_torture_init(void) test_boost == 2) { boost_starttime = jiffies + test_boost_interval * HZ; - register_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) { - if (cpu_is_offline(i)) - continue; /* Heuristic: CPU can go offline. */ - firsterr = rcutorture_booster_init(i); - if (firsterr) - goto unwind; - } + + firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE", + rcutorture_booster_init, + rcutorture_booster_cleanup); + if (firsterr < 0) + goto unwind; + rcutor_hp = firsterr; } firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) From dcceb1eaf210096831b14471bc87678375b086ed Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 10 Aug 2016 09:24:45 -0300 Subject: [PATCH 076/538] [media] cec: add CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK flag Currently if none of the requested logical addresses can be claimed, the framework will fall back to the Unregistered logical address. Add a flag to enable this explicitly. By default it will just go back to the unconfigured state. Usually Unregistered is not something you want since the functionality is very limited. Unless the application has support for this, it will fail to work correctly. So require that the application explicitly requests this. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../uapi/cec/cec-ioc-adap-g-log-addrs.rst | 21 ++++++++++++++++++- drivers/staging/media/cec/cec-adap.c | 4 ++++ drivers/staging/media/cec/cec-api.c | 2 +- include/linux/cec.h | 5 ++++- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst b/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst index 04ee90099676..201d4839931c 100644 --- a/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst +++ b/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst @@ -144,7 +144,7 @@ logical address types are already defined will return with error ``EBUSY``. - ``flags`` - - Flags. No flags are defined yet, so set this to 0. + - Flags. See :ref:`cec-log-addrs-flags` for a list of available flags. - .. row 7 @@ -201,6 +201,25 @@ logical address types are already defined will return with error ``EBUSY``. 
give the CEC framework more information about the device type, even though the framework won't use it directly in the CEC message. +.. _cec-log-addrs-flags: + +.. flat-table:: Flags for struct cec_log_addrs + :header-rows: 0 + :stub-columns: 0 + :widths: 3 1 4 + + + - .. _`CEC-LOG-ADDRS-FL-ALLOW-UNREG-FALLBACK`: + + - ``CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK`` + + - 1 + + - By default if no logical address of the requested type can be claimed, then + it will go back to the unconfigured state. If this flag is set, then it will + fallback to the Unregistered logical address. Note that if the Unregistered + logical address was explicitly requested, then this flag has no effect. + .. _cec-versions: .. flat-table:: CEC Versions diff --git a/drivers/staging/media/cec/cec-adap.c b/drivers/staging/media/cec/cec-adap.c index 9dcb784b8d6a..2458a6c87642 100644 --- a/drivers/staging/media/cec/cec-adap.c +++ b/drivers/staging/media/cec/cec-adap.c @@ -1047,6 +1047,10 @@ static int cec_config_thread_func(void *arg) dprintk(1, "could not claim LA %d\n", i); } + if (adap->log_addrs.log_addr_mask == 0 && + !(las->flags & CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK)) + goto unconfigure; + configured: if (adap->log_addrs.log_addr_mask == 0) { /* Fall back to unregistered */ diff --git a/drivers/staging/media/cec/cec-api.c b/drivers/staging/media/cec/cec-api.c index 4e2696a34ddb..6f58ee85eea4 100644 --- a/drivers/staging/media/cec/cec-api.c +++ b/drivers/staging/media/cec/cec-api.c @@ -162,7 +162,7 @@ static long cec_adap_s_log_addrs(struct cec_adapter *adap, struct cec_fh *fh, return -ENOTTY; if (copy_from_user(&log_addrs, parg, sizeof(log_addrs))) return -EFAULT; - log_addrs.flags = 0; + log_addrs.flags &= CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK; mutex_lock(&adap->lock); if (!adap->is_configuring && (!log_addrs.num_log_addrs || !adap->is_configured) && diff --git a/include/linux/cec.h b/include/linux/cec.h index b3e22893a002..851968e803fa 100644 --- a/include/linux/cec.h +++ b/include/linux/cec.h @@ -364,7 +364,7 @@ struct cec_caps { * @num_log_addrs: how many logical addresses should be claimed. Set by the * caller. * @vendor_id: the vendor ID of the device. Set by the caller. - * @flags: set to 0. + * @flags: flags. * @osd_name: the OSD name of the device. Set by the caller. * @primary_device_type: the primary device type for each logical address. * Set by the caller. @@ -389,6 +389,9 @@ struct cec_log_addrs { __u8 features[CEC_MAX_LOG_ADDRS][12]; }; +/* Allow a fallback to unregistered */ +#define CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK (1 << 0) + /* Events */ /* Event that occurs when the adapter state changes */ From 0c1d61b0e4ed68d125b21fed375c38b6e3c2a658 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sun, 14 Aug 2016 08:27:09 -0300 Subject: [PATCH 077/538] [media] cec: set unclaimed addresses to CEC_LOG_ADDR_INVALID Up to 4 logical addresses can be claimed. Make sure that any unclaimed logical addresses are set to CEC_LOG_ADDR_INVALID as per the documentation. Take special care in the unregistered case: when falling back to unregistered num_log_addrs may be > 1, so mark those as invalid. 
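As a rough userspace sketch of what this guarantees (not part of this
patch; it assumes an open /dev/cecX file descriptor fd, the <linux/cec.h>,
<sys/ioctl.h> and <stdio.h> headers, and omits error handling), a caller
of CEC_ADAP_G_LOG_ADDRS can now simply skip unclaimed slots:

	struct cec_log_addrs laddrs = {};
	unsigned int i;

	if (ioctl(fd, CEC_ADAP_G_LOG_ADDRS, &laddrs) == 0) {
		for (i = 0; i < laddrs.num_log_addrs; i++) {
			if (laddrs.log_addr[i] == CEC_LOG_ADDR_INVALID)
				continue;	/* this slot was not claimed */
			printf("claimed logical address %u\n",
			       laddrs.log_addr[i]);
		}
	}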
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/cec/cec-adap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/staging/media/cec/cec-adap.c b/drivers/staging/media/cec/cec-adap.c index 2458a6c87642..6cc7d7904446 100644 --- a/drivers/staging/media/cec/cec-adap.c +++ b/drivers/staging/media/cec/cec-adap.c @@ -1056,6 +1056,8 @@ static int cec_config_thread_func(void *arg) /* Fall back to unregistered */ las->log_addr[0] = CEC_LOG_ADDR_UNREGISTERED; las->log_addr_mask = 1 << las->log_addr[0]; + for (i = 1; i < las->num_log_addrs; i++) + las->log_addr[i] = CEC_LOG_ADDR_INVALID; } adap->is_configured = true; adap->is_configuring = false; @@ -1074,6 +1076,8 @@ static int cec_config_thread_func(void *arg) cec_report_features(adap, i); cec_report_phys_addr(adap, i); } + for (i = las->num_log_addrs; i < CEC_MAX_LOG_ADDRS; i++) + las->log_addr[i] = CEC_LOG_ADDR_INVALID; mutex_lock(&adap->lock); adap->kthread_config = NULL; mutex_unlock(&adap->lock); From 260ff1144a9dd1afb85cf5da462672d68412cbc4 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 19 Jul 2016 08:44:32 -0300 Subject: [PATCH 078/538] [media] cec: add item to TODO Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/cec/TODO | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/media/cec/TODO b/drivers/staging/media/cec/TODO index a10d4f82b954..13224694a8ae 100644 --- a/drivers/staging/media/cec/TODO +++ b/drivers/staging/media/cec/TODO @@ -12,6 +12,7 @@ Hopefully this will happen later in 2016. Other TODOs: +- There are two possible replies to CEC_MSG_INITIATE_ARC. How to handle that? - Add a flag to inhibit passing CEC RC messages to the rc subsystem. Applications should be able to choose this when calling S_LOG_ADDRS. - If the reply field of cec_msg is set then when the reply arrives it From 31257c3c8b7307f106d67345755d937cb5fb8bd4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 18 Jun 2016 07:45:43 -0700 Subject: [PATCH 079/538] torture: Convert torture_shutdown() to hrtimer Upcoming changes to the timer wheel introduce significant inaccuracy and possibly also an ultimate limit on timeout duration. This is a problem for the current implementation of torture_shutdown() because (1) shutdown times are user-specified, and can therefore be quite long, and (2) the torture scripting will kill a test instance that runs for more than a few minutes longer than scheduled. This commit therefore converts the torture_shutdown() timed waits to an hrtimer, thus avoiding too-short torture test runs as well as death by scripting. Signed-off-by: Paul E. McKenney Acked-by: Arnd Bergmann --- kernel/torture.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/kernel/torture.c b/kernel/torture.c index 75961b3decfe..0d887eb62856 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -446,9 +447,8 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); * Variables for auto-shutdown. This allows "lights out" torture runs * to be fully scripted. */ -static int shutdown_secs; /* desired test duration in seconds. */ static struct task_struct *shutdown_task; -static unsigned long shutdown_time; /* jiffies to system shutdown. */ +static ktime_t shutdown_time; /* time to system shutdown. 
*/ static void (*torture_shutdown_hook)(void); /* @@ -471,20 +471,20 @@ EXPORT_SYMBOL_GPL(torture_shutdown_absorb); */ static int torture_shutdown(void *arg) { - long delta; - unsigned long jiffies_snap; + ktime_t ktime_snap; VERBOSE_TOROUT_STRING("torture_shutdown task started"); - jiffies_snap = jiffies; - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + ktime_snap = ktime_get(); + while (ktime_before(ktime_snap, shutdown_time) && !torture_must_stop()) { - delta = shutdown_time - jiffies_snap; if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = jiffies; + "torture_shutdown task: %llu ms remaining\n", + torture_type, + ktime_ms_delta(shutdown_time, ktime_snap)); + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&shutdown_time, HRTIMER_MODE_ABS); + ktime_snap = ktime_get(); } if (torture_must_stop()) { torture_kthread_stopping("torture_shutdown"); @@ -511,10 +511,9 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) { int ret = 0; - shutdown_secs = ssecs; torture_shutdown_hook = cleanup; - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; + if (ssecs > 0) { + shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); ret = torture_create_kthread(torture_shutdown, NULL, shutdown_task); } From 3e92d8b238e48dfb539e8112bb2cc463e35e1b71 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 12 Aug 2016 13:32:07 -0300 Subject: [PATCH 080/538] [media] cec: ignore messages when log_addr_mask == 0 Most CEC adapters will still receive broadcast messages, even if no logical addresses are claimed. But those messages should only be passed on for monitoring purposes, but not for processing by either kernel or userspace if userspace didn't call CEC_ADAP_S_LOG_ADDRS first. So if adap->log_addrs.log_addr_mask is 0, then just return before passing the received message on to the processing code. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/cec/cec-adap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/staging/media/cec/cec-adap.c b/drivers/staging/media/cec/cec-adap.c index 6cc7d7904446..e980ac9c9279 100644 --- a/drivers/staging/media/cec/cec-adap.c +++ b/drivers/staging/media/cec/cec-adap.c @@ -851,6 +851,9 @@ void cec_received_msg(struct cec_adapter *adap, struct cec_msg *msg) if (!valid_la || msg->len <= 1) return; + if (adap->log_addrs.log_addr_mask == 0) + return; + /* * Process the message on the protocol level. 
If is_reply is true, * then cec_receive_notify() won't pass on the reply to the listener(s) From 73b14977549e4e1214413e7da2d0e97a9947bf8d Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sun, 14 Aug 2016 06:45:54 -0300 Subject: [PATCH 081/538] [media] mtk-vcodec: add HAS_DMA dependency This fixes this kbuild test robot error: tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master head: 329f4152911c276b074bec75a0443f88821afdb7 commit: c1023ba74fc77dc56dc317bd98f5060aab889ac1 [media] drivers/media/platform/Kconfig: fix VIDEO_MEDIATEK_VCODEC dependency config: m32r-allyesconfig (attached as .config) compiler: m32r-linux-gcc (GCC) 4.9.0 reproduce: wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross git checkout c1023ba74fc77dc56dc317bd98f5060aab889ac1 # save the attached .config to linux build tree make.cross ARCH=m32r All errors (new ones prefixed by >>): drivers/media/v4l2-core/videobuf2-dma-contig.c: In function 'vb2_dc_get_userptr': >> >> drivers/media/v4l2-core/videobuf2-dma-contig.c:486:2: error: implicit declaration of function 'dma_get_cache_alignment' [-Werror=implicit-function-declaration] unsigned long dma_align = dma_get_cache_alignment(); ^ cc1: some warnings being treated as errors This driver depends on HAS_DMA for dma_get_cache_alignment(). Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig index f25344bc7912..552b635cfce7 100644 --- a/drivers/media/platform/Kconfig +++ b/drivers/media/platform/Kconfig @@ -169,7 +169,7 @@ config VIDEO_MEDIATEK_VPU config VIDEO_MEDIATEK_VCODEC tristate "Mediatek Video Codec driver" depends on MTK_IOMMU || COMPILE_TEST - depends on VIDEO_DEV && VIDEO_V4L2 + depends on VIDEO_DEV && VIDEO_V4L2 && HAS_DMA depends on ARCH_MEDIATEK || COMPILE_TEST select VIDEOBUF2_DMA_CONTIG select V4L2_MEM2MEM_DEV From 4ffa66992476c94d8b4d33b2c792d336a400ada2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jun 2016 11:56:38 -0700 Subject: [PATCH 082/538] torture: Add task state to writer-task stall printk()s This commit adds a dump of the scheduler state for stalled rcutorture writer tasks. This addition provides yet more debug for the intermittent "failures to proceed", where grace periods move ahead but the rcutorture writer tasks fail to do so. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e2b138063..f0f32f888ec5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1238,6 +1238,7 @@ rcu_torture_stats_print(void) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; static unsigned long rtcv_snap = ULONG_MAX; + struct task_struct *wtp; for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { @@ -1312,10 +1313,12 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", + wtp = READ_ONCE(writer_task); + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", rcu_torture_writer_state_getname(), rcu_torture_writer_state, - gpnum, completed, flags); + gpnum, completed, flags, + wtp == NULL ? 
~0UL : wtp->state); show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } From 1e6e97541ab51b65019bd823506af81ebb3730fc Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 12 Aug 2016 06:44:27 -0300 Subject: [PATCH 083/538] [media] pulse8-cec: set correct Signal Free Time Don't hardcode the signal free time to 3 bit periods, instead use the value for the signal free time as passed in by the CEC framework. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/pulse8-cec/pulse8-cec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/media/pulse8-cec/pulse8-cec.c b/drivers/staging/media/pulse8-cec/pulse8-cec.c index 94f8590492dc..28f853c80b19 100644 --- a/drivers/staging/media/pulse8-cec/pulse8-cec.c +++ b/drivers/staging/media/pulse8-cec/pulse8-cec.c @@ -388,7 +388,7 @@ static int pulse8_cec_adap_transmit(struct cec_adapter *adap, u8 attempts, int err; cmd[0] = MSGCODE_TRANSMIT_IDLETIME; - cmd[1] = 3; + cmd[1] = signal_free_time; err = pulse8_send_and_wait(pulse8, cmd, 2, MSGCODE_COMMAND_ACCEPTED, 1); cmd[0] = MSGCODE_TRANSMIT_ACK_POLARITY; From 472213a675e21185416101a77102253f93713fa9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 13 Aug 2016 15:54:35 +0900 Subject: [PATCH 084/538] rcutorture: Print out barrier error as document says Tests for rcu_barrier() were introduced by commit fae4b54f28f0 ("rcu: Introduce rcutorture testing for rcu_barrier()"). This commit updated the documentation to say that the "rtbe" field in rcutorture's dmesg output indicates test failure. However, the code was not updated, only the documentation. This commit therefore updates the code to match the updated documentation. Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f0f32f888ec5..ac29017623e5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1259,8 +1259,9 @@ rcu_torture_stats_print(void) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_barrier_error, n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror); pr_cont("rtbf: %ld rtb: %ld nt: %ld ", From 31f58e31dc0e170e117a83584103921269b7581b Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 12 Aug 2016 06:46:06 -0300 Subject: [PATCH 085/538] [media] pulse8-cec: fix error handling Support more error codes and fix a bug where MSGCODE_TRANSMIT_FAILED_LINE was mapped to CEC_TX_STATUS_ARB_LOST, which is wrong. Thanks to Pulse-Eight for providing me with the information needed to handle this correctly (I hope). 
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/pulse8-cec/pulse8-cec.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/staging/media/pulse8-cec/pulse8-cec.c b/drivers/staging/media/pulse8-cec/pulse8-cec.c index 28f853c80b19..ed8bd95ad6d0 100644 --- a/drivers/staging/media/pulse8-cec/pulse8-cec.c +++ b/drivers/staging/media/pulse8-cec/pulse8-cec.c @@ -114,14 +114,11 @@ static void pulse8_irq_work_handler(struct work_struct *work) cec_transmit_done(pulse8->adap, CEC_TX_STATUS_OK, 0, 0, 0, 0); break; - case MSGCODE_TRANSMIT_FAILED_LINE: - cec_transmit_done(pulse8->adap, CEC_TX_STATUS_ARB_LOST, - 1, 0, 0, 0); - break; case MSGCODE_TRANSMIT_FAILED_ACK: cec_transmit_done(pulse8->adap, CEC_TX_STATUS_NACK, 0, 1, 0, 0); break; + case MSGCODE_TRANSMIT_FAILED_LINE: case MSGCODE_TRANSMIT_FAILED_TIMEOUT_DATA: case MSGCODE_TRANSMIT_FAILED_TIMEOUT_LINE: cec_transmit_done(pulse8->adap, CEC_TX_STATUS_ERROR, @@ -170,6 +167,9 @@ static irqreturn_t pulse8_interrupt(struct serio *serio, unsigned char data, case MSGCODE_TRANSMIT_FAILED_TIMEOUT_LINE: schedule_work(&pulse8->work); break; + case MSGCODE_HIGH_ERROR: + case MSGCODE_LOW_ERROR: + case MSGCODE_RECEIVE_FAILED: case MSGCODE_TIMEOUT_ERROR: break; case MSGCODE_COMMAND_ACCEPTED: From 8ac6a1a53e9f195e8c4336a7edfba2e102fc14bb Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 18 Aug 2016 04:13:42 -0300 Subject: [PATCH 086/538] [media] cec-edid: check for IEEE identifier The cec_get_edid_spa_location() function did not verify that the IEEE identifier in the Vendor Specific Data Block matched the HDMI-LLC identifier. This could result in the wrong VSDB block being returned. For example, for HDMI 2.0 EDIDs there is also a HDMI Forum VSDB. So check the IEEE identifier as well. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec-edid.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/media/cec-edid.c b/drivers/media/cec-edid.c index 70018247bdda..5719b991e340 100644 --- a/drivers/media/cec-edid.c +++ b/drivers/media/cec-edid.c @@ -70,7 +70,10 @@ static unsigned int cec_get_edid_spa_location(const u8 *edid, unsigned int size) u8 tag = edid[i] >> 5; u8 len = edid[i] & 0x1f; - if (tag == 3 && len >= 5 && i + len <= end) + if (tag == 3 && len >= 5 && i + len <= end && + edid[i + 1] == 0x03 && + edid[i + 2] == 0x0c && + edid[i + 3] == 0x00) return i + 4; i += len + 1; } while (i < end); From 4808f721627c2a23b5d749f9bbd20d4529ea2b8d Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sat, 20 Aug 2016 07:54:38 -0300 Subject: [PATCH 087/538] [media] cec-funcs.h: add missing vendor-specific messages The cec-funcs.h header was missing support for these three vendor-specific messages: CEC_MSG_VENDOR_COMMAND CEC_MSG_VENDOR_COMMAND_WITH_ID CEC_MSG_VENDOR_REMOTE_BUTTON_DOWN Add wrappers for these messages. I originally postponed adding these wrappers due to the fact that the argument is just a byte array which cec-ctl couldn't handle at the time, and then I just forgot to add them once the CEC framework was finalized. It wasn't until an attempt to transmit a vendor specific command was made that I realized that these wrappers were missing. 
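A short usage sketch of the new wrappers (illustrative only; the payload,
the vendor ID and the initiator/destination logical addresses below are
arbitrary examples, not taken from this patch):

	static const __u8 vendor_cmd[] = { 0x01, 0x02, 0x03 };
	struct cec_msg msg;

	cec_msg_init(&msg, CEC_LOG_ADDR_PLAYBACK_1, CEC_LOG_ADDR_TV);
	cec_msg_vendor_command_with_id(&msg, 0x123456 /* arbitrary vendor ID */,
				       sizeof(vendor_cmd), vendor_cmd);
	/* msg can now be queued with the CEC_TRANSMIT ioctl from userspace
	 * or cec_transmit_msg() from a kernel driver. */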
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/cec-funcs.h | 69 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/include/linux/cec-funcs.h b/include/linux/cec-funcs.h index 8af613e67633..138bbf721e70 100644 --- a/include/linux/cec-funcs.h +++ b/include/linux/cec-funcs.h @@ -1144,6 +1144,75 @@ static inline void cec_msg_give_device_vendor_id(struct cec_msg *msg, msg->reply = reply ? CEC_MSG_DEVICE_VENDOR_ID : 0; } +static inline void cec_msg_vendor_command(struct cec_msg *msg, + __u8 size, const __u8 *vendor_cmd) +{ + if (size > 14) + size = 14; + msg->len = 2 + size; + msg->msg[1] = CEC_MSG_VENDOR_COMMAND; + memcpy(msg->msg + 2, vendor_cmd, size); +} + +static inline void cec_ops_vendor_command(const struct cec_msg *msg, + __u8 *size, + const __u8 **vendor_cmd) +{ + *size = msg->len - 2; + + if (*size > 14) + *size = 14; + *vendor_cmd = msg->msg + 2; +} + +static inline void cec_msg_vendor_command_with_id(struct cec_msg *msg, + __u32 vendor_id, __u8 size, + const __u8 *vendor_cmd) +{ + if (size > 11) + size = 11; + msg->len = 5 + size; + msg->msg[1] = CEC_MSG_VENDOR_COMMAND_WITH_ID; + msg->msg[2] = vendor_id >> 16; + msg->msg[3] = (vendor_id >> 8) & 0xff; + msg->msg[4] = vendor_id & 0xff; + memcpy(msg->msg + 5, vendor_cmd, size); +} + +static inline void cec_ops_vendor_command_with_id(const struct cec_msg *msg, + __u32 *vendor_id, __u8 *size, + const __u8 **vendor_cmd) +{ + *size = msg->len - 5; + + if (*size > 11) + *size = 11; + *vendor_id = (msg->msg[2] << 16) | (msg->msg[3] << 8) | msg->msg[4]; + *vendor_cmd = msg->msg + 5; +} + +static inline void cec_msg_vendor_remote_button_down(struct cec_msg *msg, + __u8 size, + const __u8 *rc_code) +{ + if (size > 14) + size = 14; + msg->len = 2 + size; + msg->msg[1] = CEC_MSG_VENDOR_REMOTE_BUTTON_DOWN; + memcpy(msg->msg + 2, rc_code, size); +} + +static inline void cec_ops_vendor_remote_button_down(const struct cec_msg *msg, + __u8 *size, + const __u8 **rc_code) +{ + *size = msg->len - 2; + + if (*size > 14) + *size = 14; + *rc_code = msg->msg + 2; +} + static inline void cec_msg_vendor_remote_button_up(struct cec_msg *msg) { msg->len = 2; From a56fefa2605cf8e125ef09451487f30336128028 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 21 Aug 2016 16:54:39 +0900 Subject: [PATCH 088/538] rcuperf: Consistently insert space between flag and message A few rcuperf dmesg output messages have no space between the flag and the start of the message. In contrast, every other messages consistently supplies a single space. This difference makes rcuperf dmesg output hard to read and to mechanically parse. This commit therefore fixes this problem by modifying a pr_alert() call and PERFOUT_STRING() macro function to provide that single space. Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d38ab08a3fe7..123ccbd22449 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -52,7 +52,7 @@ MODULE_AUTHOR("Paul E. 
McKenney "); #define PERF_FLAG "-perf:" #define PERFOUT_STRING(s) \ - pr_alert("%s" PERF_FLAG s "\n", perf_type) + pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) #define VERBOSE_PERFOUT_STRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) #define VERBOSE_PERFOUT_ERRSTRING(s) \ @@ -400,9 +400,8 @@ rcu_perf_writer(void *arg) sp.sched_priority = 0; sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); - pr_alert("%s" PERF_FLAG - "rcu_perf_writer %ld has %d measurements\n", - perf_type, me, MIN_MEAS); + pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", + perf_type, PERF_FLAG, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= nrealwriters) { schedule_timeout_interruptible(10); From 489bb3d252d41392ce52590e49f0ae8782fb016e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 21 Aug 2016 16:54:40 +0900 Subject: [PATCH 089/538] torture: TOROUT_STRING(): Insert a space between flag and message The TOROUT_STRING() macro does not insert a space between the flag and the message. In contrast, other similar torture-test dmesg messages consistently supply a single space character. This difference makes the output hard to read and to mechanically parse. This commit therefore adds a space character between flag and message in TOROUT_STRING() output. Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/torture.h b/include/linux/torture.h index 6685a73736a2..a45702eb3e7b 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -43,7 +43,7 @@ #define TORTURE_FLAG "-torture:" #define TOROUT_STRING(s) \ - pr_alert("%s" TORTURE_FLAG s "\n", torture_type) + pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s) #define VERBOSE_TOROUT_STRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0) #define VERBOSE_TOROUT_ERRSTRING(s) \ From a109893bd3e71912b376a731b27de8c45fded9b3 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 5 Aug 2016 16:55:19 +0200 Subject: [PATCH 090/538] irqchip/mvebu-pic: New driver for Marvell Armada 7K/8K PIC The Marvell Armada 7K/8K integrates a secondary interrupt controller very originally named "PIC". It is connected to the main GIC via a PPI. Amongst other things, this PIC is used for the ARM PMU. This commit adds a simple irqchip driver for this interrupt controller. Since this interrupt controller is not needed early at boot time, we make the driver a proper platform driver rather than use the IRQCHIP_DECLARE() mechanism. 
Signed-off-by: Yehuda Yitschak Signed-off-by: Thomas Petazzoni Link: https://lkml.kernel.org/r/1470408921-447-3-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper --- drivers/irqchip/Kconfig | 3 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-mvebu-pic.c | 197 ++++++++++++++++++++++++++++++++ 3 files changed, 201 insertions(+) create mode 100644 drivers/irqchip/irq-mvebu-pic.c diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 7f8728984f44..5c08cdb510d0 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -251,6 +251,9 @@ config IRQ_MXS config MVEBU_ODMI bool +config MVEBU_PIC + bool + config LS_SCFG_MSI def_bool y if SOC_LS1021A || ARCH_LAYERSCAPE depends on PCI && PCI_MSI diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 4c203b6b8163..93e61bad8142 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_INGENIC_IRQ) += irq-ingenic.o obj-$(CONFIG_IMX_GPCV2) += irq-imx-gpcv2.o obj-$(CONFIG_PIC32_EVIC) += irq-pic32-evic.o obj-$(CONFIG_MVEBU_ODMI) += irq-mvebu-odmi.o +obj-$(CONFIG_MVEBU_PIC) += irq-mvebu-pic.o obj-$(CONFIG_LS_SCFG_MSI) += irq-ls-scfg-msi.o obj-$(CONFIG_EZNPS_GIC) += irq-eznps.o obj-$(CONFIG_ARCH_ASPEED) += irq-aspeed-vic.o diff --git a/drivers/irqchip/irq-mvebu-pic.c b/drivers/irqchip/irq-mvebu-pic.c new file mode 100644 index 000000000000..eec63951129a --- /dev/null +++ b/drivers/irqchip/irq-mvebu-pic.c @@ -0,0 +1,197 @@ +/* + * Copyright (C) 2016 Marvell + * + * Yehuda Yitschak + * Thomas Petazzoni + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PIC_CAUSE 0x0 +#define PIC_MASK 0x4 + +#define PIC_MAX_IRQS 32 +#define PIC_MAX_IRQ_MASK ((1UL << PIC_MAX_IRQS) - 1) + +struct mvebu_pic { + void __iomem *base; + u32 parent_irq; + struct irq_domain *domain; + struct irq_chip irq_chip; +}; + +static void mvebu_pic_reset(struct mvebu_pic *pic) +{ + /* ACK and mask all interrupts */ + writel(0, pic->base + PIC_MASK); + writel(PIC_MAX_IRQ_MASK, pic->base + PIC_CAUSE); +} + +static void mvebu_pic_eoi_irq(struct irq_data *d) +{ + struct mvebu_pic *pic = irq_data_get_irq_chip_data(d); + + writel(1 << d->hwirq, pic->base + PIC_CAUSE); +} + +static void mvebu_pic_mask_irq(struct irq_data *d) +{ + struct mvebu_pic *pic = irq_data_get_irq_chip_data(d); + u32 reg; + + reg = readl(pic->base + PIC_MASK); + reg |= (1 << d->hwirq); + writel(reg, pic->base + PIC_MASK); +} + +static void mvebu_pic_unmask_irq(struct irq_data *d) +{ + struct mvebu_pic *pic = irq_data_get_irq_chip_data(d); + u32 reg; + + reg = readl(pic->base + PIC_MASK); + reg &= ~(1 << d->hwirq); + writel(reg, pic->base + PIC_MASK); +} + +static int mvebu_pic_irq_map(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct mvebu_pic *pic = domain->host_data; + + irq_set_percpu_devid(virq); + irq_set_chip_data(virq, pic); + irq_set_chip_and_handler(virq, &pic->irq_chip, + handle_percpu_devid_irq); + irq_set_status_flags(virq, IRQ_LEVEL); + irq_set_probe(virq); + + return 0; +} + +static const struct irq_domain_ops mvebu_pic_domain_ops = { + .map = mvebu_pic_irq_map, + .xlate = irq_domain_xlate_onecell, +}; + +static void mvebu_pic_handle_cascade_irq(struct irq_desc *desc) +{ + struct mvebu_pic *pic = 
irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned long irqmap, irqn; + unsigned int cascade_irq; + + irqmap = readl_relaxed(pic->base + PIC_CAUSE); + chained_irq_enter(chip, desc); + + for_each_set_bit(irqn, &irqmap, BITS_PER_LONG) { + cascade_irq = irq_find_mapping(pic->domain, irqn); + generic_handle_irq(cascade_irq); + } + + chained_irq_exit(chip, desc); +} + +static void mvebu_pic_enable_percpu_irq(void *data) +{ + struct mvebu_pic *pic = data; + + mvebu_pic_reset(pic); + enable_percpu_irq(pic->parent_irq, IRQ_TYPE_NONE); +} + +static void mvebu_pic_disable_percpu_irq(void *data) +{ + struct mvebu_pic *pic = data; + + disable_percpu_irq(pic->parent_irq); +} + +static int mvebu_pic_probe(struct platform_device *pdev) +{ + struct device_node *node = pdev->dev.of_node; + struct mvebu_pic *pic; + struct irq_chip *irq_chip; + struct resource *res; + + pic = devm_kzalloc(&pdev->dev, sizeof(struct mvebu_pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + pic->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pic->base)) + return PTR_ERR(pic->base); + + irq_chip = &pic->irq_chip; + irq_chip->name = dev_name(&pdev->dev); + irq_chip->irq_mask = mvebu_pic_mask_irq; + irq_chip->irq_unmask = mvebu_pic_unmask_irq; + irq_chip->irq_eoi = mvebu_pic_eoi_irq; + + pic->parent_irq = irq_of_parse_and_map(node, 0); + if (pic->parent_irq <= 0) { + dev_err(&pdev->dev, "Failed to parse parent interrupt\n"); + return -EINVAL; + } + + pic->domain = irq_domain_add_linear(node, PIC_MAX_IRQS, + &mvebu_pic_domain_ops, pic); + if (!pic->domain) { + dev_err(&pdev->dev, "Failed to allocate irq domain\n"); + return -ENOMEM; + } + + irq_set_chained_handler(pic->parent_irq, mvebu_pic_handle_cascade_irq); + irq_set_handler_data(pic->parent_irq, pic); + + on_each_cpu(mvebu_pic_enable_percpu_irq, pic, 1); + + platform_set_drvdata(pdev, pic); + + return 0; +} + +static int mvebu_pic_remove(struct platform_device *pdev) +{ + struct mvebu_pic *pic = platform_get_drvdata(pdev); + + on_each_cpu(mvebu_pic_disable_percpu_irq, pic, 1); + irq_domain_remove(pic->domain); + + return 0; +} + +static const struct of_device_id mvebu_pic_of_match[] = { + { .compatible = "marvell,armada-8k-pic", }, + {}, +}; +MODULE_DEVICE_TABLE(of, mvebu_pic_of_match); + +static struct platform_driver mvebu_pic_driver = { + .probe = mvebu_pic_probe, + .remove = mvebu_pic_remove, + .driver = { + .name = "mvebu-pic", + .of_match_table = mvebu_pic_of_match, + }, +}; +module_platform_driver(mvebu_pic_driver); + +MODULE_AUTHOR("Yehuda Yitschak "); +MODULE_AUTHOR("Thomas Petazzoni "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:mvebu_pic"); + From 04208a24b9d2f46f07f4400a4829d5372d0a3661 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 5 Aug 2016 16:55:20 +0200 Subject: [PATCH 091/538] arm64: marvell: enable the Marvell PIC driver This commit makes sure the driver for the Marvell PIC interrupt controller (used on Marvell Armada 7K/8K) is enabled. 
Signed-off-by: Thomas Petazzoni Link: https://lkml.kernel.org/r/1470408921-447-4-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper --- arch/arm64/Kconfig.platforms | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index bb2616b16157..0b988f315c40 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -93,6 +93,7 @@ config ARCH_MVEBU select ARMADA_CP110_SYSCON select ARMADA_37XX_CLK select MVEBU_ODMI + select MVEBU_PIC help This enables support for Marvell EBU familly, including: - Armada 3700 SoC Family From 21118df66c198d6ebb23e6827e2e92ab1e148e78 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 20 Aug 2016 15:26:28 +0000 Subject: [PATCH 092/538] irqchip/jcore-aic: Fix non static symbol warning Fixes the following sparse warning: drivers/irqchip/irq-jcore-aic.c:47:12: warning: symbol 'aic_irq_of_init' was not declared. Should it be static? Signed-off-by: Wei Yongjun Link: https://lkml.kernel.org/r/1471706788-27587-1-git-send-email-weiyj.lk@gmail.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-jcore-aic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-jcore-aic.c b/drivers/irqchip/irq-jcore-aic.c index 5e5e3bb7d3c7..84b01dec277d 100644 --- a/drivers/irqchip/irq-jcore-aic.c +++ b/drivers/irqchip/irq-jcore-aic.c @@ -44,7 +44,8 @@ static void noop(struct irq_data *data) { } -int __init aic_irq_of_init(struct device_node *node, struct device_node *parent) +static int __init aic_irq_of_init(struct device_node *node, + struct device_node *parent) { unsigned min_irq = JCORE_AIC2_MIN_HWIRQ; unsigned dom_sz = JCORE_AIC_MAX_HWIRQ+1; From cae750bae4e488c138eb436175201a60943eb3dc Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 19 Aug 2016 18:11:19 +0100 Subject: [PATCH 093/538] irqchip/mips-gic: Use for_each_set_bit to iterate over IRQs The MIPS GIC driver has previously iterated over bits set in a bitmap representing pending IRQs by calling find_first_bit, clearing that bit then calling find_first_bit again until all bits are clear. If multiple interrupts are pending then this is wasteful, as find_first_bit will have to loop over the whole bitmap from the start. Use the for_each_set_bit macro which performs exactly what we need here instead. It will use find_next_bit and thus only scan over the relevant part of the bitmap, and it makes the intent of the code more clear. 
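For reference, the idiom looks roughly like this (generic sketch only;
NR_SRCS and handle_shared_irq() are placeholder names, not taken from the
GIC driver):

	unsigned long pending[BITS_TO_LONGS(NR_SRCS)];	/* filled from the cause register */
	unsigned int bit;

	/* Visit every pending source exactly once; for_each_set_bit() walks
	 * the bitmap with find_next_bit(), so already-handled bits are not
	 * rescanned and the bitmap itself is left unmodified. */
	for_each_set_bit(bit, pending, NR_SRCS)
		handle_shared_irq(bit);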
Signed-off-by: Paul Burton Link: https://lkml.kernel.org/r/20160819171119.28121-1-paul.burton@imgtec.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-mips-gic.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index c5f33c3bd228..a376fc632263 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -371,18 +371,13 @@ static void gic_handle_shared_int(bool chained) bitmap_and(pending, pending, intrmask, gic_shared_intrs); bitmap_and(pending, pending, pcpu_mask, gic_shared_intrs); - intr = find_first_bit(pending, gic_shared_intrs); - while (intr != gic_shared_intrs) { + for_each_set_bit(intr, pending, gic_shared_intrs) { virq = irq_linear_revmap(gic_irq_domain, GIC_SHARED_TO_HWIRQ(intr)); if (chained) generic_handle_irq(virq); else do_IRQ(virq); - - /* go to next pending bit */ - bitmap_clear(pending, intr, 1); - intr = find_first_bit(pending, gic_shared_intrs); } } From 556b6723689694ac9134bcc36a07828168e057f4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 23 Aug 2016 19:23:56 +0200 Subject: [PATCH 094/538] x86/entry: Remove outdated comment about SYSCALL targets The comment probably meant some old AMD64 incarnation which most likely never saw the light of day. STAR and LSTAR are two different registers and STAR sets CS/SS(DS) selectors for *all* modes, not only 32-bit. So simply remove that comment. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160823172356.15879-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6ef55e83fb8a..e374c19b9ddc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1305,11 +1305,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks /* May not be marked __init: used by software suspend */ void syscall_init(void) { - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to - * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. - */ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); From 384d9fe3741657c8ed8cd9bf30bc1d4611864d56 Mon Sep 17 00:00:00 2001 From: Wei Jiangang Date: Fri, 19 Aug 2016 11:22:36 +0800 Subject: [PATCH 095/538] x86/smpboot: Check APIC ID before setting up default routing This is not a bugfix, but code optimization. If the BSP's APIC ID in local APIC is unexpected, a kernel panic will occur and the system will halt. That means no need to enable APIC mode, and no reason to set up the default routing for APIC. The combination of default_setup_apic_routing() and apic_bsp_setup() are used to enable APIC mode. They two should be kept together, rather than being separated by the codes of checking APIC ID. Just like their usage in APIC_init_uniprocessor(). 
Signed-off-by: Wei Jiangang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Link: http://lkml.kernel.org/r/1471576957-12961-1-git-send-email-weijg.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2a6e84a30a54..8216b997c1c9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1316,14 +1316,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) break; } - default_setup_apic_routing(); - if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", read_apic_id(), boot_cpu_physical_apicid); /* Or can we switch back to PIC here? */ } + default_setup_apic_routing(); cpu0_logical_apicid = apic_bsp_setup(false); pr_info("CPU%d: ", 0); From 5035da41996d346c648a65c1d7a9f6469c7d358a Mon Sep 17 00:00:00 2001 From: Wei Jiangang Date: Fri, 19 Aug 2016 11:22:37 +0800 Subject: [PATCH 096/538] x86/apic: Update comment about disabling processor focus Fix references to discarded end_level_ioapic_irq(). Signed-off-by: Wei Jiangang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Link: http://lkml.kernel.org/r/1471576957-12961-2-git-send-email-weijg.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 63b748444880..1cbae30af51c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1374,7 +1374,6 @@ void setup_local_APIC(void) * Actually disabling the focus CPU check just makes the hang less * frequent as it makes the interrupt distributon model be more * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). --macro */ /* From ba14a194a434ccc8f733e263ad2ce941e35e5787 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Aug 2016 02:35:21 -0700 Subject: [PATCH 097/538] fork: Add generic vmalloced stack support If CONFIG_VMAP_STACK=y is selected, kernel stacks are allocated with __vmalloc_node_range(). Grsecurity has had a similar feature (called GRKERNSEC_KSTACKOVERFLOW=y) for a long time. Signed-off-by: Andy Lutomirski Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/14c07d4fd173a5b117f51e8b939f9f4323e39899.1470907718.git.luto@kernel.org [ Minor edits. ] Signed-off-by: Ingo Molnar --- arch/Kconfig | 34 ++++++++++ arch/ia64/include/asm/thread_info.h | 2 +- include/linux/sched.h | 15 +++++ kernel/fork.c | 96 +++++++++++++++++++++++------ 4 files changed, 126 insertions(+), 21 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index e9c9334507dd..9ecf9f6f9e15 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -707,4 +707,38 @@ config ARCH_NO_COHERENT_DMA_MMAP config CPU_NO_EFFICIENT_FFS def_bool n +config HAVE_ARCH_VMAP_STACK + def_bool n + help + An arch should select this symbol if it can support kernel stacks + in vmalloc space. This means: + + - vmalloc space must be large enough to hold many kernel stacks. + This may rule out many 32-bit architectures. + + - Stacks in vmalloc space need to work reliably. 
For example, if + vmap page tables are created on demand, either this mechanism + needs to work while the stack points to a virtual address with + unpopulated page tables or arch code (switch_to() and switch_mm(), + most likely) needs to ensure that the stack's page table entries + are populated before running on a possibly unpopulated stack. + + - If the stack overflows into a guard page, something reasonable + should happen. The definition of "reasonable" is flexible, but + instantly rebooting without logging anything would be unfriendly. + +config VMAP_STACK + default y + bool "Use a virtually-mapped stack" + depends on HAVE_ARCH_VMAP_STACK && !KASAN + ---help--- + Enable this if you want the use virtually-mapped kernel stacks + with guard pages. This causes kernel stack overflows to be + caught immediately rather than causing difficult-to-diagnose + corruption. + + This is presently incompatible with KASAN because KASAN expects + the stack to map directly to the KASAN shadow map using a formula + that is incorrect if the stack is in vmalloc space. + source "kernel/gcov/Kconfig" diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 29bd59790d6c..c7026429816b 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -56,7 +56,7 @@ struct thread_info { #define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) #define task_thread_info(tsk) ((struct thread_info *) 0) #endif -#define free_thread_stack(ti) /* nothing */ +#define free_thread_stack(tsk) /* nothing */ #define task_stack_page(tsk) ((void *)(tsk)) #define __HAVE_THREAD_FUNCTIONS diff --git a/include/linux/sched.h b/include/linux/sched.h index 62c68e513e39..20f9f47bcfd0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1923,6 +1923,9 @@ struct task_struct { #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; #endif +#ifdef CONFIG_VMAP_STACK + struct vm_struct *stack_vm_area; +#endif /* CPU-specific state of this task */ struct thread_struct thread; /* @@ -1939,6 +1942,18 @@ extern int arch_task_struct_size __read_mostly; # define arch_task_struct_size (sizeof(struct task_struct)) #endif +#ifdef CONFIG_VMAP_STACK +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) +{ + return t->stack_vm_area; +} +#else +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) +{ + return NULL; +} +#endif + /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) diff --git a/kernel/fork.c b/kernel/fork.c index 52e725d4a866..9b85f6b2cdcd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -158,19 +158,39 @@ void __weak arch_release_thread_stack(unsigned long *stack) * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ -# if THREAD_SIZE >= PAGE_SIZE -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, - int node) +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { +#ifdef CONFIG_VMAP_STACK + void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP | __GFP_HIGHMEM, + PAGE_KERNEL, + 0, node, + __builtin_return_address(0)); + + /* + * We can't call find_vm_area() in interrupt context, and + * free_thread_stack() can be called in interrupt context, + * so cache the vm_struct. 
+ */ + if (stack) + tsk->stack_vm_area = find_vm_area(stack); + return stack; +#else struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; +#endif } -static inline void free_thread_stack(unsigned long *stack) +static inline void free_thread_stack(struct task_struct *tsk) { - __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); + if (task_stack_vm_area(tsk)) + vfree(tsk->stack); + else + __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; @@ -181,9 +201,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_stack(unsigned long *stack) +static void free_thread_stack(struct task_struct *tsk) { - kmem_cache_free(thread_stack_cache, stack); + kmem_cache_free(thread_stack_cache, tsk->stack); } void thread_stack_cache_init(void) @@ -213,24 +233,47 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(unsigned long *stack, int account) +static void account_kernel_stack(struct task_struct *tsk, int account) { - /* All stack pages are in the same zone and belong to the same memcg. */ - struct page *first_page = virt_to_page(stack); + void *stack = task_stack_page(tsk); + struct vm_struct *vm = task_stack_vm_area(tsk); + + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + + if (vm) { + int i; + + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_zone_page_state(page_zone(vm->pages[i]), + NR_KERNEL_STACK_KB, + PAGE_SIZE / 1024 * account); + } - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, - THREAD_SIZE / 1024 * account); + /* All stack pages belong to the same memcg. */ + memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } else { + /* + * All stack pages are in the same zone and belong to the + * same memcg. + */ + struct page *first_page = virt_to_page(stack); + + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat( - first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } } void free_task(struct task_struct *tsk) { - account_kernel_stack(tsk->stack, -1); + account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); - free_thread_stack(tsk->stack); + free_thread_stack(tsk); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -342,6 +385,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; unsigned long *stack; + struct vm_struct *stack_vm_area; int err; if (node == NUMA_NO_NODE) @@ -354,11 +398,23 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + stack_vm_area = task_stack_vm_area(tsk); + err = arch_dup_task_struct(tsk, orig); + + /* + * arch_dup_task_struct() clobbers the stack-related fields. Make + * sure they're properly initialized before using any stack-related + * functions again. 
+ */ + tsk->stack = stack; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = stack_vm_area; +#endif + if (err) goto free_stack; - tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -390,14 +446,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(stack, 1); + account_kernel_stack(tsk, 1); kcov_task_init(tsk); return tsk; free_stack: - free_thread_stack(stack); + free_thread_stack(tsk); free_tsk: free_task_struct(tsk); return NULL; From b4a0f533e5976cb1a79f31d6152e1d322d79b7f1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Aug 2016 02:35:22 -0700 Subject: [PATCH 098/538] dma-api: Teach the "DMA-from-stack" check about vmapped stacks If we're using CONFIG_VMAP_STACK=y and we manage to point an sg entry at the stack, then either the sg page will be in highmem or sg_virt() will return the direct-map alias. In neither case will the existing check_for_stack() implementation realize that it's a stack page. Fix it by explicitly checking for stack pages. This has no effect by itself. It's broken out for ease of review. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/448460622731312298bf19dcbacb1606e75de7a9.1470907718.git.luto@kernel.org [ Minor edits. ] Signed-off-by: Ingo Molnar --- lib/dma-debug.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index fcfa1939ac41..06f02f6aecd2 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1164,11 +1165,32 @@ static void check_unmap(struct dma_debug_entry *ref) put_hash_bucket(bucket, &flags); } -static void check_for_stack(struct device *dev, void *addr) +static void check_for_stack(struct device *dev, + struct page *page, size_t offset) { - if (object_is_on_stack(addr)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from " - "stack [addr=%p]\n", addr); + void *addr; + struct vm_struct *stack_vm_area = task_stack_vm_area(current); + + if (!stack_vm_area) { + /* Stack is direct-mapped. */ + if (PageHighMem(page)) + return; + addr = page_address(page) + offset; + if (object_is_on_stack(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr); + } else { + /* Stack is vmalloced. 
*/ + int i; + + for (i = 0; i < stack_vm_area->nr_pages; i++) { + if (page != stack_vm_area->pages[i]) + continue; + + addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr); + break; + } + } } static inline bool overlap(void *addr, unsigned long len, void *start, void *end) @@ -1291,10 +1313,11 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, if (map_single) entry->type = dma_debug_single; + check_for_stack(dev, page, offset); + if (!PageHighMem(page)) { void *addr = page_address(page) + offset; - check_for_stack(dev, addr); check_for_illegal_area(dev, addr, size); } @@ -1386,8 +1409,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->sg_call_ents = nents; entry->sg_mapped_ents = mapped_ents; + check_for_stack(dev, sg_page(s), s->offset); + if (!PageHighMem(sg_page(s))) { - check_for_stack(dev, sg_virt(s)); check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); } From e37e43a497d5a8b7c0cc1736d56986f432c394c9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Aug 2016 02:35:23 -0700 Subject: [PATCH 099/538] x86/mm/64: Enable vmapped stacks (CONFIG_HAVE_ARCH_VMAP_STACK=y) This allows x86_64 kernels to enable vmapped stacks by setting HAVE_ARCH_VMAP_STACK=y - which enables the CONFIG_VMAP_STACK=y high level Kconfig option. There are a couple of interesting bits: First, x86 lazily faults in top-level paging entries for the vmalloc area. This won't work if we get a page fault while trying to access the stack: the CPU will promote it to a double-fault and we'll die. To avoid this problem, probe the new stack when switching stacks and forcibly populate the pgd entry for the stack when switching mms. Second, once we have guard pages around the stack, we'll want to detect and handle stack overflow. I didn't enable it on x86_32. We'd need to rework the double-fault code a bit and I'm concerned about running out of vmalloc virtual addresses under some workloads. This patch, by itself, will behave somewhat erratically when the stack overflows while RSP is still more than a few tens of bytes above the bottom of the stack. Specifically, we'll get #PF and make it to no_context and them oops without reliably triggering a double-fault, and no_context doesn't know about stack overflows. The next patch will improve that case. Thank you to Nadav and Brian for helping me pay enough attention to the SDM to hopefully get this right. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Nadav Amit Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c88f3e2920b18e6cc621d772a04a62c06869037e.1470907718.git.luto@kernel.org [ Minor edits. 
] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/switch_to.h | 28 ++++++++++++++- arch/x86/kernel/traps.c | 61 ++++++++++++++++++++++++++++++++ arch/x86/mm/tlb.c | 15 ++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c580d8c33562..21a6d0ec5983 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -94,6 +94,7 @@ config X86 select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_EBPF_JIT if X86_64 + select HAVE_ARCH_VMAP_STACK if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8f321a1b03a1..14e4b20f0aaf 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -8,6 +8,28 @@ struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); +/* This runs runs on the previous thread's stack. */ +static inline void prepare_switch_to(struct task_struct *prev, + struct task_struct *next) +{ +#ifdef CONFIG_VMAP_STACK + /* + * If we switch to a stack that has a top-level paging entry + * that is not present in the current mm, the resulting #PF will + * will be promoted to a double-fault and we'll panic. Probe + * the new stack now so that vmalloc_fault can fix up the page + * tables if needed. This can only happen if we use a stack + * in vmap space. + * + * We assume that the stack is aligned so that it never spans + * more than one top-level paging entry. + * + * To minimize cache pollution, just follow the stack pointer. + */ + READ_ONCE(*(unsigned char *)next->thread.sp); +#endif +} + #ifdef CONFIG_X86_32 #ifdef CONFIG_CC_STACKPROTECTOR @@ -39,6 +61,8 @@ do { \ */ \ unsigned long ebx, ecx, edx, esi, edi; \ \ + prepare_switch_to(prev, next); \ + \ asm volatile("pushl %%ebp\n\t" /* save EBP */ \ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ @@ -103,7 +127,9 @@ do { \ * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL * has no effect. */ -#define switch_to(prev, next, last) \ +#define switch_to(prev, next, last) \ + prepare_switch_to(prev, next); \ + \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b70ca12dd389..907b4e4aeb5e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) +#ifdef CONFIG_VMAP_STACK +static void __noreturn handle_stack_overflow(const char *message, + struct pt_regs *regs, + unsigned long fault_address) +{ + printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", + (void *)fault_address, current->stack, + (char *)current->stack + THREAD_SIZE - 1); + die(message, regs, 0); + + /* Be absolutely certain we don't return. 
*/ + panic(message); +} +#endif + #ifdef CONFIG_X86_64 /* Runs on IST stack */ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_VMAP_STACK + unsigned long cr2; +#endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; @@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_DF; +#ifdef CONFIG_VMAP_STACK + /* + * If we overflow the stack into a guard page, the CPU will fail + * to deliver #PF and will send #DF instead. Similarly, if we + * take any non-IST exception while too close to the bottom of + * the stack, the processor will get a page fault while + * delivering the exception and will generate a double fault. + * + * According to the SDM (footnote in 6.15 under "Interrupt 14 - + * Page-Fault Exception (#PF): + * + * Processors update CR2 whenever a page fault is detected. If a + * second page fault occurs while an earlier page fault is being + * deliv- ered, the faulting linear address of the second fault will + * overwrite the contents of CR2 (replacing the previous + * address). These updates to CR2 occur even if the page fault + * results in a double fault or occurs during the delivery of a + * double fault. + * + * The logic below has a small possibility of incorrectly diagnosing + * some errors as stack overflows. For example, if the IDT or GDT + * gets corrupted such that #GP delivery fails due to a bad descriptor + * causing #GP and we hit this condition while CR2 coincidentally + * points to the stack guard page, we'll think we overflowed the + * stack. Given that we're going to panic one way or another + * if this happens, this isn't necessarily worth fixing. + * + * If necessary, we could improve the test by only diagnosing + * a stack overflow if the saved RSP points within 47 bytes of + * the bottom of the stack: if RSP == tsk_stack + 48 and we + * take an exception, the stack is already aligned and there + * will be enough room SS, RSP, RFLAGS, CS, RIP, and a + * possible error code, so a stack overflow would *not* double + * fault. With any less space left, exception delivery could + * fail, and, as a practical matter, we've overflowed the + * stack even if the actual trigger for the double fault was + * something else. + */ + cr2 = read_cr2(); + if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) + handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); +#endif + #ifdef CONFIG_DOUBLEFAULT df_debug(regs, error_code); #endif diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4dbe65622810..a7655f6caf7d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, unsigned cpu = smp_processor_id(); if (likely(prev != next)) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. 
+ */ + unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); + + pgd_t *pgd = next->pgd + stack_pgd_index; + + if (unlikely(pgd_none(*pgd))) + set_pgd(pgd, init_mm.pgd[stack_pgd_index]); + } + #ifdef CONFIG_SMP this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); this_cpu_write(cpu_tlbstate.active_mm, next); #endif + cpumask_set_cpu(cpu, mm_cpumask(next)); /* From e4a744ef2fef5c803348b650a3a2d01da7797a9b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:55 -0500 Subject: [PATCH 100/538] ftrace: Remove CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST from config Make HAVE_FUNCTION_GRAPH_FP_TEST a normal define, independent from kconfig. This removes some config file pollution and simplifies the checking for the fp test. Suggested-by: Steven Rostedt Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2c4e5f05054d6d367f702fd153af7a0109dd5c81.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/arm64/kernel/entry-ftrace.S | 2 +- arch/blackfin/kernel/ftrace-entry.S | 4 ++-- arch/sparc/Kconfig | 1 - arch/sparc/include/asm/ftrace.h | 4 ++++ arch/x86/Kconfig | 1 - arch/x86/include/asm/ftrace.h | 1 + kernel/trace/Kconfig | 5 ----- kernel/trace/trace_functions_graph.c | 2 +- 8 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 0f03a8fe2314..aef02d2af3b5 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -219,7 +219,7 @@ ENDPROC(ftrace_graph_caller) * * Run ftrace_return_to_handler() before going back to parent. * @fp is checked against the value passed by ftrace_graph_caller() - * only when CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST is enabled. + * only when HAVE_FUNCTION_GRAPH_FP_TEST is enabled. 
*/ ENTRY(return_to_handler) save_return_regs diff --git a/arch/blackfin/kernel/ftrace-entry.S b/arch/blackfin/kernel/ftrace-entry.S index 28d059540424..3b8bdcbb7da3 100644 --- a/arch/blackfin/kernel/ftrace-entry.S +++ b/arch/blackfin/kernel/ftrace-entry.S @@ -169,7 +169,7 @@ ENTRY(_ftrace_graph_caller) r0 = sp; /* unsigned long *parent */ r1 = [sp]; /* unsigned long self_addr */ # endif -# ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST +# ifdef HAVE_FUNCTION_GRAPH_FP_TEST r2 = fp; /* unsigned long frame_pointer */ # endif r0 += 16; /* skip the 4 local regs on stack */ @@ -190,7 +190,7 @@ ENTRY(_return_to_handler) [--sp] = r1; /* get original return address */ -# ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST +# ifdef HAVE_FUNCTION_GRAPH_FP_TEST r0 = fp; /* Blackfin is sane, so omit this */ # endif call _ftrace_return_to_handler; diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 59b09600dd32..f5d60f14a0bc 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -56,7 +56,6 @@ config SPARC64 def_bool 64BIT select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_KRETPROBES select HAVE_KPROBES select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index 3192a8e42fd6..62755a339a59 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -9,6 +9,10 @@ void _mcount(void); #endif +#endif /* CONFIG_MCOUNT */ + +#if defined(CONFIG_SPARC64) && !defined(CC_USE_FENTRY) +#define HAVE_FUNCTION_GRAPH_FP_TEST #endif #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 21a6d0ec5983..ce8860cccc34 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -111,7 +111,6 @@ config X86 select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index a4820d4df617..37f67cbba1c6 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -6,6 +6,7 @@ # define MCOUNT_ADDR ((unsigned long)(__fentry__)) #else # define MCOUNT_ADDR ((unsigned long)(mcount)) +# define HAVE_FUNCTION_GRAPH_FP_TEST #endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f4b86e8ca1e7..ba3326785ca4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.txt -config HAVE_FUNCTION_GRAPH_FP_TEST - bool - help - See Documentation/trace/ftrace-design.txt - config HAVE_DYNAMIC_FTRACE bool help diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7363ccf79512..fc173cd9fbfd 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -204,7 +204,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, return; } -#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST /* * The arch may choose to record the frame pointer used * and check it here to make sure that it is what we expect it From daa460a88c09b26b68e8b017de589c217e901afb Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:56 -0500 Subject: [PATCH 101/538] ftrace: Only allocate the ret_stack 'fp' field when needed This saves some memory when 
HAVE_FUNCTION_GRAPH_FP_TEST isn't defined. On x86_64 with newer versions of gcc which have -mfentry, it saves 400 bytes per task. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5c7747d9ea7b5cb47ef0a8ce8a6cea6bf7aa94bf.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 ++ kernel/trace/trace_functions_graph.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7d565afe35d2..4ad9ccc60e38 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -795,7 +795,9 @@ struct ftrace_ret_stack { unsigned long func; unsigned long long calltime; unsigned long long subtime; +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST unsigned long fp; +#endif }; /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index fc173cd9fbfd..0e03ed0eac68 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -171,7 +171,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; current->ret_stack[index].subtime = 0; +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST current->ret_stack[index].fp = frame_pointer; +#endif *depth = current->curr_ret_stack; return 0; From 9a7c348ba6a46f6270d4fe49577649dad5664fe7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:57 -0500 Subject: [PATCH 102/538] ftrace: Add return address pointer to ftrace_ret_stack Storing this value will help prevent unwinders from getting out of sync with the function graph tracer ret_stack. Now instead of needing a stateful iterator, they can compare the return address pointer to find the right ret_stack entry. Note that an array of 50 ftrace_ret_stack structs is allocated for every task. So when an arch implements this, it will add either 200 or 400 bytes of memory usage per task (depending on whether it's a 32-bit or 64-bit platform). Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a95cfcc39e8f26b89a430c56926af0bb217bc0a1.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- Documentation/trace/ftrace-design.txt | 11 +++++++++++ arch/arm/kernel/ftrace.c | 2 +- arch/arm64/kernel/ftrace.c | 2 +- arch/blackfin/kernel/ftrace.c | 2 +- arch/microblaze/kernel/ftrace.c | 2 +- arch/mips/kernel/ftrace.c | 4 ++-- arch/parisc/kernel/ftrace.c | 2 +- arch/powerpc/kernel/ftrace.c | 3 ++- arch/s390/kernel/ftrace.c | 3 ++- arch/sh/kernel/ftrace.c | 2 +- arch/sparc/kernel/ftrace.c | 2 +- arch/tile/kernel/ftrace.c | 2 +- arch/x86/kernel/ftrace.c | 2 +- include/linux/ftrace.h | 5 ++++- kernel/trace/trace_functions_graph.c | 5 ++++- 15 files changed, 34 insertions(+), 15 deletions(-) diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index dd5f916b351d..a273dd0bbaaa 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -203,6 +203,17 @@ along to ftrace_push_return_trace() instead of a stub value of 0. Similarly, when you call ftrace_return_to_handler(), pass it the frame pointer. +HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +-------------------------------- + +An arch may pass in a pointer to the return address on the stack. This +prevents potential stack unwinding issues where the unwinder gets out of +sync with ret_stack and the wrong addresses are reported by +ftrace_graph_ret_addr(). + +Adding support for it is easy: just define the macro in asm/ftrace.h and +pass the return address pointer as the 'retp' argument to +ftrace_push_return_trace(). HAVE_FTRACE_NMI_ENTER --------------------- diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 709ee1d6d4df..3f1759411d51 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -218,7 +218,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } err = ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer); + frame_pointer, NULL); if (err == -EBUSY) { *parent = old; return; diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index ebecf9aa33d1..40ad08ac569a 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -138,7 +138,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, return; err = ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer); + frame_pointer, NULL); if (err == -EBUSY) return; else diff --git a/arch/blackfin/kernel/ftrace.c b/arch/blackfin/kernel/ftrace.c index 095de0fa044d..8dad7589b843 100644 --- a/arch/blackfin/kernel/ftrace.c +++ b/arch/blackfin/kernel/ftrace.c @@ -107,7 +107,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, return; if (ftrace_push_return_trace(*parent, self_addr, &trace.depth, - frame_pointer) == -EBUSY) + frame_pointer, NULL) == -EBUSY) return; trace.func = self_addr; diff --git a/arch/microblaze/kernel/ftrace.c b/arch/microblaze/kernel/ftrace.c index fc7b48a52cd5..d57563c58a26 100644 --- a/arch/microblaze/kernel/ftrace.c +++ b/arch/microblaze/kernel/ftrace.c @@ -63,7 +63,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0); + err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0, NULL); if (err == -EBUSY) { *parent = old; return; diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c 
index 937c54bc8ccc..30a3b75e88eb 100644 --- a/arch/mips/kernel/ftrace.c +++ b/arch/mips/kernel/ftrace.c @@ -382,8 +382,8 @@ void prepare_ftrace_return(unsigned long *parent_ra_addr, unsigned long self_ra, if (unlikely(faulted)) goto out; - if (ftrace_push_return_trace(old_parent_ra, self_ra, &trace.depth, fp) - == -EBUSY) { + if (ftrace_push_return_trace(old_parent_ra, self_ra, &trace.depth, fp, + NULL) == -EBUSY) { *parent_ra_addr = old_parent_ra; return; } diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index a828a0adf52c..5a5506a35395 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -48,7 +48,7 @@ static void __hot prepare_ftrace_return(unsigned long *parent, return; if (ftrace_push_return_trace(old, self_addr, &trace.depth, - 0 ) == -EBUSY) + 0, NULL) == -EBUSY) return; /* activate parisc_return_to_handler() as return point */ diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index cc52d9795f88..a95639b8d4ac 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -593,7 +593,8 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) if (!ftrace_graph_entry(&trace)) goto out; - if (ftrace_push_return_trace(parent, ip, &trace.depth, 0) == -EBUSY) + if (ftrace_push_return_trace(parent, ip, &trace.depth, 0, + NULL) == -EBUSY) goto out; parent = return_hooker; diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 0f7bfeba6da6..60a8a4e207ed 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -209,7 +209,8 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) /* Only trace if the calling function expects to. */ if (!ftrace_graph_entry(&trace)) goto out; - if (ftrace_push_return_trace(parent, ip, &trace.depth, 0) == -EBUSY) + if (ftrace_push_return_trace(parent, ip, &trace.depth, 0, + NULL) == -EBUSY) goto out; parent = (unsigned long) return_to_handler; out: diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index 38993e09ef03..95eccd49672f 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -382,7 +382,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0); + err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0, NULL); if (err == -EBUSY) { __raw_writel(old, parent); return; diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c index 0a2d2ddff543..6bcff698069b 100644 --- a/arch/sparc/kernel/ftrace.c +++ b/arch/sparc/kernel/ftrace.c @@ -131,7 +131,7 @@ unsigned long prepare_ftrace_return(unsigned long parent, return parent + 8UL; if (ftrace_push_return_trace(parent, self_addr, &trace.depth, - frame_pointer) == -EBUSY) + frame_pointer, NULL) == -EBUSY) return parent + 8UL; trace.func = self_addr; diff --git a/arch/tile/kernel/ftrace.c b/arch/tile/kernel/ftrace.c index 4a572088b270..b827a418b155 100644 --- a/arch/tile/kernel/ftrace.c +++ b/arch/tile/kernel/ftrace.c @@ -184,7 +184,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, *parent = return_hooker; err = ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer); + frame_pointer, NULL); if (err == -EBUSY) { *parent = old; return; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d036cfb4495d..ae3b1fb2f582 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long 
self_addr, unsigned long *parent, } if (ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer) == -EBUSY) { + frame_pointer, NULL) == -EBUSY) { *parent = old; return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4ad9ccc60e38..483e02a50d37 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -798,6 +798,9 @@ struct ftrace_ret_stack { #ifdef HAVE_FUNCTION_GRAPH_FP_TEST unsigned long fp; #endif +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + unsigned long *retp; +#endif }; /* @@ -809,7 +812,7 @@ extern void return_to_handler(void); extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer); + unsigned long frame_pointer, unsigned long *retp); /* * Sometimes we don't want to trace a function with the function diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0e03ed0eac68..f7212ec643e2 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, /* Add a function return address to the trace stack on thread info.*/ int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer) + unsigned long frame_pointer, unsigned long *retp) { unsigned long long calltime; int index; @@ -173,6 +173,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, current->ret_stack[index].subtime = 0; #ifdef HAVE_FUNCTION_GRAPH_FP_TEST current->ret_stack[index].fp = frame_pointer; +#endif +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + current->ret_stack[index].retp = retp; #endif *depth = current->curr_ret_stack; From 223918e32a87c79ac55ca4aa513ba405ba4d57cd Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:58 -0500 Subject: [PATCH 103/538] ftrace: Add ftrace_graph_ret_addr() stack unwinding helpers When function graph tracing is enabled for a function, ftrace modifies the stack by replacing the original return address with the address of a hook function (return_to_handler). Stack unwinders need a way to get the original return address. Add an arch-independent helper function for that named ftrace_graph_ret_addr(). This adds two variations of the function: one depends on HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, and the other relies on an index state variable. The former is recommended because, in some cases, the latter can cause problems when the unwinder skips stack frames. It can get out of sync with the ret_stack index and wrong addresses can be reported for the stack trace. Once all arches have been ported to use HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, we can get rid of the distinction. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/36bd90f762fc5e5af3929e3797a68a64906421cf.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 10 +++++ kernel/trace/trace_functions_graph.c | 58 ++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 483e02a50d37..6f93ac46e7f0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -814,6 +814,9 @@ extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long frame_pointer, unsigned long *retp); +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp); + /* * Sometimes we don't want to trace a function with the function * graph tracer but we want them to keep traced by the usual function @@ -875,6 +878,13 @@ static inline int task_curr_ret_stack(struct task_struct *tsk) return -1; } +static inline unsigned long +ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, + unsigned long *retp) +{ + return ret; +} + static inline void pause_graph_tracing(void) { } static inline void unpause_graph_tracing(void) { } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f7212ec643e2..0cbe38a844fa 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -284,6 +284,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/** + * ftrace_graph_ret_addr - convert a potentially modified stack return address + * to its original value + * + * This function can be called by stack unwinding code to convert a found stack + * return address ('ret') to its original value, in case the function graph + * tracer has modified it to be 'return_to_handler'. If the address hasn't + * been modified, the unchanged value of 'ret' is returned. + * + * 'idx' is a state variable which should be initialized by the caller to zero + * before the first call. + * + * 'retp' is a pointer to the return address on the stack. It's ignored if + * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. 
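+ *
+ * A typical unwinder loop looks roughly like the sketch below; everything
+ * except ftrace_graph_ret_addr() itself is an illustrative placeholder:
+ *
+ *	int graph_idx = 0;
+ *	...
+ *	while (walk_next_frame(&addr, &retp)) {
+ *		addr = ftrace_graph_ret_addr(task, &graph_idx, addr, retp);
+ *		record_address(addr);
+ *	}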
+ */ +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int index = task->curr_ret_stack; + int i; + + if (ret != (unsigned long)return_to_handler) + return ret; + + if (index < -1) + index += FTRACE_NOTRACE_DEPTH; + + if (index < 0) + return ret; + + for (i = 0; i <= index; i++) + if (task->ret_stack[i].retp == retp) + return task->ret_stack[i].ret; + + return ret; +} +#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int task_idx; + + if (ret != (unsigned long)return_to_handler) + return ret; + + task_idx = task->curr_ret_stack; + + if (!task->ret_stack || task_idx < *idx) + return ret; + + task_idx -= *idx; + (*idx)++; + + return task->ret_stack[task_idx].ret; +} +#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ + int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, From 408fe5de2f2767059a9561e0ae6d4385d1b39dac Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:59 -0500 Subject: [PATCH 104/538] x86/dumpstack/ftrace: Convert dump_trace() callbacks to use ftrace_graph_ret_addr() Convert print_context_stack() and print_context_stack_bp() to use the arch-independent ftrace_graph_ret_addr() helper. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/56ec97cafc1bf2e34d1119e6443d897db406da86.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 65 +++++++++++++------------------------ 1 file changed, 22 insertions(+), 43 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 5f49c043500a..9bf3d021609c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -38,38 +38,6 @@ void printk_address(unsigned long address) pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void -print_ftrace_graph_addr(unsigned long addr, void *data, - const struct stacktrace_ops *ops, - struct task_struct *task, int *graph) -{ - unsigned long ret_addr; - int index; - - if (addr != (unsigned long)return_to_handler) - return; - - index = task->curr_ret_stack; - - if (!task->ret_stack || index < *graph) - return; - - index -= *graph; - ret_addr = task->ret_stack[index].ret; - - ops->address(data, ret_addr, 1); - - (*graph)++; -} -#else -static inline void -print_ftrace_graph_addr(unsigned long addr, void *data, - const struct stacktrace_ops *ops, - struct task_struct *task, int *graph) -{ } -#endif - /* * x86-64 can have up to three kernel stacks: * process stack @@ -107,18 +75,24 @@ print_context_stack(struct task_struct *task, stack = (unsigned long *)task_stack_page(task); while (valid_stack_ptr(task, stack, sizeof(*stack), end)) { - unsigned long addr; + unsigned long addr = *stack; - addr = *stack; if (__kernel_text_address(addr)) { + unsigned long real_addr; + int reliable = 0; + if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); + reliable = 1; frame = frame->next_frame; bp = (unsigned long) frame; - } else { - ops->address(data, addr, 0); } - 
print_ftrace_graph_addr(addr, data, ops, task, graph); + + ops->address(data, addr, reliable); + + real_addr = ftrace_graph_ret_addr(task, graph, addr, + stack); + if (real_addr != addr) + ops->address(data, real_addr, 1); } stack++; } @@ -133,19 +107,24 @@ print_context_stack_bp(struct task_struct *task, unsigned long *end, int *graph) { struct stack_frame *frame = (struct stack_frame *)bp; - unsigned long *ret_addr = &frame->return_address; + unsigned long *retp = &frame->return_address; - while (valid_stack_ptr(task, ret_addr, sizeof(*ret_addr), end)) { - unsigned long addr = *ret_addr; + while (valid_stack_ptr(task, retp, sizeof(*retp), end)) { + unsigned long addr = *retp; + unsigned long real_addr; if (!__kernel_text_address(addr)) break; if (ops->address(data, addr, 1)) break; + + real_addr = ftrace_graph_ret_addr(task, graph, addr, retp); + if (real_addr != addr) + ops->address(data, real_addr, 1); + frame = frame->next_frame; - ret_addr = &frame->return_address; - print_ftrace_graph_addr(addr, data, ops, task, graph); + retp = &frame->return_address; } return (unsigned long)frame; From 471bd10f5e2880bd91a2627d887f6062494cfe9c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:53:00 -0500 Subject: [PATCH 105/538] ftrace/x86: Implement HAVE_FUNCTION_GRAPH_RET_ADDR_PTR Use the more reliable version of ftrace_graph_ret_addr() so we no longer have to worry about the unwinder getting out of sync with the function graph ret_stack index, which can happen if the unwinder skips any frames before calling ftrace_graph_ret_addr(). This fixes this issue (and several others like it): $ cat /proc/self/stack [] save_stack_trace_tsk+0x22/0x40 [] proc_pid_stack+0xb9/0x110 [] proc_single_show+0x54/0x80 [] seq_read+0x108/0x3e0 [] __vfs_read+0x37/0x140 [] vfs_read+0x99/0x140 [] SyS_read+0x58/0xc0 [] entry_SYSCALL_64_fastpath+0x1f/0xbd [] 0xffffffffffffffff $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ cat /proc/self/stack [] return_to_handler+0x0/0x27 [] print_context_stack+0xfc/0x100 [] return_to_handler+0x0/0x27 [] dump_trace+0x12b/0x350 [] return_to_handler+0x0/0x27 [] save_stack_trace_tsk+0x22/0x40 [] return_to_handler+0x0/0x27 [] proc_pid_stack+0xb9/0x110 [] return_to_handler+0x0/0x27 [] proc_single_show+0x54/0x80 [] return_to_handler+0x0/0x27 [] seq_read+0x108/0x3e0 [] return_to_handler+0x0/0x27 [] __vfs_read+0x37/0x140 [] return_to_handler+0x0/0x27 [] vfs_read+0x99/0x140 [] 0xffffffffffffffff Enabling function graph tracing causes the stack trace to change in two ways: First, the real call addresses are confusingly interspersed with 'return_to_handler' addresses. This issue will be fixed by the next patch. Second, the stack trace is offset by two frames, because the unwinder skipped the first two frames and got out of sync with the ret_stack index. This patch fixes this issue. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a6d623e36f8d08f9a17bd74d804d201177a23afd.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 2 ++ arch/x86/kernel/ftrace.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 37f67cbba1c6..eccd0ac6bc38 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -14,6 +14,8 @@ #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + #ifndef __ASSEMBLY__ extern void mcount(void); extern atomic_t modifying_ftrace_code; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ae3b1fb2f582..8639bb2ae058 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, } if (ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer, NULL) == -EBUSY) { + frame_pointer, parent) == -EBUSY) { *parent = old; return; } From 6f727b84e23421721025f4eb1b4f6cea1d4d723a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:53:01 -0500 Subject: [PATCH 106/538] x86/dumpstack/ftrace: Mark function graph handler function as unreliable When function graph tracing is enabled for a function, its return address on the stack is replaced with the address of an ftrace handler (return_to_handler). Currently 'return_to_handler' can be reported as reliable. That's not ideal, and can actually be misleading. When saving or dumping the stack, you normally only care about what led up to that point (the call path), rather than what will happen in the future (the return path). That's especially true in the non-oops stack trace case, which isn't used for debugging. For example, in a perf profiling operation, reporting return_to_handler() in the trace would just be confusing. And in the oops case, where debugging is important, "unreliable" is also more appropriate there because it serves as a hint that graph tracing was involved, instead of trying to imply that return_to_handler() was the real caller. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f8af15749c7d632d3e7f815995831d5b7f82950d.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9bf3d021609c..6aad8d4e2ea6 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -87,12 +87,21 @@ print_context_stack(struct task_struct *task, bp = (unsigned long) frame; } - ops->address(data, addr, reliable); - + /* + * When function graph tracing is enabled for a + * function, its return address on the stack is + * replaced with the address of an ftrace handler + * (return_to_handler). In that case, before printing + * the "real" address, we want to print the handler + * address as an "unreliable" hint that function graph + * tracing was involved. 
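+			 * The handler address is therefore passed to
+			 * ops->address() with reliable == 0, while the real
+			 * address keeps the reliability computed from the
+			 * frame-pointer walk above.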
+ */ real_addr = ftrace_graph_ret_addr(task, graph, addr, stack); if (real_addr != addr) - ops->address(data, real_addr, 1); + ops->address(data, addr, 0); + + ops->address(data, real_addr, reliable); } stack++; } @@ -116,12 +125,11 @@ print_context_stack_bp(struct task_struct *task, if (!__kernel_text_address(addr)) break; - if (ops->address(data, addr, 1)) - break; - real_addr = ftrace_graph_ret_addr(task, graph, addr, retp); - if (real_addr != addr) - ops->address(data, real_addr, 1); + if (real_addr != addr && ops->address(data, addr, 0)) + break; + if (ops->address(data, real_addr, 1)) + break; frame = frame->next_frame; retp = &frame->return_address; From 13e25bab7e51bdd4ba7df1ef2388961294bb565e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:53:02 -0500 Subject: [PATCH 107/538] x86/dumpstack/ftrace: Don't print unreliable addresses in print_context_stack_bp() When function graph tracing is enabled, print_context_stack_bp() can report return_to_handler() as an unreliable address, which is confusing and misleading: return_to_handler() is really only useful as a hint for debugging, whereas print_context_stack_bp() users only care about the actual 'reliable' call path. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c51aef578d8027791b38d2ad9bac0c7f499fde91.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6aad8d4e2ea6..01072e9e165e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -126,8 +126,6 @@ print_context_stack_bp(struct task_struct *task, break; real_addr = ftrace_graph_ret_addr(task, graph, addr, retp); - if (real_addr != addr && ops->address(data, addr, 0)) - break; if (ops->address(data, real_addr, 1)) break; From b2d4c2edb2e4f89aaf85449dee3b87fbf0f8a4d4 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 18 Aug 2016 18:41:00 +0200 Subject: [PATCH 108/538] locking/hung_task: Show all locks When we get a hung task it can often be valuable to see _all_ the held locks on the system (in case we are being blocked on trying to acquire one), e.g. with this patch we can immediately see where the problem is below: INFO: task trinity-c3:14933 blocked for more than 120 seconds. Not tainted 4.8.0-rc1+ #135 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. trinity-c3 D ffff88010c16fc88 0 14933 1 0x00080004 ffff88010c16fc88 000000003b9aca00 0000000000000000 0000000000000296 00000000776cdf88 ffff88011a520ae0 ffff88011a520b08 ffff88011a520198 ffffffff867d7f00 ffff88011942c080 ffff880116841580 ffff88010c168000 Call Trace: [] schedule+0x77/0x230 [] __lock_sock+0x129/0x250 [] ? __sk_destruct+0x450/0x450 [] ? wake_bit_function+0x2e0/0x2e0 [] lock_sock_nested+0xeb/0x120 [] irda_setsockopt+0x65/0xb40 [] SyS_setsockopt+0x139/0x230 [] ? SyS_recv+0x20/0x20 [] ? trace_event_raw_event_sys_enter+0xb90/0xb90 [] ? __this_cpu_preempt_check+0x13/0x20 [] ? __context_tracking_exit.part.3+0x30/0x1b0 [] ? 
SyS_recv+0x20/0x20 [] do_syscall_64+0x1b3/0x4b0 [] entry_SYSCALL64_slow_path+0x25/0x25 Showing all locks held in the system: 2 locks held by khungtaskd/563: #0: (rcu_read_lock){......}, at: [] watchdog+0x106/0x910 #1: (tasklist_lock){......}, at: [] debug_show_all_locks+0x74/0x360 1 lock held by trinity-c0/19280: #0: (sk_lock-AF_IRDA){......}, at: [] irda_accept+0x176/0x10f0 1 lock held by trinity-c0/12865: #0: (sk_lock-AF_IRDA){......}, at: [] irda_accept+0x176/0x10f0 Signed-off-by: Vegard Nossum Cc: Andrew Morton Cc: Linus Torvalds Cc: Mandeep Singh Baines Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471538460-7505-1-git-send-email-vegard.nossum@oracle.com Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d234022805dc..432c3d71d195 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -117,7 +117,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_held_locks(t); + debug_show_all_locks(); touch_nmi_watchdog(); From 4e047aa7f267c3449b6d323510d35864829aca70 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:16 -0400 Subject: [PATCH 109/538] sched/x86/32, kgdb: Don't use thread.ip in sleeping_thread_to_gdb_regs() Match 64-bit and set gdb_regs[GDB_PC] to zero. thread.ip is always the same point in the scheduler (except for newly forked processes), and will be removed in a future patch. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jason Wessel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-2-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 04cde527d728..fe649a5f509f 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -172,7 +172,6 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_ES] = __KERNEL_DS; gdb_regs[GDB_PS] = 0; gdb_regs[GDB_CS] = __KERNEL_CS; - gdb_regs[GDB_PC] = p->thread.ip; gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; @@ -180,7 +179,6 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; @@ -190,6 +188,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_R14] = 0; gdb_regs[GDB_R15] = 0; #endif + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_SP] = p->thread.sp; } From 163630191ecb0dd9e4146d3c910045aba1cfeec1 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:17 -0400 Subject: [PATCH 110/538] sched/x86/64, kgdb: Clear GDB_PS on 64-bit switch_to() no longer saves EFLAGS, so it's bogus to look for it on the stack. Set it to zero like 32-bit. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jason Wessel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-3-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index fe649a5f509f..5e3f294ce264 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -176,7 +176,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; #else - gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); + gdb_regs32[GDB_PS] = 0; gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_R8] = 0; From 7b32aeadbc95d4a41402c1c0da6aa3ab51af4c10 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:18 -0400 Subject: [PATCH 111/538] sched/x86: Add 'struct inactive_task_frame' to better document the sleeping task stack frame Add 'struct inactive_task_frame', which defines the layout of the stack for a sleeping process. For now, the only defined field is the BP register (frame pointer). Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-4-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 4 ++-- arch/x86/include/asm/switch_to.h | 5 +++++ arch/x86/kernel/kgdb.c | 3 ++- arch/x86/kernel/process.c | 3 ++- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 0944218af9e2..7646fb2772f8 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -8,6 +8,7 @@ #include #include +#include extern int kstack_depth_to_print; @@ -70,8 +71,7 @@ stack_frame(struct task_struct *task, struct pt_regs *regs) return bp; } - /* bp is the last reg pushed by switch_to */ - return *(unsigned long *)task->thread.sp; + return ((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 14e4b20f0aaf..ec689c62c01f 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -30,6 +30,11 @@ static inline void prepare_switch_to(struct task_struct *prev, #endif } +/* data that is pointed to by thread.sp */ +struct inactive_task_frame { + unsigned long bp; +}; + #ifdef CONFIG_X86_32 #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 5e3f294ce264..8e36f249646e 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -50,6 +50,7 @@ #include #include #include +#include struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -166,7 +167,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_DX] = 0; gdb_regs[GDB_SI] = 0; gdb_regs[GDB_DI] = 0; - gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp; + gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp; #ifdef CONFIG_X86_32 gdb_regs[GDB_DS] = __KERNEL_DS; gdb_regs[GDB_ES] = __KERNEL_DS; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 62c0b0ea2ce4..0115a4a4db96 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -32,6 +32,7 @@ #include #include #include 
+#include /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -556,7 +557,7 @@ unsigned long get_wchan(struct task_struct *p) if (sp < bottom || sp > top) return 0; - fp = READ_ONCE_NOCHECK(*(unsigned long *)sp); + fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); do { if (fp < bottom || fp > top) return 0; From 0100301bfdf56a2a370c7157b5ab0fbf9313e1cd Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:19 -0400 Subject: [PATCH 112/538] sched/x86: Rewrite the switch_to() code Move the low-level context switch code to an out-of-line asm stub instead of using complex inline asm. This allows constructing a new stack frame for the child process to make it seamlessly flow to ret_from_fork without an extra test and branch in __switch_to(). It also improves code generation for __schedule() by using the C calling convention instead of clobbering all registers. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-5-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 37 ++++++++ arch/x86/entry/entry_64.S | 41 ++++++++- arch/x86/include/asm/processor.h | 3 - arch/x86/include/asm/switch_to.h | 139 +++++------------------------ arch/x86/include/asm/thread_info.h | 2 - arch/x86/kernel/asm-offsets.c | 6 ++ arch/x86/kernel/asm-offsets_32.c | 5 ++ arch/x86/kernel/asm-offsets_64.c | 5 ++ arch/x86/kernel/process_32.c | 9 +- arch/x86/kernel/process_64.c | 9 +- arch/x86/kernel/smpboot.c | 1 - 11 files changed, 125 insertions(+), 132 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 0b56666e6039..bf8f221f9c94 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -204,6 +204,43 @@ POP_GS_EX .endm +/* + * %eax: prev task + * %edx: next task + */ +ENTRY(__switch_to_asm) + /* + * Save callee-saved registers + * This must match the order in struct inactive_task_frame + */ + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + + /* switch stack */ + movl %esp, TASK_threadsp(%eax) + movl TASK_threadsp(%edx), %esp + +#ifdef CONFIG_CC_STACKPROTECTOR + movl TASK_stack_canary(%edx), %ebx + movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset +#endif + + /* restore callee-saved registers */ + popl %esi + popl %edi + popl %ebx + popl %ebp + + jmp __switch_to +END(__switch_to_asm) + +/* + * A newly forked process directly context switches into this address. 
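+ * (__switch_to_asm() returns to whatever address copy_thread_tls() stored
+ * in the ret_addr slot of struct inactive_task_frame; for a forked task
+ * that address is this label.)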
+ * + * eax: prev task we switched from + */ ENTRY(ret_from_fork) pushl %eax call schedule_tail diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f6b40e5c88f1..c1af8acd366b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -367,14 +367,49 @@ END(ptregs_\func) #define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) #include +/* + * %rdi: prev task + * %rsi: next task + */ +ENTRY(__switch_to_asm) + /* + * Save callee-saved registers + * This must match the order in inactive_task_frame + */ + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + /* switch stack */ + movq %rsp, TASK_threadsp(%rdi) + movq TASK_threadsp(%rsi), %rsp + +#ifdef CONFIG_CC_STACKPROTECTOR + movq TASK_stack_canary(%rsi), %rbx + movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset +#endif + + /* restore callee-saved registers */ + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + + jmp __switch_to +END(__switch_to_asm) + /* * A newly forked process directly context switches into this address. * - * rdi: prev task we switched from + * rax: prev task we switched from */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK, TI_flags(%r8) - + movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ testb $3, CS(%rsp) /* from kernel_thread? */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 63def9537a2d..6fee8635340b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -389,9 +389,6 @@ struct thread_struct { unsigned short fsindex; unsigned short gsindex; #endif -#ifdef CONFIG_X86_32 - unsigned long ip; -#endif #ifdef CONFIG_X86_64 unsigned long fsbase; unsigned long gsbase; diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index ec689c62c01f..886d5ea09dba 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -2,8 +2,12 @@ #define _ASM_X86_SWITCH_TO_H struct task_struct; /* one of the stranger aspects of C forward declarations */ + +struct task_struct *__switch_to_asm(struct task_struct *prev, + struct task_struct *next); + __visible struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); + struct task_struct *next); struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); @@ -32,131 +36,30 @@ static inline void prepare_switch_to(struct task_struct *prev, /* data that is pointed to by thread.sp */ struct inactive_task_frame { +#ifdef CONFIG_X86_64 + unsigned long r15; + unsigned long r14; + unsigned long r13; + unsigned long r12; +#else + unsigned long si; + unsigned long di; +#endif + unsigned long bx; unsigned long bp; + unsigned long ret_addr; }; -#ifdef CONFIG_X86_32 - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movl %P[task_canary](%[next]), %%ebx\n\t" \ - "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" -#define __switch_canary_oparam \ - , [stack_canary] "=m" (stack_canary.canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ +struct fork_frame { + struct inactive_task_frame frame; + struct pt_regs regs; +}; -/* - * Saving eflags is important. It switches not only IOPL between tasks, - * it also protects other tasks from NT leaking through sysenter etc. 
- */ #define switch_to(prev, next, last) \ do { \ - /* \ - * Context-switching clobbers all registers, so we clobber \ - * them explicitly, via unused output variables. \ - * (EAX and EBP is not listed because EBP is saved/restored \ - * explicitly for wchan access and EAX is the return value of \ - * __switch_to()) \ - */ \ - unsigned long ebx, ecx, edx, esi, edi; \ - \ prepare_switch_to(prev, next); \ \ - asm volatile("pushl %%ebp\n\t" /* save EBP */ \ - "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ - "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ - "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ - "pushl %[next_ip]\n\t" /* restore EIP */ \ - __switch_canary \ - "jmp __switch_to\n" /* regparm call */ \ - "1:\t" \ - "popl %%ebp\n\t" /* restore EBP */ \ - \ - /* output parameters */ \ - : [prev_sp] "=m" (prev->thread.sp), \ - [prev_ip] "=m" (prev->thread.ip), \ - "=a" (last), \ - \ - /* clobbered output registers: */ \ - "=b" (ebx), "=c" (ecx), "=d" (edx), \ - "=S" (esi), "=D" (edi) \ - \ - __switch_canary_oparam \ - \ - /* input parameters: */ \ - : [next_sp] "m" (next->thread.sp), \ - [next_ip] "m" (next->thread.ip), \ - \ - /* regparm parameters for __switch_to(): */ \ - [prev] "a" (prev), \ - [next] "d" (next) \ - \ - __switch_canary_iparam \ - \ - : /* reloaded segment registers */ \ - "memory"); \ + ((last) = __switch_to_asm((prev), (next))); \ } while (0) -#else /* CONFIG_X86_32 */ - -/* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" - -#define __EXTRA_CLOBBER \ - , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15", "flags" - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movq %P[task_canary](%%rsi),%%r8\n\t" \ - "movq %%r8,"__percpu_arg([gs_canary])"\n\t" -#define __switch_canary_oparam \ - , [gs_canary] "=m" (irq_stack_union.stack_canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* - * There is no need to save or restore flags, because flags are always - * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL - * has no effect. 
- */ -#define switch_to(prev, next, last) \ - prepare_switch_to(prev, next); \ - \ - asm volatile(SAVE_CONTEXT \ - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ - "movq "__percpu_arg([current_task])",%%rsi\n\t" \ - __switch_canary \ - "movq %P[thread_info](%%rsi),%%r8\n\t" \ - "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ - "jnz ret_from_fork\n\t" \ - RESTORE_CONTEXT \ - : "=a" (last) \ - __switch_canary_oparam \ - : [next] "S" (next), [prev] "D" (prev), \ - [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ - [ti_flags] "i" (offsetof(struct thread_info, flags)), \ - [_tif_fork] "i" (_TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [current_task] "m" (current_task) \ - __switch_canary_iparam \ - : "memory", "cc" __EXTRA_CLOBBER) - -#endif /* CONFIG_X86_32 */ - #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8b7c8d8e0852..494c4b5ada34 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -95,7 +95,6 @@ struct thread_info { #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ -#define TIF_FORK 18 /* ret_from_fork */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ @@ -119,7 +118,6 @@ struct thread_info { #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) -#define _TIF_FORK (1 << TIF_FORK) #define _TIF_NOHZ (1 << TIF_NOHZ) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 2bd5c6ff7ee7..db3a0af9b9ec 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -28,6 +28,12 @@ #endif void common(void) { + BLANK(); + OFFSET(TASK_threadsp, task_struct, thread.sp); +#ifdef CONFIG_CC_STACKPROTECTOR + OFFSET(TASK_stack_canary, task_struct, stack_canary); +#endif + BLANK(); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index ecdc1d217dc0..880aa093268d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -57,6 +57,11 @@ void foo(void) /* Size of SYSENTER_stack */ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); +#ifdef CONFIG_CC_STACKPROTECTOR + BLANK(); + OFFSET(stack_canary_offset, stack_canary, canary); +#endif + #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d875f97d4e0b..210927ee2e74 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -56,6 +56,11 @@ int main(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); BLANK(); +#ifdef CONFIG_CC_STACKPROTECTOR + DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + BLANK(); +#endif + DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); diff --git 
a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d86be29c38c7..4bedbc08e53c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -133,17 +133,20 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); + struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs); + struct inactive_task_frame *frame = &fork_frame->frame; struct task_struct *tsk; int err; - p->thread.sp = (unsigned long) childregs; + frame->bp = 0; + p->thread.sp = (unsigned long) fork_frame; p->thread.sp0 = (unsigned long) (childregs+1); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - p->thread.ip = (unsigned long) ret_from_kernel_thread; + frame->ret_addr = (unsigned long) ret_from_kernel_thread; task_user_gs(p) = __KERNEL_STACK_CANARY; childregs->ds = __USER_DS; childregs->es = __USER_DS; @@ -161,7 +164,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, if (sp) childregs->sp = sp; - p->thread.ip = (unsigned long) ret_from_fork; + frame->ret_addr = (unsigned long) ret_from_fork; task_user_gs(p) = get_user_gs(current_pt_regs()); p->thread.io_bitmap_ptr = NULL; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 63236d8f84bf..827eeed03e16 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -141,12 +141,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, { int err; struct pt_regs *childregs; + struct fork_frame *fork_frame; + struct inactive_task_frame *frame; struct task_struct *me = current; p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); - p->thread.sp = (unsigned long) childregs; - set_tsk_thread_flag(p, TIF_FORK); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; + frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c85d2c636092..7e52f83d3a4b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -942,7 +942,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) per_cpu(cpu_current_top_of_stack, cpu) = (unsigned long)task_stack_page(idle) + THREAD_SIZE; #else - clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); #endif } From 616d24835eeafa8ef3466479db028abfdfc77531 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:20 -0400 Subject: [PATCH 113/538] sched/x86: Pass kernel thread parameters in 'struct fork_frame' Instead of setting up a fake pt_regs context, put the kernel thread function pointer and arg into the unused callee-restored registers of 'struct fork_frame'. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-6-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 31 +++++++++++++------------- arch/x86/entry/entry_64.S | 37 +++++++++++++++----------------- arch/x86/include/asm/switch_to.h | 2 ++ arch/x86/kernel/process_32.c | 18 ++++------------ arch/x86/kernel/process_64.c | 12 +++-------- 5 files changed, 41 insertions(+), 59 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index bf8f221f9c94..b75a8bcd2d23 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -240,35 +240,34 @@ END(__switch_to_asm) * A newly forked process directly context switches into this address. * * eax: prev task we switched from + * ebx: kernel thread func (NULL for user thread) + * edi: kernel thread arg */ ENTRY(ret_from_fork) pushl %eax call schedule_tail popl %eax + testl %ebx, %ebx + jnz 1f /* kernel threads are uncommon */ + +2: /* When we fork, we trace the syscall return in the child, too. */ movl %esp, %eax call syscall_return_slowpath jmp restore_all -END(ret_from_fork) - -ENTRY(ret_from_kernel_thread) - pushl %eax - call schedule_tail - popl %eax - movl PT_EBP(%esp), %eax - call *PT_EBX(%esp) - movl $0, PT_EAX(%esp) + /* kernel thread */ +1: movl %edi, %eax + call *%ebx /* - * Kernel threads return to userspace as if returning from a syscall. - * We should check whether anything actually uses this path and, if so, - * consider switching it over to ret_from_fork. + * A kernel thread is allowed to return here after successfully + * calling do_execve(). Exit to userspace to complete the execve() + * syscall. */ - movl %esp, %eax - call syscall_return_slowpath - jmp restore_all -ENDPROC(ret_from_kernel_thread) + movl $0, PT_EAX(%esp) + jmp 2b +END(ret_from_fork) /* * Return to user mode is not as complex as all this looks, diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c1af8acd366b..c0373d667674 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -407,37 +407,34 @@ END(__switch_to_asm) * A newly forked process directly context switches into this address. * * rax: prev task we switched from + * rbx: kernel thread func (NULL for user thread) + * r12: kernel thread arg */ ENTRY(ret_from_fork) movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ - testb $3, CS(%rsp) /* from kernel_thread? */ - jnz 1f - - /* - * We came from kernel_thread. This code path is quite twisted, and - * someone should clean it up. - * - * copy_thread_tls stashes the function pointer in RBX and the - * parameter to be passed in RBP. The called function is permitted - * to call do_execve and thereby jump to user mode. - */ - movq RBP(%rsp), %rdi - call *RBX(%rsp) - movl $0, RAX(%rsp) - - /* - * Fall through as though we're exiting a syscall. This makes a - * twisted sort of sense if we just called do_execve. - */ + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ -1: +2: movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS jmp restore_regs_and_iret + +1: + /* kernel thread */ + movq %r12, %rdi + call *%rbx + /* + * A kernel thread is allowed to return here after successfully + * calling do_execve(). Exit to userspace to complete the execve() + * syscall. 
+ */ + movq $0, RAX(%rsp) + jmp 2b END(ret_from_fork) /* diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 886d5ea09dba..5cb436acd463 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -34,6 +34,8 @@ static inline void prepare_switch_to(struct task_struct *prev, #endif } +asmlinkage void ret_from_fork(void); + /* data that is pointed to by thread.sp */ struct inactive_task_frame { #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4bedbc08e53c..18714a191b2d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,9 +55,6 @@ #include #include -asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); - /* * Return saved PC of a blocked thread. */ @@ -139,6 +136,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, int err; frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; p->thread.sp0 = (unsigned long) (childregs+1); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); @@ -146,25 +144,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - frame->ret_addr = (unsigned long) ret_from_kernel_thread; - task_user_gs(p) = __KERNEL_STACK_CANARY; - childregs->ds = __USER_DS; - childregs->es = __USER_DS; - childregs->fs = __KERNEL_PERCPU; - childregs->bx = sp; /* function */ - childregs->bp = arg; - childregs->orig_ax = -1; - childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + frame->bx = sp; /* function */ + frame->di = arg; p->thread.io_bitmap_ptr = NULL; return 0; } + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; if (sp) childregs->sp = sp; - frame->ret_addr = (unsigned long) ret_from_fork; task_user_gs(p) = get_user_gs(current_pt_regs()); p->thread.io_bitmap_ptr = NULL; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 827eeed03e16..b812cd0d7889 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -50,8 +50,6 @@ #include #include -asmlinkage extern void ret_from_fork(void); - __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); /* Prints also some state that isn't saved in the pt_regs */ @@ -165,15 +163,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->sp = (unsigned long)childregs; - childregs->ss = __KERNEL_DS; - childregs->bx = sp; /* function */ - childregs->bp = arg; - childregs->orig_ax = -1; - childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + frame->bx = sp; /* function */ + frame->r12 = arg; return 0; } + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; From ffcb043ba524d3fbd979a9dac2c9ce8ad352000d Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:21 -0400 Subject: [PATCH 114/538] sched/x86: Fix thread_saved_pc() thread_saved_pc() was using a completely bogus method to get the return address. Since switch_to() was previously inlined, there was no sane way to know where on the stack the return address was stored. 
Now with the frame of a sleeping thread well defined, this can be implemented correctly. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-7-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 10 ++-------- arch/x86/kernel/process.c | 11 +++++++++++ arch/x86/kernel/process_32.c | 8 -------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6fee8635340b..b22fb5a4ff3c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -721,8 +721,6 @@ static inline void spin_lock_prefetch(const void *x) .addr_limit = KERNEL_DS, \ } -extern unsigned long thread_saved_pc(struct task_struct *tsk); - /* * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. * This is necessary to guarantee that the entire "struct pt_regs" @@ -773,17 +771,13 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); .addr_limit = KERNEL_DS, \ } -/* - * Return saved PC of a blocked thread. - * What is this good for? it will be always the scheduler or ret_from_fork. - */ -#define thread_saved_pc(t) READ_ONCE_NOCHECK(*(unsigned long *)((t)->thread.sp - 8)) - #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ +extern unsigned long thread_saved_pc(struct task_struct *tsk); + extern void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0115a4a4db96..c1fa790c81cd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -513,6 +513,17 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) return randomize_range(mm->brk, range_end, 0) ? : mm->brk; } +/* + * Return saved PC of a blocked thread. + * What is this good for? it will be always the scheduler or ret_from_fork. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + struct inactive_task_frame *frame = + (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp); + return READ_ONCE_NOCHECK(frame->ret_addr); +} + /* * Called from fs/proc with a reference on @p to find the function * which called into schedule(). This needs to be done carefully diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 18714a191b2d..404efdfa083b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,14 +55,6 @@ #include #include -/* - * Return saved PC of a blocked thread. - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return ((unsigned long *)tsk->thread.sp)[3]; -} - void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; From 01175255fd8e3e993353a779f819ec8c0c59137e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:22 -0400 Subject: [PATCH 115/538] sched: Remove __schedule() non-standard frame annotation Now that the x86 switch_to() uses the standard C calling convention, the STACK_FRAME_NON_STANDARD() annotation is no longer needed. Suggested-by: Josh Poimboeuf Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-8-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a906f20fba7..3d91b63dd2f6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3381,7 +3381,6 @@ static void __sched notrace __schedule(bool preempt) balance_callback(rq); } -STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ static inline void sched_submit_work(struct task_struct *tsk) { From 0d06108c65e572085b2d1f7c8273f417cad68734 Mon Sep 17 00:00:00 2001 From: Tiffany Lin Date: Sun, 14 Aug 2016 23:31:13 -0300 Subject: [PATCH 116/538] [media] vcodec:mediatek:code refine for v4l2 Encoder driver This patch remove unused header and define from haeder files Signed-off-by: Tiffany Lin Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/mtk-vcodec/mtk_vcodec_drv.h | 1 - drivers/media/platform/mtk-vcodec/mtk_vcodec_intr.h | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/media/platform/mtk-vcodec/mtk_vcodec_drv.h b/drivers/media/platform/mtk-vcodec/mtk_vcodec_drv.h index 94f0a425be42..3a8e6958adae 100644 --- a/drivers/media/platform/mtk-vcodec/mtk_vcodec_drv.h +++ b/drivers/media/platform/mtk-vcodec/mtk_vcodec_drv.h @@ -23,7 +23,6 @@ #include #include -#include "mtk_vcodec_util.h" #define MTK_VCODEC_DRV_NAME "mtk_vcodec_drv" #define MTK_VCODEC_ENC_NAME "mtk-vcodec-enc" diff --git a/drivers/media/platform/mtk-vcodec/mtk_vcodec_intr.h b/drivers/media/platform/mtk-vcodec/mtk_vcodec_intr.h index 33e890f5aa9c..12131855b46a 100644 --- a/drivers/media/platform/mtk-vcodec/mtk_vcodec_intr.h +++ b/drivers/media/platform/mtk-vcodec/mtk_vcodec_intr.h @@ -16,7 +16,6 @@ #define _MTK_VCODEC_INTR_H_ #define MTK_INST_IRQ_RECEIVED 0x1 -#define MTK_INST_WORK_THREAD_ABORT_DONE 0x2 struct mtk_vcodec_ctx; From ad34f5412d2a04a894b2cd2912538ae2e5d64e76 Mon Sep 17 00:00:00 2001 From: Tiffany Lin Date: Sun, 14 Aug 2016 23:47:20 -0300 Subject: [PATCH 117/538] [media] vcodec:mediatek: Fix fops_vcodec_release flow for V4L2 Encoder This patch fix that mtk_vcodec_venc_release should be called after v4l2_m2m_ctx_release Signed-off-by: Tiffany Lin Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c | 7 ++++++- drivers/media/platform/mtk-vcodec/mtk_vcodec_enc_drv.c | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c b/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c index 3ed3f2d31df5..3b0691f2deb4 100644 --- a/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c +++ b/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c @@ -1288,5 +1288,10 @@ int mtk_venc_lock(struct mtk_vcodec_ctx *ctx) void mtk_vcodec_enc_release(struct mtk_vcodec_ctx *ctx) { - venc_if_deinit(ctx); + int ret = venc_if_deinit(ctx); + + if (ret) + mtk_v4l2_err("venc_if_deinit failed=%d", ret); + + ctx->state = MTK_STATE_FREE; } diff --git a/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc_drv.c b/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc_drv.c index c7806ecda2dd..5cd2151431bf 100644 --- a/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc_drv.c +++ b/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc_drv.c @@ -218,11 +218,15 @@ static int fops_vcodec_release(struct file *file) mtk_v4l2_debug(1, "[%d] encoder", ctx->id); mutex_lock(&dev->dev_mutex); + /* + * 
Call v4l2_m2m_ctx_release to make sure the worker thread is not + * running after venc_if_deinit. + */ + v4l2_m2m_ctx_release(ctx->m2m_ctx); mtk_vcodec_enc_release(ctx); v4l2_fh_del(&ctx->fh); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->ctrl_hdl); - v4l2_m2m_ctx_release(ctx->m2m_ctx); list_del_init(&ctx->list); dev->num_instances--; From 91ae0e1ec6ec91cd297933886b424f9a4a8acbd4 Mon Sep 17 00:00:00 2001 From: Tiffany Lin Date: Mon, 15 Aug 2016 00:08:03 -0300 Subject: [PATCH 118/538] [media] vcodec:mediatek: Fix visible_height larger than coded_height issue in s_fmt_out The original code add extra 32 line to visible_height. It is incorrect, 32 line should be add to coded_height. The purpose is that user space could calcuate real buffer size needed by using coded_width * coded_height. But this method will make v4l2-compliance test fail, since g_fmt != s_fmt(g_fmt) So remove extend visible_height or coded_height, user space should just use sizeimage to get real buffer size needed Signed-off-by: Tiffany Lin Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c b/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c index 3b0691f2deb4..9b0187ecfa6a 100644 --- a/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c +++ b/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c @@ -487,7 +487,6 @@ static int vidioc_venc_s_fmt_out(struct file *file, void *priv, struct mtk_q_data *q_data; int ret, i; struct mtk_video_fmt *fmt; - unsigned int pitch_w_div16; struct v4l2_pix_format_mplane *pix_fmt_mp = &f->fmt.pix_mp; vq = v4l2_m2m_get_vq(ctx->m2m_ctx, f->type); @@ -530,15 +529,6 @@ static int vidioc_venc_s_fmt_out(struct file *file, void *priv, q_data->coded_width = f->fmt.pix_mp.width; q_data->coded_height = f->fmt.pix_mp.height; - pitch_w_div16 = DIV_ROUND_UP(q_data->visible_width, 16); - if (pitch_w_div16 % 8 != 0) { - /* Adjust returned width/height, so application could correctly - * allocate hw required memory - */ - q_data->visible_height += 32; - vidioc_try_fmt(f, q_data->fmt); - } - q_data->field = f->fmt.pix_mp.field; ctx->colorspace = f->fmt.pix_mp.colorspace; ctx->ycbcr_enc = f->fmt.pix_mp.ycbcr_enc; From 16060f7ef660a11f282909b01fb6096e21cf5389 Mon Sep 17 00:00:00 2001 From: Tiffany Lin Date: Mon, 15 Aug 2016 00:15:44 -0300 Subject: [PATCH 119/538] [media] vcodec:mediatek: Add timestamp and timecode copy for V4L2 Encoder This patch add copying timestamp and timecode from src buffer to dst buffer Signed-off-by: Tiffany Lin Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../platform/mtk-vcodec/mtk_vcodec_enc.c | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c b/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c index 9b0187ecfa6a..0ca230e0a812 100644 --- a/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c +++ b/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c @@ -868,7 +868,8 @@ static int mtk_venc_encode_header(void *priv) { struct mtk_vcodec_ctx *ctx = priv; int ret; - struct vb2_buffer *dst_buf; + struct vb2_buffer *src_buf, *dst_buf; + struct vb2_v4l2_buffer *dst_vb2_v4l2, *src_vb2_v4l2; struct mtk_vcodec_mem bs_buf; struct venc_done_result enc_result; @@ -901,6 +902,15 @@ static int mtk_venc_encode_header(void *priv) mtk_v4l2_err("venc_if_encode failed=%d", ret); return -EINVAL; } + src_buf = 
v4l2_m2m_next_src_buf(ctx->m2m_ctx); + if (src_buf) { + src_vb2_v4l2 = to_vb2_v4l2_buffer(src_buf); + dst_vb2_v4l2 = to_vb2_v4l2_buffer(dst_buf); + dst_buf->timestamp = src_buf->timestamp; + dst_vb2_v4l2->timecode = src_vb2_v4l2->timecode; + } else { + mtk_v4l2_err("No timestamp for the header buffer."); + } ctx->state = MTK_STATE_HEADER; dst_buf->planes[0].bytesused = enc_result.bs_size; @@ -993,7 +1003,7 @@ static void mtk_venc_worker(struct work_struct *work) struct mtk_vcodec_mem bs_buf; struct venc_done_result enc_result; int ret, i; - struct vb2_v4l2_buffer *vb2_v4l2; + struct vb2_v4l2_buffer *dst_vb2_v4l2, *src_vb2_v4l2; /* check dst_buf, dst_buf may be removed in device_run * to stored encdoe header so we need check dst_buf and @@ -1033,9 +1043,14 @@ static void mtk_venc_worker(struct work_struct *work) ret = venc_if_encode(ctx, VENC_START_OPT_ENCODE_FRAME, &frm_buf, &bs_buf, &enc_result); - vb2_v4l2 = container_of(dst_buf, struct vb2_v4l2_buffer, vb2_buf); + src_vb2_v4l2 = to_vb2_v4l2_buffer(src_buf); + dst_vb2_v4l2 = to_vb2_v4l2_buffer(dst_buf); + + dst_buf->timestamp = src_buf->timestamp; + dst_vb2_v4l2->timecode = src_vb2_v4l2->timecode; + if (enc_result.is_key_frm) - vb2_v4l2->flags |= V4L2_BUF_FLAG_KEYFRAME; + dst_vb2_v4l2->flags |= V4L2_BUF_FLAG_KEYFRAME; if (ret) { v4l2_m2m_buf_done(to_vb2_v4l2_buffer(src_buf), From 158d6071bc0aad6663109d2fe9249c3cf570d423 Mon Sep 17 00:00:00 2001 From: Tiffany Lin Date: Mon, 15 Aug 2016 00:26:02 -0300 Subject: [PATCH 120/538] [media] vcodec:mediatek: change H264 profile default to profile high This patch change default H264 profile from V4L2_MPEG_VIDEO_H264_PROFILE_MAIN to V4L2_MPEG_VIDEO_H264_PROFILE_HIGH Signed-off-by: Tiffany Lin Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c b/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c index 0ca230e0a812..2c5719ac23b2 100644 --- a/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c +++ b/drivers/media/platform/mtk-vcodec/mtk_vcodec_enc.c @@ -1222,7 +1222,7 @@ int mtk_vcodec_enc_ctrls_setup(struct mtk_vcodec_ctx *ctx) 0, V4L2_MPEG_VIDEO_HEADER_MODE_SEPARATE); v4l2_ctrl_new_std_menu(handler, ops, V4L2_CID_MPEG_VIDEO_H264_PROFILE, V4L2_MPEG_VIDEO_H264_PROFILE_HIGH, - 0, V4L2_MPEG_VIDEO_H264_PROFILE_MAIN); + 0, V4L2_MPEG_VIDEO_H264_PROFILE_HIGH); v4l2_ctrl_new_std_menu(handler, ops, V4L2_CID_MPEG_VIDEO_H264_LEVEL, V4L2_MPEG_VIDEO_H264_LEVEL_4_2, 0, V4L2_MPEG_VIDEO_H264_LEVEL_4_0); From 2d683b6dad73b5636297ac4978f73f2c638a0b19 Mon Sep 17 00:00:00 2001 From: Tiffany Lin Date: Mon, 15 Aug 2016 00:33:32 -0300 Subject: [PATCH 121/538] [media] vcodec:mediatek: Refine H264 encoder driver This patch : 1. remove field and function that unused anymore 2. 
add support V4L2_MPEG_VIDEO_H264_LEVEL_4_2 Signed-off-by: Tiffany Lin Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../platform/mtk-vcodec/venc/venc_h264_if.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/media/platform/mtk-vcodec/venc/venc_h264_if.c b/drivers/media/platform/mtk-vcodec/venc/venc_h264_if.c index 9a600525b3c1..63d4be4ff327 100644 --- a/drivers/media/platform/mtk-vcodec/venc/venc_h264_if.c +++ b/drivers/media/platform/mtk-vcodec/venc/venc_h264_if.c @@ -61,6 +61,8 @@ enum venc_h264_bs_mode { /* * struct venc_h264_vpu_config - Structure for h264 encoder configuration + * AP-W/R : AP is writer/reader on this item + * VPU-W/R: VPU is write/reader on this item * @input_fourcc: input fourcc * @bitrate: target bitrate (in bps) * @pic_w: picture width. Picture size is visible stream resolution, in pixels, @@ -94,13 +96,13 @@ struct venc_h264_vpu_config { /* * struct venc_h264_vpu_buf - Structure for buffer information - * @align: buffer alignment (in bytes) + * AP-W/R : AP is writer/reader on this item + * VPU-W/R: VPU is write/reader on this item * @iova: IO virtual address * @vpua: VPU side memory addr which is used by RC_CODE * @size: buffer size (in bytes) */ struct venc_h264_vpu_buf { - u32 align; u32 iova; u32 vpua; u32 size; @@ -108,6 +110,8 @@ struct venc_h264_vpu_buf { /* * struct venc_h264_vsi - Structure for VPU driver control and info share + * AP-W/R : AP is writer/reader on this item + * VPU-W/R: VPU is write/reader on this item * This structure is allocated in VPU side and shared to AP side. * @config: h264 encoder configuration * @work_bufs: working buffer information in VPU side @@ -150,12 +154,6 @@ struct venc_h264_inst { struct mtk_vcodec_ctx *ctx; }; -static inline void h264_write_reg(struct venc_h264_inst *inst, u32 addr, - u32 val) -{ - writel(val, inst->hw_base + addr); -} - static inline u32 h264_read_reg(struct venc_h264_inst *inst, u32 addr) { return readl(inst->hw_base + addr); @@ -214,6 +212,8 @@ static unsigned int h264_get_level(struct venc_h264_inst *inst, return 40; case V4L2_MPEG_VIDEO_H264_LEVEL_4_1: return 41; + case V4L2_MPEG_VIDEO_H264_LEVEL_4_2: + return 42; default: mtk_vcodec_debug(inst, "unsupported level %d", level); return 31; From 19d6837a52f1683cf448265952d559a44a7df924 Mon Sep 17 00:00:00 2001 From: Tiffany Lin Date: Mon, 15 Aug 2016 00:37:19 -0300 Subject: [PATCH 122/538] [media] vcodec:mediatek: Refine VP8 encoder driver This patch remove field and function that unused anymore Signed-off-by: Tiffany Lin Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../media/platform/mtk-vcodec/venc/venc_vp8_if.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/media/platform/mtk-vcodec/venc/venc_vp8_if.c b/drivers/media/platform/mtk-vcodec/venc/venc_vp8_if.c index 60bbcd2a0510..6d9758479f9a 100644 --- a/drivers/media/platform/mtk-vcodec/venc/venc_vp8_if.c +++ b/drivers/media/platform/mtk-vcodec/venc/venc_vp8_if.c @@ -56,6 +56,8 @@ enum venc_vp8_vpu_work_buf { /* * struct venc_vp8_vpu_config - Structure for vp8 encoder configuration + * AP-W/R : AP is writer/reader on this item + * VPU-W/R: VPU is write/reader on this item * @input_fourcc: input fourcc * @bitrate: target bitrate (in bps) * @pic_w: picture width. 
Picture size is visible stream resolution, in pixels, @@ -83,14 +85,14 @@ struct venc_vp8_vpu_config { }; /* - * struct venc_vp8_vpu_buf -Structure for buffer information - * @align: buffer alignment (in bytes) + * struct venc_vp8_vpu_buf - Structure for buffer information + * AP-W/R : AP is writer/reader on this item + * VPU-W/R: VPU is write/reader on this item * @iova: IO virtual address * @vpua: VPU side memory addr which is used by RC_CODE * @size: buffer size (in bytes) */ struct venc_vp8_vpu_buf { - u32 align; u32 iova; u32 vpua; u32 size; @@ -98,6 +100,8 @@ struct venc_vp8_vpu_buf { /* * struct venc_vp8_vsi - Structure for VPU driver control and info share + * AP-W/R : AP is writer/reader on this item + * VPU-W/R: VPU is write/reader on this item * This structure is allocated in VPU side and shared to AP side. * @config: vp8 encoder configuration * @work_bufs: working buffer information in VPU side @@ -138,12 +142,6 @@ struct venc_vp8_inst { struct mtk_vcodec_ctx *ctx; }; -static inline void vp8_enc_write_reg(struct venc_vp8_inst *inst, u32 addr, - u32 val) -{ - writel(val, inst->hw_base + addr); -} - static inline u32 vp8_enc_read_reg(struct venc_vp8_inst *inst, u32 addr) { return readl(inst->hw_base + addr); From 936523441bb64cdc9a5b263e8fd2782e70313a57 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 6 Aug 2016 15:50:52 +0200 Subject: [PATCH 123/538] batman-adv: Add missing refcnt for last_candidate batadv_find_router dereferences last_bonding_candidate from orig_node without making sure that it has a valid reference. This reference has to be retrieved by increasing the reference counter while holding neigh_list_lock. The lock is required to avoid that batadv_last_bonding_replace removes the current last_bonding_candidate, reduces the reference counter and maybe destroys the object in this process. Fixes: f3b3d9018975 ("batman-adv: add bonding again") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/routing.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 7602c001e92b..3d199478c405 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -469,6 +469,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, return 0; } +/** + * batadv_last_bonding_get - Get last_bonding_candidate of orig_node + * @orig_node: originator node whose last bonding candidate should be retrieved + * + * Return: last bonding candidate of router or NULL if not found + * + * The object is returned with refcounter increased by 1. + */ +static struct batadv_orig_ifinfo * +batadv_last_bonding_get(struct batadv_orig_node *orig_node) +{ + struct batadv_orig_ifinfo *last_bonding_candidate; + + spin_lock_bh(&orig_node->neigh_list_lock); + last_bonding_candidate = orig_node->last_bonding_candidate; + + if (last_bonding_candidate) + kref_get(&last_bonding_candidate->refcount); + spin_unlock_bh(&orig_node->neigh_list_lock); + + return last_bonding_candidate; +} + /** * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node * @orig_node: originator node whose bonding candidates should be replaced @@ -539,7 +562,7 @@ batadv_find_router(struct batadv_priv *bat_priv, * router - obviously there are no other candidates. 
*/ rcu_read_lock(); - last_candidate = orig_node->last_bonding_candidate; + last_candidate = batadv_last_bonding_get(orig_node); if (last_candidate) last_cand_router = rcu_dereference(last_candidate->router); @@ -631,6 +654,9 @@ batadv_find_router(struct batadv_priv *bat_priv, batadv_orig_ifinfo_put(next_candidate); } + if (last_candidate) + batadv_orig_ifinfo_put(last_candidate); + return router; } From 1e5d343b8f23770e8ac5d31f5c439826bdb35148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Tue, 23 Aug 2016 03:13:03 +0200 Subject: [PATCH 124/538] batman-adv: fix elp packet data reservation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The skb_reserve() call only reserved headroom for the mac header, but not the elp packet header itself. Fixing this by using skb_put()'ing towards the skb tail instead of skb_push()'ing towards the skb head. Fixes: d6f94d91f766 ("batman-adv: ELP - adding basic infrastructure") Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 7d170010beb9..ee08540ce503 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -335,7 +335,7 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) goto out; skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN); - elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); + elp_buff = skb_put(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); elp_packet = (struct batadv_elp_packet *)elp_buff; memset(elp_packet, 0, BATADV_ELP_HLEN); From b4d90e9f1ef1f19dcb2b1b1942c786c9c4225460 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Thu, 23 Jun 2016 20:50:37 +0200 Subject: [PATCH 125/538] hrtimer: Spelling fixes Fix a minor spelling error. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: Pratyush Patel [jstultz: Added commit message] Signed-off-by: John Stultz --- kernel/time/hrtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9ba7c820fc23..252ea4741592 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -703,7 +703,7 @@ static void clock_was_set_work(struct work_struct *work) static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* - * Called from timekeeping and resume code to reprogramm the hrtimer + * Called from timekeeping and resume code to reprogram the hrtimer * interrupt device on all cpus. */ void clock_was_set_delayed(void) @@ -1241,7 +1241,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, /* * Note: We clear the running state after enqueue_hrtimer and - * we do not reprogramm the event hardware. Happens either in + * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, From 36374583f9084cdab4b5dcf5521a3ce55bebb9fa Mon Sep 17 00:00:00 2001 From: Kyle Walker Date: Sat, 6 Aug 2016 12:07:30 -0400 Subject: [PATCH 126/538] clocksource: Defer override invalidation unless clock is unstable Clocksources don't get the VALID_FOR_HRES flag until they have been checked by a watchdog. 
However, when using an override, the clocksource_select logic will clear the override value if the clocksource is not marked VALID_FOR_HRES during that inititial check. When using the boot arguments clocksource=, this selection can run before the watchdog, and can cause the override to be incorrectly cleared. To address this condition, the override_name is only invalidated for unstable clocksources. Otherwise, the override is left intact until after the watchdog has validated the clocksource as stable/unstable. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Martin Schwidefsky Signed-off-by: Kyle Walker Signed-off-by: John Stultz --- kernel/time/clocksource.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6a5a310a1a53..7e4fad75acaa 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -600,9 +600,18 @@ static void __clocksource_select(bool skipcur) */ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ - pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", - cs->name); - override_name[0] = 0; + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n", + cs->name); + override_name[0] = 0; + } else { + /* + * The override cannot be currently verified. + * Deferring to let the watchdog check. + */ + pr_info("Override clocksource %s is not currently HRT compatible - deferring\n", + cs->name); + } } else /* Override clocksource can be used. */ best = cs; From 0bf43f15db857e83daf4134aa062c8b157a80ee0 Mon Sep 17 00:00:00 2001 From: Ruchi Kandoi Date: Thu, 11 Aug 2016 14:35:01 -0700 Subject: [PATCH 127/538] timekeeping: Prints the amounts of time spent during suspend In addition to keeping a histogram of suspend times, also print out the time spent in suspend to dmesg. This helps to keep track of suspend time while debugging using kernel logs. 
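As a rough illustration of the format the new pr_info() produces, the following stand-alone sketch prints the same seconds.milliseconds form; the ts64 struct and NSEC_PER_MSEC constant here are simplified stand-ins for the kernel's timespec64 and its NSEC_PER_MSEC, not the real definitions:

  #include <stdio.h>

  #define NSEC_PER_MSEC 1000000L                    /* stand-in for the kernel constant */

  struct ts64 { long long tv_sec; long tv_nsec; };  /* stand-in for timespec64 */

  int main(void)
  {
          struct ts64 t = { .tv_sec = 12, .tv_nsec = 345678901 };

          /* 345678901 ns / 1000000 = 345 ms, so this prints "Suspended for 12.345 seconds" */
          printf("Suspended for %lld.%03lu seconds\n",
                 t.tv_sec, (unsigned long)(t.tv_nsec / NSEC_PER_MSEC));
          return 0;
  }

Only the millisecond part of tv_nsec is shown; sub-millisecond residue is simply truncated.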
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: Ruchi Kandoi [jstultz: Tweaked commit message] Signed-off-by: John Stultz --- kernel/time/timekeeping_debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 107310a6f36f..ca9fb800336b 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -75,5 +75,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t) int bin = min(fls(t->tv_sec), NUM_BINS-1); sleep_time_bin[bin]++; + pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec, + t->tv_nsec / NSEC_PER_MSEC); } From 469e857f374640f6164913835ce30d0736b40a60 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 12 Aug 2016 20:14:09 +0200 Subject: [PATCH 128/538] time: Avoid undefined behaviour in timespec64_add_safe() I ran into this: ================================================================================ UBSAN: Undefined behaviour in kernel/time/time.c:783:2 signed integer overflow: 5273 + 9223372036854771711 cannot be represented in type 'long int' CPU: 0 PID: 17363 Comm: trinity-c0 Not tainted 4.8.0-rc1+ #88 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014 0000000000000000 ffff88011457f8f0 ffffffff82344f50 0000000041b58ab3 ffffffff84f98080 ffffffff82344ea4 ffff88011457f918 ffff88011457f8c8 ffff88011457f8e0 7fffffffffffefff ffff88011457f6d8 dffffc0000000000 Call Trace: [] dump_stack+0xac/0xfc [] ? _atomic_dec_and_lock+0xc4/0xc4 [] ubsan_epilogue+0xd/0x8a [] handle_overflow+0x202/0x23d [] ? val_to_string.constprop.6+0x11e/0x11e [] ? debug_smp_processor_id+0x17/0x20 [] ? __sigqueue_free.part.13+0x51/0x70 [] ? rcu_is_watching+0x110/0x110 [] __ubsan_handle_add_overflow+0xe/0x10 [] timespec64_add_safe+0x298/0x340 [] ? timespec_add_safe+0x330/0x330 [] ? wait_noreap_copyout+0x1d0/0x1d0 [] poll_select_set_timeout+0xf8/0x170 [] ? poll_schedule_timeout+0x2b0/0x2b0 [] ? __might_sleep+0x5b/0x260 [] __sys_recvmmsg+0x107/0x790 [] ? SyS_recvmsg+0x20/0x20 [] ? hrtimer_start_range_ns+0x3b8/0x1380 [] ? _raw_spin_unlock_irqrestore+0x3b/0x60 [] ? do_setitimer+0x39a/0x8e0 [] ? __might_sleep+0x5b/0x260 [] ? __sys_recvmmsg+0x790/0x790 [] SyS_recvmmsg+0xd9/0x160 [] ? __sys_recvmmsg+0x790/0x790 [] ? __this_cpu_preempt_check+0x13/0x20 [] ? __context_tracking_exit.part.3+0x30/0x1b0 [] ? __sys_recvmmsg+0x790/0x790 [] do_syscall_64+0x1b3/0x4b0 [] entry_SYSCALL64_slow_path+0x25/0x25 ================================================================================ Line 783 is this: 783 set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, 784 lhs.tv_nsec + rhs.tv_nsec); In other words, since lhs.tv_sec and rhs.tv_sec are both time64_t, this is a signed addition which will cause undefined behaviour on overflow. Note that this is not currently a huge concern since the kernel should be built with -fno-strict-overflow by default, but could be a problem in the future, a problem with older compilers, or other compilers than gcc. The easiest way to avoid the overflow is to cast one of the arguments to unsigned (so the addition will be done using unsigned arithmetic). 
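A stand-alone sketch of that idea follows; add_sec_saturating() is a hypothetical helper, not the kernel function, and it assumes nonnegative inputs, as for the tv_sec values in the report above:

  #include <stdint.h>
  #include <stdio.h>

  static int64_t add_sec_saturating(int64_t lhs, int64_t rhs)
  {
          /* Unsigned addition wraps instead of invoking undefined behaviour */
          uint64_t sum = (uint64_t)lhs + (uint64_t)rhs;

          /* Detect the wrap the same way timespec64_add_safe() checks its
           * result, and saturate instead of returning a wrapped value. */
          if (sum > (uint64_t)INT64_MAX || (int64_t)sum < lhs || (int64_t)sum < rhs)
                  return INT64_MAX;
          return (int64_t)sum;
  }

  int main(void)
  {
          /* The operands from the UBSAN report: 5273 + 9223372036854771711 */
          printf("%lld\n", (long long)add_sec_saturating(5273, 9223372036854771711LL));
          return 0;
  }

Casting just one operand is enough: the usual arithmetic conversions then make the whole addition unsigned.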
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: Vegard Nossum Signed-off-by: John Stultz --- include/linux/time64.h | 1 + kernel/time/time.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/time64.h b/include/linux/time64.h index 7e5d2fa9ac46..980c71b3001a 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -5,6 +5,7 @@ #include typedef __s64 time64_t; +typedef __u64 timeu64_t; /* * This wants to go into uapi/linux/time.h once we agreed about the diff --git a/kernel/time/time.c b/kernel/time/time.c index 667b9335f5d6..bd62fb8e8e77 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -780,7 +780,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, { struct timespec64 res; - set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, + set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { From 979515c5645830465739254abc1b1648ada41518 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 13 Aug 2016 01:37:04 +0200 Subject: [PATCH 129/538] time: Avoid undefined behaviour in ktime_add_safe() I ran into this: ================================================================================ UBSAN: Undefined behaviour in kernel/time/hrtimer.c:310:16 signed integer overflow: 9223372036854775807 + 50000 cannot be represented in type 'long long int' CPU: 2 PID: 4798 Comm: trinity-c2 Not tainted 4.8.0-rc1+ #91 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014 0000000000000000 ffff88010ce6fb88 ffffffff82344740 0000000041b58ab3 ffffffff84f97a20 ffffffff82344694 ffff88010ce6fbb0 ffff88010ce6fb60 000000000000c350 ffff88010ce6f968 dffffc0000000000 ffffffff857bc320 Call Trace: [] dump_stack+0xac/0xfc [] ? _atomic_dec_and_lock+0xc4/0xc4 [] ubsan_epilogue+0xd/0x8a [] handle_overflow+0x202/0x23d [] ? val_to_string.constprop.6+0x11e/0x11e [] ? timerqueue_add+0x151/0x410 [] ? hrtimer_start_range_ns+0x3b8/0x1380 [] ? memset+0x31/0x40 [] __ubsan_handle_add_overflow+0xe/0x10 [] hrtimer_nanosleep+0x5d9/0x790 [] ? hrtimer_init_sleeper+0x80/0x80 [] ? __might_sleep+0x5b/0x260 [] common_nsleep+0x20/0x30 [] SyS_clock_nanosleep+0x197/0x210 [] ? SyS_clock_getres+0x150/0x150 [] ? __this_cpu_preempt_check+0x13/0x20 [] ? __context_tracking_exit.part.3+0x30/0x1b0 [] ? SyS_clock_getres+0x150/0x150 [] do_syscall_64+0x1b3/0x4b0 [] entry_SYSCALL64_slow_path+0x25/0x25 ================================================================================ Add a new ktime_add_unsafe() helper which doesn't check for overflow, but doesn't throw a UBSAN warning when it does overflow either. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: Vegard Nossum Signed-off-by: John Stultz --- include/linux/ktime.h | 7 +++++++ kernel/time/hrtimer.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 2b6a204bd8d4..3ffc69ebe967 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -63,6 +63,13 @@ static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs) #define ktime_add(lhs, rhs) \ ({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; }) +/* + * Same as ktime_add(), but avoids undefined behaviour on overflow; however, + * this means that you must check the result for overflow yourself. 
+ */ +#define ktime_add_unsafe(lhs, rhs) \ + ({ (ktime_t){ .tv64 = (u64) (lhs).tv64 + (rhs).tv64 }; }) + /* * Add a ktime_t variable and a scalar nanosecond value. * res = kt + nsval: diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 252ea4741592..bb5ec425dfe0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -307,7 +307,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns); */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { - ktime_t res = ktime_add(lhs, rhs); + ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can From dd8d6ec672f9796528a31033084a4947817d6316 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 31 Aug 2016 16:57:12 +0300 Subject: [PATCH 130/538] x86/platform/intel-mid: Enable WiFi on Intel Edison Intel Edison board provides built-in WiFi dongle based on Broadcom BCM43340. Append the essential data to enable WiFi on Intel Edison. Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160831135713.79066-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- .../platform/intel-mid/device_libs/Makefile | 2 + .../intel-mid/device_libs/platform_bcm43xx.c | 95 +++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index fc135bf70511..a4711b5e94f3 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -1,5 +1,7 @@ # Family-Level Interface Shim (FLIS) obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o +# WiFi +obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o # IPC Devices obj-y += platform_ipc.o obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c new file mode 100644 index 000000000000..4392c15ed9e0 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c @@ -0,0 +1,95 @@ +/* + * platform_bcm43xx.c: bcm43xx platform data initilization file + * + * (C) Copyright 2016 Intel Corporation + * Author: Andy Shevchenko + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include +#include +#include +#include +#include + +#include + +#define WLAN_SFI_GPIO_IRQ_NAME "WLAN-interrupt" +#define WLAN_SFI_GPIO_ENABLE_NAME "WLAN-enable" + +#define WLAN_DEV_NAME "0000:00:01.3" + +static struct regulator_consumer_supply bcm43xx_vmmc_supply = { + .dev_name = WLAN_DEV_NAME, + .supply = "vmmc", +}; + +static struct regulator_init_data bcm43xx_vmmc_data = { + .constraints = { + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, + .num_consumer_supplies = 1, + .consumer_supplies = &bcm43xx_vmmc_supply, +}; + +static struct fixed_voltage_config bcm43xx_vmmc = { + .supply_name = "bcm43xx-vmmc-regulator", + /* + * Announce 2.0V here to be compatible with SDIO specification. The + * real voltage and signaling are still 1.8V. 
+ */ + .microvolts = 2000000, /* 1.8V */ + .gpio = -EINVAL, + .startup_delay = 250 * 1000, /* 250ms */ + .enable_high = 1, /* active high */ + .enabled_at_boot = 0, /* disabled at boot */ + .init_data = &bcm43xx_vmmc_data, +}; + +static struct platform_device bcm43xx_vmmc_regulator = { + .name = "reg-fixed-voltage", + .id = PLATFORM_DEVID_AUTO, + .dev = { + .platform_data = &bcm43xx_vmmc, + }, +}; + +static int __init bcm43xx_regulator_register(void) +{ + int ret; + + bcm43xx_vmmc.gpio = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME); + ret = platform_device_register(&bcm43xx_vmmc_regulator); + if (ret) { + pr_err("%s: vmmc regulator register failed\n", __func__); + return ret; + } + + return 0; +} + +static void __init *bcm43xx_platform_data(void *info) +{ + int ret; + + ret = bcm43xx_regulator_register(); + if (ret) + return NULL; + + pr_info("Using generic wifi platform data\n"); + + /* For now it's empty */ + return NULL; +} + +static const struct devs_id bcm43xx_clk_vmmc_dev_id __initconst = { + .name = "bcm43xx_clk_vmmc", + .type = SFI_DEV_TYPE_SD, + .get_platform_data = &bcm43xx_platform_data, +}; + +sfi_device(bcm43xx_clk_vmmc_dev_id); From 3976b0380b315651137ce4321b1171ac0a1d26ed Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 31 Aug 2016 16:57:13 +0300 Subject: [PATCH 131/538] x86/platform/intel-mid: Enable SD card detection on Merrifield Intel Merrifield platform provides SD card interface. The interface allows user to plug SD card to extend storage capacity. Append the essential data to enable SD card detection on it. Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160831135713.79066-2-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- .../platform/intel-mid/device_libs/Makefile | 2 + .../intel-mid/device_libs/platform_mrfld_sd.c | 47 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index a4711b5e94f3..429d08be7848 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -1,5 +1,7 @@ # Family-Level Interface Shim (FLIS) obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o +# SDHCI Devices +obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o # WiFi obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o # IPC Devices diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c new file mode 100644 index 000000000000..00c4a034ad93 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c @@ -0,0 +1,47 @@ +/* + * SDHCI platform data initilisation file + * + * (C) Copyright 2016 Intel Corporation + * Author: Andy Shevchenko + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#include +#include + +#include + +#include + +#define INTEL_MRFLD_SD 2 +#define INTEL_MRFLD_SD_CD_GPIO 77 + +static struct sdhci_pci_data mrfld_sdhci_pci_data = { + .rst_n_gpio = -EINVAL, + .cd_gpio = INTEL_MRFLD_SD_CD_GPIO, +}; + +static struct sdhci_pci_data * +mrfld_sdhci_pci_get_data(struct pci_dev *pdev, int slotno) +{ + unsigned int func = PCI_FUNC(pdev->devfn); + + if (func == INTEL_MRFLD_SD) + return &mrfld_sdhci_pci_data; + + return NULL; +} + +static int __init mrfld_sd_init(void) +{ + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return -ENODEV; + + sdhci_pci_get_data = mrfld_sdhci_pci_get_data; + return 0; +} +arch_initcall(mrfld_sd_init); From 2349f205df22c51fd717cd52b56df0e63c6c3ff2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 1 Sep 2016 11:44:54 +0100 Subject: [PATCH 132/538] irqchip/keystone: Fix typo "sporious" -> "spurious" Trivial fix to typo in dev_warn message. Signed-off-by: Colin Ian King Link: https://lkml.kernel.org/r/20160901104454.26092-1-colin.king@canonical.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-keystone.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-keystone.c b/drivers/irqchip/irq-keystone.c index deb89d63a728..54a5e870a8f5 100644 --- a/drivers/irqchip/irq-keystone.c +++ b/drivers/irqchip/irq-keystone.c @@ -109,7 +109,7 @@ static void keystone_irq_handler(struct irq_desc *desc) dev_dbg(kirq->dev, "dispatch bit %d, virq %d\n", src, virq); if (!virq) - dev_warn(kirq->dev, "sporious irq detected hwirq %d, virq %d\n", + dev_warn(kirq->dev, "spurious irq detected hwirq %d, virq %d\n", src, virq); generic_handle_irq(virq); } From fc590c22f9f056ab50190b797f6cacead29f9b75 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Sep 2016 14:45:19 +0200 Subject: [PATCH 133/538] genirq: Robustify handle_percpu_devid_irq() The percpu_devid handler is not robust against spurious interrupts. If a spurious interrupt happens and no action is installed then the handler crashes with a NULL pointer dereference. Add a sanity check for this and log the wreckage once in dmesg. Reported-by: Majun Signed-off-by: Thomas Gleixner Cc: Mark Rutland Cc: Marc Zyngier Cc: guohanjun@huawei.com Cc: dingtianhong@huawei.com Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1609021436160.5647@nanos --- kernel/irq/chip.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b4c1bc7c9ca2..93c373a8b12b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -756,7 +756,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; - void *dev_id = raw_cpu_ptr(action->percpu_dev_id); unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; @@ -765,9 +764,20 @@ void handle_percpu_devid_irq(struct irq_desc *desc) if (chip->irq_ack) chip->irq_ack(&desc->irq_data); - trace_irq_handler_entry(irq, action); - res = action->handler(irq, dev_id); - trace_irq_handler_exit(irq, action, res); + if (likely(action)) { + trace_irq_handler_entry(irq, action); + res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); + trace_irq_handler_exit(irq, action, res); + } else { + unsigned int cpu = smp_processor_id(); + bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); + + if (enabled) + irq_percpu_disable(desc, cpu); + + pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n", + enabled ? 
" and unmasked" : "", irq, cpu); + } if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); From 48e0fba842c7daab80f3351398146368c5504a27 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Sep 2016 17:30:35 +0200 Subject: [PATCH 134/538] genirq: Remove export of irq_map_generic_chip() No module users. Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index abd286afbd27..5fbb94b077b3 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -409,7 +409,6 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); return 0; } -EXPORT_SYMBOL_GPL(irq_map_generic_chip); struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, From f0c450eaa364cb77c778f2a46ee2aa3ff464b332 Mon Sep 17 00:00:00 2001 From: Sebastian Frias Date: Mon, 1 Aug 2016 16:27:53 +0200 Subject: [PATCH 135/538] genirq/generic_chip: Get rid of code duplication irq_map_generic_chip() contains about the same code as irq_get_domain_generic_chip() except for the return values. Split out the irq_get_domain_generic_chip() implementation so it can be reused. [ tglx: Removed the extra churn in irq_get_domain_generic_chip() callers and massaged changelog ] Signed-off-by: Sebastian Frias Cc: Marc Zyngier Cc: Mason Cc: Jason Cooper Link: http://lkml.kernel.org/r/579F5C69.8070006@laposte.net Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 5fbb94b077b3..11ad73b39d2e 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -328,6 +328,20 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, } EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); +static struct irq_chip_generic * +__irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) +{ + struct irq_domain_chip_generic *dgc = d->gc; + int idx; + + if (!dgc) + return ERR_PTR(-ENODEV); + idx = hw_irq / dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return ERR_PTR(-EINVAL); + return dgc->gc[idx]; +} + /** * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq * @d: irq domain pointer @@ -336,15 +350,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); struct irq_chip_generic * irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) { - struct irq_domain_chip_generic *dgc = d->gc; - int idx; + struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq); - if (!dgc) - return NULL; - idx = hw_irq / dgc->irqs_per_chip; - if (idx >= dgc->num_chips) - return NULL; - return dgc->gc[idx]; + return !IS_ERR(gc) ? 
gc : NULL; } EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); @@ -368,13 +376,9 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, unsigned long flags; int idx; - if (!d->gc) - return -ENODEV; - - idx = hw_irq / dgc->irqs_per_chip; - if (idx >= dgc->num_chips) - return -EINVAL; - gc = dgc->gc[idx]; + gc = __irq_get_domain_generic_chip(d, hw_irq); + if (IS_ERR(gc)) + return PTR_ERR(gc); idx = hw_irq % dgc->irqs_per_chip; From ee26c013cdee0b947e29d6cadfb9ff3341c69ff9 Mon Sep 17 00:00:00 2001 From: Sebastian Frias Date: Mon, 1 Aug 2016 16:27:38 +0200 Subject: [PATCH 136/538] genirq/generic_chip: Add irq_unmap callback Without this patch irq_domain_disassociate() cannot properly release the interrupt. In fact, irq_map_generic_chip() checks a bit on 'gc->installed' but said bit is never cleared, only set. Commit 088f40b7b027 ("genirq: Generic chip: Add linear irq domain support") added irq_map_generic_chip() function and also stated "This lacks a removal function for now". This commit provides an implementation of an unmap function that can be called by irq_domain_disassociate(). [ tglx: Made the function static and removed the export as we have neither a prototype nor a modular user. ] Fixes: 088f40b7b027 ("genirq: Generic chip: Add linear irq domain support") Signed-off-by: Sebastian Frias Cc: Marc Zyngier Cc: Mason Cc: Jason Cooper Link: http://lkml.kernel.org/r/579F5C5A.2070507@laposte.net Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 11ad73b39d2e..a3a392097804 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -414,8 +414,29 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, return 0; } +static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) +{ + struct irq_data *data = irq_domain_get_irq_data(d, virq); + struct irq_domain_chip_generic *dgc = d->gc; + unsigned int hw_irq = data->hwirq; + struct irq_chip_generic *gc; + int irq_idx; + + gc = irq_get_domain_generic_chip(d, hw_irq); + if (!gc) + return; + + irq_idx = hw_irq % dgc->irqs_per_chip; + + clear_bit(irq_idx, &gc->installed); + irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL, + NULL); + +} + struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, + .unmap = irq_unmap_generic_chip, .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_generic_chip_ops); From 0c228919e04ddec195402296e7ebf2472ed6caef Mon Sep 17 00:00:00 2001 From: Sebastian Frias Date: Tue, 2 Aug 2016 10:52:45 +0200 Subject: [PATCH 137/538] irqdomain: Mask irq type in irq_domain_xlate_onetwocell() According to the xlate() callback definition, the 'out_type' parameter needs to be the "linux irq type". A mask for such bits exists, IRQ_TYPE_SENSE_MASK, which is correctly applied in irq_domain_xlate_twocell() So use it for irq_domain_xlate_onetwocell() as well. 
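A small stand-alone illustration of what the mask does to the second cell of a specifier; the constants mirror the IRQ_TYPE_* values in include/linux/irq.h, but this is only an illustrative sketch, not kernel code:

  #include <stdio.h>

  #define IRQ_TYPE_LEVEL_HIGH   0x00000004
  #define IRQ_TYPE_SENSE_MASK   0x0000000f

  int main(void)
  {
          /* Second cell of a DT interrupt specifier carrying stray bits
           * above the sense field */
          unsigned int cell = 0x100 | IRQ_TYPE_LEVEL_HIGH;

          /* What the patched callback now reports as the linux irq type */
          printf("out_type = 0x%x\n", cell & IRQ_TYPE_SENSE_MASK);    /* prints 0x4 */
          return 0;
  }

Without the mask, the stray 0x100 bit would be passed through as part of the irq type, which is exactly what irq_domain_xlate_twocell() already prevents.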
Signed-off-by: Sebastian Frias Cc: Grant Likely Cc: Marc Zyngier Cc: Mason Cc: Jason Cooper Link: http://lkml.kernel.org/r/57A05F5D.103@laposte.net Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4752b43662e0..f10cffe8aefb 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -868,7 +868,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; - *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; + if (intsize > 1) + *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; + else + *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); From a724632ca0c84b494875e9367e07e29472c139ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Aug 2016 19:49:38 +0200 Subject: [PATCH 138/538] cpu/hotplug: Rework callback invocation logic This is preparation for the following patch. This rework here changes the arguments of cpuhp_invoke_callback(). It passes now `state' and whether `startup' or `teardown' callback should be invoked. The callback then is looked up by the function. The following is a clanup of callers: - cpuhp_issue_call() has one argument less - struct cpuhp_cpu_state (which is used by the hotplug thread) gets also its callback removed. The decision if it is a single callback invocation moved to the `single' variable. Also a `bringup' variable has been added to distinguish between startup and teardown callback. - take_cpu_down() needs to start one step earlier. We always get here via CPUHP_TEARDOWN_CPU callback. Before that change cpuhp_ap_states + CPUHP_TEARDOWN_CPU pointed to an empty entry because TEARDOWN is saved in bp_states for this reason. Now that we use cpuhp_get_step() to lookup the state we must explicitly skip it in order not to invoke it twice. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Cc: Mark Rutland Cc: Peter Zijlstra Cc: Will Deacon Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/1471024183-12666-2-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 162 +++++++++++++++++++++++++-------------------------- 1 file changed, 80 insertions(+), 82 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index ec12b726fa6f..d36d8e0abfb8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -37,8 +37,9 @@ * @thread: Pointer to the hotplug thread * @should_run: Thread should execute * @rollback: Perform a rollback - * @cb_stat: The state for a single callback (install/uninstall) - * @cb: Single callback function (install/uninstall) + * @single: Single callback invocation + * @bringup: Single callback bringup or teardown selector + * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation * @done: Signal completion to the issuer of the task */ @@ -49,8 +50,9 @@ struct cpuhp_cpu_state { struct task_struct *thread; bool should_run; bool rollback; + bool single; + bool bringup; enum cpuhp_state cb_state; - int (*cb)(unsigned int cpu); int result; struct completion done; #endif @@ -79,24 +81,43 @@ static DEFINE_MUTEX(cpuhp_state_mutex); static struct cpuhp_step cpuhp_bp_states[]; static struct cpuhp_step cpuhp_ap_states[]; +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitly in cpu_down. 
+ */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; +} + +static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) +{ + struct cpuhp_step *sp; + + sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; + return sp + state; +} + /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked * @step: The step in the state machine - * @cb: The callback function to invoke + * @bringup: True if the bringup callback should be invoked * * Called from cpu hotplug and from the state register machinery */ -static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, - int (*cb)(unsigned int)) +static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, + bool bringup) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + struct cpuhp_step *step = cpuhp_get_step(state); + int (*cb)(unsigned int cpu) = bringup ? step->startup : step->teardown; int ret = 0; if (cb) { - trace_cpuhp_enter(cpu, st->target, step, cb); + trace_cpuhp_enter(cpu, st->target, state, cb); ret = cb(cpu); - trace_cpuhp_exit(cpu, st->state, step, ret); + trace_cpuhp_exit(cpu, st->state, state, ret); } return ret; } @@ -371,62 +392,55 @@ static int bringup_cpu(unsigned int cpu) /* * Hotplug state machine related functions */ -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps) +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = steps + st->state; + struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->startup); + cpuhp_invoke_callback(cpu, st->state, true); } } static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps, enum cpuhp_state target) + enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; for (; st->state > target; st->state--) { - struct cpuhp_step *step = steps + st->state; - - ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + ret = cpuhp_invoke_callback(cpu, st->state, false); if (ret) { st->target = prev_state; - undo_cpu_down(cpu, st, steps); + undo_cpu_down(cpu, st); break; } } return ret; } -static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps) +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state--; st->state > st->target; st->state--) { - struct cpuhp_step *step = steps + st->state; + struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->teardown); + cpuhp_invoke_callback(cpu, st->state, false); } } static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps, enum cpuhp_state target) + enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; while (st->state < target) { - struct cpuhp_step *step; - st->state++; - step = steps + st->state; - ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + ret = cpuhp_invoke_callback(cpu, st->state, true); if (ret) { st->target = prev_state; - undo_cpu_up(cpu, st, steps); + undo_cpu_up(cpu, st); break; } } @@ -455,13 +469,13 @@ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) { enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); - return cpuhp_down_callbacks(cpu, st, 
cpuhp_ap_states, target); + return cpuhp_down_callbacks(cpu, st, target); } /* Execute the online startup callbacks. Used to be CPU_ONLINE */ static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) { - return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); + return cpuhp_up_callbacks(cpu, st, st->target); } /* @@ -484,18 +498,20 @@ static void cpuhp_thread_fun(unsigned int cpu) st->should_run = false; /* Single callback invocation for [un]install ? */ - if (st->cb) { + if (st->single) { if (st->cb_state < CPUHP_AP_ONLINE) { local_irq_disable(); - ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + ret = cpuhp_invoke_callback(cpu, st->cb_state, + st->bringup); local_irq_enable(); } else { - ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + ret = cpuhp_invoke_callback(cpu, st->cb_state, + st->bringup); } } else if (st->rollback) { BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - undo_cpu_down(cpu, st, cpuhp_ap_states); + undo_cpu_down(cpu, st); /* * This is a momentary workaround to keep the notifier users * happy. Will go away once we got rid of the notifiers. @@ -517,8 +533,8 @@ static void cpuhp_thread_fun(unsigned int cpu) } /* Invoke a single callback on a remote cpu */ -static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, - int (*cb)(unsigned int)) +static int +cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -530,10 +546,12 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, * we invoke the thread function directly. */ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, cb); + return cpuhp_invoke_callback(cpu, state, bringup); st->cb_state = state; - st->cb = cb; + st->single = true; + st->bringup = bringup; + /* * Make sure the above stores are visible before should_run becomes * true. Paired with the mb() above in cpuhp_thread_fun() @@ -549,7 +567,7 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) { st->result = 0; - st->cb = NULL; + st->single = false; /* * Make sure the above stores are visible before should_run becomes * true. Paired with the mb() above in cpuhp_thread_fun() @@ -700,12 +718,16 @@ static int take_cpu_down(void *_param) if (err < 0) return err; + /* + * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not + * do this step again. + */ + WARN_ON(st->state != CPUHP_TEARDOWN_CPU); + st->state--; /* Invoke the former CPU_DYING callbacks */ - for (; st->state > target; st->state--) { - struct cpuhp_step *step = cpuhp_ap_states + st->state; + for (; st->state > target; st->state--) + cpuhp_invoke_callback(cpu, st->state, false); - cpuhp_invoke_callback(cpu, st->state, step->teardown); - } /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ @@ -844,7 +866,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need * to do the further cleanups. 
*/ - ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); + ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { st->target = prev_state; st->rollback = true; @@ -898,11 +920,8 @@ void notify_cpu_starting(unsigned int cpu) enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); while (st->state < target) { - struct cpuhp_step *step; - st->state++; - step = cpuhp_ap_states + st->state; - cpuhp_invoke_callback(cpu, st->state, step->startup); + cpuhp_invoke_callback(cpu, st->state, true); } } @@ -987,7 +1006,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) * responsible for bringing it up to the target state. */ target = min((int)target, CPUHP_BRINGUP_CPU); - ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); + ret = cpuhp_up_callbacks(cpu, st, target); out: cpu_hotplug_done(); return ret; @@ -1364,23 +1383,6 @@ static int cpuhp_cb_check(enum cpuhp_state state) return 0; } -static bool cpuhp_is_ap_state(enum cpuhp_state state) -{ - /* - * The extra check for CPUHP_TEARDOWN_CPU is only for documentation - * purposes as that state is handled explicitely in cpu_down. - */ - return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; -} - -static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) -{ - struct cpuhp_step *sp; - - sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; - return sp + state; -} - static void cpuhp_store_callbacks(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), @@ -1406,12 +1408,12 @@ static void *cpuhp_get_teardown_cb(enum cpuhp_state state) * Call the startup/teardown function for a step either on the AP or * on the current CPU. */ -static int cpuhp_issue_call(int cpu, enum cpuhp_state state, - int (*cb)(unsigned int), bool bringup) +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup) { + struct cpuhp_step *sp = cpuhp_get_step(state); int ret; - if (!cb) + if ((bringup && !sp->startup) || (!bringup && !sp->teardown)) return 0; /* * The non AP bound callbacks can fail on bringup. On teardown @@ -1419,11 +1421,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, */ #ifdef CONFIG_SMP if (cpuhp_is_ap_state(state)) - ret = cpuhp_invoke_ap_callback(cpu, state, cb); + ret = cpuhp_invoke_ap_callback(cpu, state, bringup); else - ret = cpuhp_invoke_callback(cpu, state, cb); + ret = cpuhp_invoke_callback(cpu, state, bringup); #else - ret = cpuhp_invoke_callback(cpu, state, cb); + ret = cpuhp_invoke_callback(cpu, state, bringup); #endif BUG_ON(ret && !bringup); return ret; @@ -1434,14 +1436,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, * * Note: The teardown callbacks for rollback are not allowed to fail! */ -static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, - int (*teardown)(unsigned int cpu)) +static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state) { int cpu; - if (!teardown) - return; - /* Roll back the already executed steps on the other cpus */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -1452,7 +1450,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, /* Did we invoke the startup call on that cpu ? 
*/ if (cpustate >= state) - cpuhp_issue_call(cpu, state, teardown, false); + cpuhp_issue_call(cpu, state, false); } } @@ -1527,9 +1525,10 @@ int __cpuhp_setup_state(enum cpuhp_state state, if (cpustate < state) continue; - ret = cpuhp_issue_call(cpu, state, startup, true); + ret = cpuhp_issue_call(cpu, state, true); if (ret) { - cpuhp_rollback_install(cpu, state, teardown); + if (teardown) + cpuhp_rollback_install(cpu, state); cpuhp_store_callbacks(state, NULL, NULL, NULL); goto out; } @@ -1553,14 +1552,13 @@ EXPORT_SYMBOL(__cpuhp_setup_state); */ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) { - int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); int cpu; BUG_ON(cpuhp_cb_check(state)); get_online_cpus(); - if (!invoke || !teardown) + if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* @@ -1573,7 +1571,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) int cpustate = st->state; if (cpustate >= state) - cpuhp_issue_call(cpu, state, teardown, false); + cpuhp_issue_call(cpu, state, false); } remove: cpuhp_store_callbacks(state, NULL, NULL, NULL); From cf392d10b69e6e6c57ceea48b347a2ab1a4b75b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Aug 2016 19:49:39 +0200 Subject: [PATCH 139/538] cpu/hotplug: Add multi instance support This patch adds the ability for a given state to have multiple instances. Until now all states have a single instance and the startup / teardown callback use global variables. A few drivers need to perform a the same callbacks on multiple "instances". Currently we have three drivers in tree which all have a global list which they iterate over. With multi instance they support don't need their private list and the functionality has been moved into core code. Plus we hold the hotplug lock in core so no cpus comes/goes while instances are registered and we do rollback in error case :) Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Cc: Mark Rutland Cc: Peter Zijlstra Cc: Will Deacon Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/1471024183-12666-3-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 110 +++++++++++++++++- include/trace/events/cpuhp.h | 28 +++++ kernel/cpu.c | 218 +++++++++++++++++++++++++++++------ 3 files changed, 318 insertions(+), 38 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 242bf530edfc..dcfe619171b4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -99,7 +99,7 @@ enum cpuhp_state { int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu)); + int (*teardown)(unsigned int cpu), bool multi_instance); /** * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks @@ -116,7 +116,7 @@ static inline int cpuhp_setup_state(enum cpuhp_state state, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu)) { - return __cpuhp_setup_state(state, name, true, startup, teardown); + return __cpuhp_setup_state(state, name, true, startup, teardown, false); } /** @@ -135,7 +135,66 @@ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu)) { - return __cpuhp_setup_state(state, name, false, startup, teardown); + return __cpuhp_setup_state(state, name, false, startup, teardown, + false); +} + +/** + * cpuhp_setup_state_multi - Add callbacks for multi state + * 
@state: The state for which the calls are installed + * @name: Name of the callback. + * @startup: startup callback function + * @teardown: teardown callback function + * + * Sets the internal multi_instance flag and prepares a state to work as a multi + * instance callback. No callbacks are invoked at this point. The callbacks are + * invoked once an instance for this state are registered via + * @cpuhp_state_add_instance or @cpuhp_state_add_instance_nocalls. + */ +static inline int cpuhp_setup_state_multi(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu, + struct hlist_node *node), + int (*teardown)(unsigned int cpu, + struct hlist_node *node)) +{ + return __cpuhp_setup_state(state, name, false, + (void *) startup, + (void *) teardown, true); +} + +int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, + bool invoke); + +/** + * cpuhp_state_add_instance - Add an instance for a state and invoke startup + * callback. + * @state: The state for which the instance is installed + * @node: The node for this individual state. + * + * Installs the instance for the @state and invokes the startup callback on + * the present cpus which have already reached the @state. The @state must have + * been earlier marked as multi-instance by @cpuhp_setup_state_multi. + */ +static inline int cpuhp_state_add_instance(enum cpuhp_state state, + struct hlist_node *node) +{ + return __cpuhp_state_add_instance(state, node, true); +} + +/** + * cpuhp_state_add_instance_nocalls - Add an instance for a state without + * invoking the startup callback. + * @state: The state for which the instance is installed + * @node: The node for this individual state. + * + * Installs the instance for the @state The @state must have been earlier + * marked as multi-instance by @cpuhp_setup_state_multi. + */ +static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state, + struct hlist_node *node) +{ + return __cpuhp_state_add_instance(state, node, false); } void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); @@ -162,6 +221,51 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) __cpuhp_remove_state(state, false); } +/** + * cpuhp_remove_multi_state - Remove hotplug multi state callback + * @state: The state for which the calls are removed + * + * Removes the callback functions from a multi state. This is the reverse of + * cpuhp_setup_state_multi(). All instances should have been removed before + * invoking this function. + */ +static inline void cpuhp_remove_multi_state(enum cpuhp_state state) +{ + __cpuhp_remove_state(state, false); +} + +int __cpuhp_state_remove_instance(enum cpuhp_state state, + struct hlist_node *node, bool invoke); + +/** + * cpuhp_state_remove_instance - Remove hotplug instance from state and invoke + * the teardown callback + * @state: The state from which the instance is removed + * @node: The node for this individual state. + * + * Removes the instance and invokes the teardown callback on the present cpus + * which have already reached the @state. + */ +static inline int cpuhp_state_remove_instance(enum cpuhp_state state, + struct hlist_node *node) +{ + return __cpuhp_state_remove_instance(state, node, true); +} + +/** + * cpuhp_state_remove_instance_nocalls - Remove hotplug instance from state + * without invoking the reatdown callback + * @state: The state from which the instance is removed + * @node: The node for this individual state. + * + * Removes the instance without invoking the teardown callback. 
+ */ +static inline int cpuhp_state_remove_instance_nocalls(enum cpuhp_state state, + struct hlist_node *node) +{ + return __cpuhp_state_remove_instance(state, node, false); +} + #ifdef CONFIG_SMP void cpuhp_online_idle(enum cpuhp_state state); #else diff --git a/include/trace/events/cpuhp.h b/include/trace/events/cpuhp.h index a72bd93ec7e5..996953db91d7 100644 --- a/include/trace/events/cpuhp.h +++ b/include/trace/events/cpuhp.h @@ -33,6 +33,34 @@ TRACE_EVENT(cpuhp_enter, __entry->cpu, __entry->target, __entry->idx, __entry->fun) ); +TRACE_EVENT(cpuhp_multi_enter, + + TP_PROTO(unsigned int cpu, + int target, + int idx, + int (*fun)(unsigned int, struct hlist_node *), + struct hlist_node *node), + + TP_ARGS(cpu, target, idx, fun, node), + + TP_STRUCT__entry( + __field( unsigned int, cpu ) + __field( int, target ) + __field( int, idx ) + __field( void *, fun ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->target = target; + __entry->idx = idx; + __entry->fun = fun; + ), + + TP_printk("cpu: %04u target: %3d step: %3d (%pf)", + __entry->cpu, __entry->target, __entry->idx, __entry->fun) +); + TRACE_EVENT(cpuhp_exit, TP_PROTO(unsigned int cpu, diff --git a/kernel/cpu.c b/kernel/cpu.c index d36d8e0abfb8..c506485eaa75 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -52,6 +52,7 @@ struct cpuhp_cpu_state { bool rollback; bool single; bool bringup; + struct hlist_node *node; enum cpuhp_state cb_state; int result; struct completion done; @@ -70,11 +71,21 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); * @cant_stop: Bringup/teardown can't be stopped at this step */ struct cpuhp_step { - const char *name; - int (*startup)(unsigned int cpu); - int (*teardown)(unsigned int cpu); - bool skip_onerr; - bool cant_stop; + const char *name; + union { + int (*startup)(unsigned int cpu); + int (*startup_multi)(unsigned int cpu, + struct hlist_node *node); + }; + union { + int (*teardown)(unsigned int cpu); + int (*teardown_multi)(unsigned int cpu, + struct hlist_node *node); + }; + struct hlist_head list; + bool skip_onerr; + bool cant_stop; + bool multi_instance; }; static DEFINE_MUTEX(cpuhp_state_mutex); @@ -104,20 +115,59 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) * @step: The step in the state machine * @bringup: True if the bringup callback should be invoked * - * Called from cpu hotplug and from the state register machinery + * Called from cpu hotplug and from the state register machinery. */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, - bool bringup) + bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct cpuhp_step *step = cpuhp_get_step(state); - int (*cb)(unsigned int cpu) = bringup ? step->startup : step->teardown; - int ret = 0; - - if (cb) { + int (*cbm)(unsigned int cpu, struct hlist_node *node); + int (*cb)(unsigned int cpu); + int ret, cnt; + + if (!step->multi_instance) { + cb = bringup ? step->startup : step->teardown; + if (!cb) + return 0; trace_cpuhp_enter(cpu, st->target, state, cb); ret = cb(cpu); trace_cpuhp_exit(cpu, st->state, state, ret); + return ret; + } + cbm = bringup ? step->startup_multi : step->teardown_multi; + if (!cbm) + return 0; + + /* Single invocation for instance add/remove */ + if (node) { + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + return ret; + } + + /* State transition. 
Invoke on all instances */ + cnt = 0; + hlist_for_each(node, &step->list) { + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + if (ret) + goto err; + cnt++; + } + return 0; +err: + /* Rollback the instances if one failed */ + cbm = !bringup ? step->startup_multi : step->teardown_multi; + if (!cbm) + return ret; + + hlist_for_each(node, &step->list) { + if (!cnt--) + break; + cbm(cpu, node); } return ret; } @@ -398,7 +448,7 @@ static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true); + cpuhp_invoke_callback(cpu, st->state, true, NULL); } } @@ -409,7 +459,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, int ret = 0; for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false); + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); if (ret) { st->target = prev_state; undo_cpu_down(cpu, st); @@ -425,7 +475,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, false); + cpuhp_invoke_callback(cpu, st->state, false, NULL); } } @@ -437,7 +487,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, while (st->state < target) { st->state++; - ret = cpuhp_invoke_callback(cpu, st->state, true); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); if (ret) { st->target = prev_state; undo_cpu_up(cpu, st); @@ -502,11 +552,11 @@ static void cpuhp_thread_fun(unsigned int cpu) if (st->cb_state < CPUHP_AP_ONLINE) { local_irq_disable(); ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup); + st->bringup, st->node); local_irq_enable(); } else { ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup); + st->bringup, st->node); } } else if (st->rollback) { BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); @@ -534,7 +584,8 @@ static void cpuhp_thread_fun(unsigned int cpu) /* Invoke a single callback on a remote cpu */ static int -cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup) +cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, + struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -546,11 +597,12 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup) * we invoke the thread function directly. 
*/ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, bringup); + return cpuhp_invoke_callback(cpu, state, bringup, node); st->cb_state = state; st->single = true; st->bringup = bringup; + st->node = node; /* * Make sure the above stores are visible before should_run becomes @@ -726,7 +778,7 @@ static int take_cpu_down(void *_param) st->state--; /* Invoke the former CPU_DYING callbacks */ for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false); + cpuhp_invoke_callback(cpu, st->state, false, NULL); /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -921,7 +973,7 @@ void notify_cpu_starting(unsigned int cpu) while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true); + cpuhp_invoke_callback(cpu, st->state, true, NULL); } } @@ -1386,7 +1438,8 @@ static int cpuhp_cb_check(enum cpuhp_state state) static void cpuhp_store_callbacks(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu)) + int (*teardown)(unsigned int cpu), + bool multi_instance) { /* (Un)Install the callbacks for further cpu hotplug operations */ struct cpuhp_step *sp; @@ -1396,6 +1449,8 @@ static void cpuhp_store_callbacks(enum cpuhp_state state, sp->startup = startup; sp->teardown = teardown; sp->name = name; + sp->multi_instance = multi_instance; + INIT_HLIST_HEAD(&sp->list); mutex_unlock(&cpuhp_state_mutex); } @@ -1408,7 +1463,8 @@ static void *cpuhp_get_teardown_cb(enum cpuhp_state state) * Call the startup/teardown function for a step either on the AP or * on the current CPU. */ -static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup) +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, + struct hlist_node *node) { struct cpuhp_step *sp = cpuhp_get_step(state); int ret; @@ -1421,11 +1477,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup) */ #ifdef CONFIG_SMP if (cpuhp_is_ap_state(state)) - ret = cpuhp_invoke_ap_callback(cpu, state, bringup); + ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else - ret = cpuhp_invoke_callback(cpu, state, bringup); + ret = cpuhp_invoke_callback(cpu, state, bringup, node); #else - ret = cpuhp_invoke_callback(cpu, state, bringup); + ret = cpuhp_invoke_callback(cpu, state, bringup, node); #endif BUG_ON(ret && !bringup); return ret; @@ -1436,7 +1492,8 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup) * * Note: The teardown callbacks for rollback are not allowed to fail! */ -static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state) +static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, + struct hlist_node *node) { int cpu; @@ -1450,7 +1507,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state) /* Did we invoke the startup call on that cpu ? */ if (cpustate >= state) - cpuhp_issue_call(cpu, state, false); + cpuhp_issue_call(cpu, state, false, node); } } @@ -1477,6 +1534,52 @@ static int cpuhp_reserve_state(enum cpuhp_state state) return -ENOSPC; } +int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, + bool invoke) +{ + struct cpuhp_step *sp; + int cpu; + int ret; + + sp = cpuhp_get_step(state); + if (sp->multi_instance == false) + return -EINVAL; + + get_online_cpus(); + + if (!invoke || !sp->startup_multi) + goto add_node; + + /* + * Try to call the startup callback for each present cpu + * depending on the hotplug state of the cpu. 
+ */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate < state) + continue; + + ret = cpuhp_issue_call(cpu, state, true, node); + if (ret) { + if (sp->teardown_multi) + cpuhp_rollback_install(cpu, state, node); + goto err; + } + } +add_node: + ret = 0; + mutex_lock(&cpuhp_state_mutex); + hlist_add_head(node, &sp->list); + mutex_unlock(&cpuhp_state_mutex); + +err: + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); + /** * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state * @state: The state to setup @@ -1490,7 +1593,8 @@ static int cpuhp_reserve_state(enum cpuhp_state state) int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu)) + int (*teardown)(unsigned int cpu), + bool multi_instance) { int cpu, ret = 0; int dyn_state = 0; @@ -1509,7 +1613,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, state = ret; } - cpuhp_store_callbacks(state, name, startup, teardown); + cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); if (!invoke || !startup) goto out; @@ -1525,11 +1629,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, if (cpustate < state) continue; - ret = cpuhp_issue_call(cpu, state, true); + ret = cpuhp_issue_call(cpu, state, true, NULL); if (ret) { if (teardown) - cpuhp_rollback_install(cpu, state); - cpuhp_store_callbacks(state, NULL, NULL, NULL); + cpuhp_rollback_install(cpu, state, NULL); + cpuhp_store_callbacks(state, NULL, NULL, NULL, false); goto out; } } @@ -1541,6 +1645,42 @@ int __cpuhp_setup_state(enum cpuhp_state state, } EXPORT_SYMBOL(__cpuhp_setup_state); +int __cpuhp_state_remove_instance(enum cpuhp_state state, + struct hlist_node *node, bool invoke) +{ + struct cpuhp_step *sp = cpuhp_get_step(state); + int cpu; + + BUG_ON(cpuhp_cb_check(state)); + + if (!sp->multi_instance) + return -EINVAL; + + get_online_cpus(); + if (!invoke || !cpuhp_get_teardown_cb(state)) + goto remove; + /* + * Call the teardown callback for each present cpu depending + * on the hotplug state of the cpu. This function is not + * allowed to fail currently! 
+ */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate >= state) + cpuhp_issue_call(cpu, state, false, node); + } + +remove: + mutex_lock(&cpuhp_state_mutex); + hlist_del(node); + mutex_unlock(&cpuhp_state_mutex); + put_online_cpus(); + + return 0; +} +EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state * @state: The state to remove @@ -1552,12 +1692,20 @@ EXPORT_SYMBOL(__cpuhp_setup_state); */ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) { + struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); get_online_cpus(); + if (sp->multi_instance) { + WARN(!hlist_empty(&sp->list), + "Error: Removing state %d which has instances left.\n", + state); + goto remove; + } + if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; @@ -1571,10 +1719,10 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) int cpustate = st->state; if (cpustate >= state) - cpuhp_issue_call(cpu, state, false); + cpuhp_issue_call(cpu, state, false, NULL); } remove: - cpuhp_store_callbacks(state, NULL, NULL, NULL); + cpuhp_store_callbacks(state, NULL, NULL, NULL, false); put_online_cpus(); } EXPORT_SYMBOL(__cpuhp_remove_state); From 6e103c0cfeb9ab8d40822a015da9769595096411 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 17 Aug 2016 19:14:20 +0200 Subject: [PATCH 140/538] arm/perf: Use multi instance instead of custom list Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Mark Rutland Cc: Will Deacon Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160817171420.sdwk2qivxunzryz4@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/perf/arm_pmu.c | 44 +++++++++++++++--------------------- include/linux/perf/arm_pmu.h | 2 +- 2 files changed, 19 insertions(+), 27 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index c494613c1909..b2f742f84111 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -688,28 +688,20 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) return 0; } -static DEFINE_SPINLOCK(arm_pmu_lock); -static LIST_HEAD(arm_pmu_list); - /* * PMU hardware loses all context when a CPU goes offline. * When a CPU is hotplugged back in, since some hardware registers are * UNKNOWN at reset, the PMU must be explicitly reset to avoid reading * junk values out of them. 
*/ -static int arm_perf_starting_cpu(unsigned int cpu) +static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node) { - struct arm_pmu *pmu; - - spin_lock(&arm_pmu_lock); - list_for_each_entry(pmu, &arm_pmu_list, entry) { + struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node); - if (!cpumask_test_cpu(cpu, &pmu->supported_cpus)) - continue; - if (pmu->reset) - pmu->reset(pmu); - } - spin_unlock(&arm_pmu_lock); + if (!cpumask_test_cpu(cpu, &pmu->supported_cpus)) + return 0; + if (pmu->reset) + pmu->reset(pmu); return 0; } @@ -821,9 +813,10 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) if (!cpu_hw_events) return -ENOMEM; - spin_lock(&arm_pmu_lock); - list_add_tail(&cpu_pmu->entry, &arm_pmu_list); - spin_unlock(&arm_pmu_lock); + err = cpuhp_state_add_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING, + &cpu_pmu->node); + if (err) + goto out_free; err = cpu_pm_pmu_register(cpu_pmu); if (err) @@ -859,9 +852,9 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) return 0; out_unregister: - spin_lock(&arm_pmu_lock); - list_del(&cpu_pmu->entry); - spin_unlock(&arm_pmu_lock); + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING, + &cpu_pmu->node); +out_free: free_percpu(cpu_hw_events); return err; } @@ -869,9 +862,8 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu) { cpu_pm_pmu_unregister(cpu_pmu); - spin_lock(&arm_pmu_lock); - list_del(&cpu_pmu->entry); - spin_unlock(&arm_pmu_lock); + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING, + &cpu_pmu->node); free_percpu(cpu_pmu->hw_events); } @@ -1068,9 +1060,9 @@ static int arm_pmu_hp_init(void) { int ret; - ret = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_ARM_STARTING, - "AP_PERF_ARM_STARTING", - arm_perf_starting_cpu, NULL); + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_STARTING, + "AP_PERF_ARM_STARTING", + arm_perf_starting_cpu, NULL); if (ret) pr_err("CPU hotplug notifier for ARM PMU could not be registered: %d\n", ret); diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index e18843809eec..4ad1b408c0bb 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -109,7 +109,7 @@ struct arm_pmu { DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); struct platform_device *plat_device; struct pmu_hw_events __percpu *hw_events; - struct list_head entry; + struct hlist_node node; struct notifier_block cpu_pm_nb; }; From b230f0db913136f465a951806f2978b179df95d5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 12 Aug 2016 19:49:41 +0200 Subject: [PATCH 141/538] bus/arm-cci: Use cpu-hp's multi instance support instead custom list Signed-off-by: Sebastian Andrzej Siewior Cc: Mark Rutland Cc: Suzuki K Poulose Cc: Peter Zijlstra Cc: Will Deacon Cc: rt@linutronix.de Cc: Olof Johansson Link: http://lkml.kernel.org/r/1471024183-12666-5-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/bus/arm-cci.c | 45 +++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/drivers/bus/arm-cci.c b/drivers/bus/arm-cci.c index 5755907f836f..4c44ba2d3412 100644 --- a/drivers/bus/arm-cci.c +++ b/drivers/bus/arm-cci.c @@ -144,15 +144,12 @@ struct cci_pmu { int num_cntrs; atomic_t active_events; struct mutex reserve_mutex; - struct list_head entry; + struct hlist_node node; cpumask_t cpus; }; #define to_cci_pmu(c) (container_of(c, struct cci_pmu, pmu)) -static DEFINE_MUTEX(cci_pmu_mutex); -static LIST_HEAD(cci_pmu_list); - 
enum cci_models { #ifdef CONFIG_ARM_CCI400_PMU CCI400_R0, @@ -1506,25 +1503,21 @@ static int cci_pmu_init(struct cci_pmu *cci_pmu, struct platform_device *pdev) return perf_pmu_register(&cci_pmu->pmu, name, -1); } -static int cci_pmu_offline_cpu(unsigned int cpu) +static int cci_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) { - struct cci_pmu *cci_pmu; + struct cci_pmu *cci_pmu = hlist_entry_safe(node, struct cci_pmu, node); unsigned int target; - mutex_lock(&cci_pmu_mutex); - list_for_each_entry(cci_pmu, &cci_pmu_list, entry) { - if (!cpumask_test_and_clear_cpu(cpu, &cci_pmu->cpus)) - continue; - target = cpumask_any_but(cpu_online_mask, cpu); - if (target >= nr_cpu_ids) - continue; - /* - * TODO: migrate context once core races on event->ctx have - * been fixed. - */ - cpumask_set_cpu(target, &cci_pmu->cpus); - } - mutex_unlock(&cci_pmu_mutex); + if (!cpumask_test_and_clear_cpu(cpu, &cci_pmu->cpus)) + return 0; + target = cpumask_any_but(cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + return 0; + /* + * TODO: migrate context once core races on event->ctx have + * been fixed. + */ + cpumask_set_cpu(target, &cci_pmu->cpus); return 0; } @@ -1768,10 +1761,8 @@ static int cci_pmu_probe(struct platform_device *pdev) if (ret) return ret; - mutex_lock(&cci_pmu_mutex); - list_add(&cci_pmu->entry, &cci_pmu_list); - mutex_unlock(&cci_pmu_mutex); - + cpuhp_state_add_instance_nocalls(CPUHP_AP_PERF_ARM_CCI_ONLINE, + &cci_pmu->node); pr_info("ARM %s PMU driver probed", cci_pmu->model->name); return 0; } @@ -1804,9 +1795,9 @@ static int __init cci_platform_init(void) { int ret; - ret = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_ARM_CCI_ONLINE, - "AP_PERF_ARM_CCI_ONLINE", NULL, - cci_pmu_offline_cpu); + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_CCI_ONLINE, + "AP_PERF_ARM_CCI_ONLINE", NULL, + cci_pmu_offline_cpu); if (ret) return ret; From 8df038725ad5351a9730759e0a24a5c5d96be661 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 12 Aug 2016 19:49:42 +0200 Subject: [PATCH 142/538] bus/arm-ccn: Use cpu-hp's multi instance support instead custom list Signed-off-by: Sebastian Andrzej Siewior Cc: Mark Rutland Cc: Pawel Moll Cc: Arnd Bergmann Cc: Suzuki K Poulose Cc: Peter Zijlstra Cc: Will Deacon Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/1471024183-12666-6-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/bus/arm-ccn.c | 54 +++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/drivers/bus/arm-ccn.c b/drivers/bus/arm-ccn.c index 97a9185af433..e0ad47534126 100644 --- a/drivers/bus/arm-ccn.c +++ b/drivers/bus/arm-ccn.c @@ -167,7 +167,7 @@ struct arm_ccn_dt { struct hrtimer hrtimer; cpumask_t cpu; - struct list_head entry; + struct hlist_node node; struct pmu pmu; }; @@ -189,9 +189,6 @@ struct arm_ccn { struct arm_ccn_dt dt; }; -static DEFINE_MUTEX(arm_ccn_mutex); -static LIST_HEAD(arm_ccn_list); - static int arm_ccn_node_to_xp(int node) { return node / CCN_NUM_XP_PORTS; @@ -1173,30 +1170,24 @@ static enum hrtimer_restart arm_ccn_pmu_timer_handler(struct hrtimer *hrtimer) } -static int arm_ccn_pmu_offline_cpu(unsigned int cpu) +static int arm_ccn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) { - struct arm_ccn_dt *dt; + struct arm_ccn_dt *dt = hlist_entry_safe(node, struct arm_ccn_dt, node); + struct arm_ccn *ccn = container_of(dt, struct arm_ccn, dt); unsigned int target; - mutex_lock(&arm_ccn_mutex); - list_for_each_entry(dt, &arm_ccn_list, entry) { - struct arm_ccn 
*ccn = container_of(dt, struct arm_ccn, dt); - - if (!cpumask_test_and_clear_cpu(cpu, &dt->cpu)) - continue; - target = cpumask_any_but(cpu_online_mask, cpu); - if (target >= nr_cpu_ids) - continue; - perf_pmu_migrate_context(&dt->pmu, cpu, target); - cpumask_set_cpu(target, &dt->cpu); - if (ccn->irq) - WARN_ON(irq_set_affinity_hint(ccn->irq, &dt->cpu) != 0); - } - mutex_unlock(&arm_ccn_mutex); + if (!cpumask_test_and_clear_cpu(cpu, &dt->cpu)) + return 0; + target = cpumask_any_but(cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + return 0; + perf_pmu_migrate_context(&dt->pmu, cpu, target); + cpumask_set_cpu(target, &dt->cpu); + if (ccn->irq) + WARN_ON(irq_set_affinity_hint(ccn->irq, &dt->cpu) != 0); return 0; } - static DEFINE_IDA(arm_ccn_pmu_ida); static int arm_ccn_pmu_init(struct arm_ccn *ccn) @@ -1278,9 +1269,8 @@ static int arm_ccn_pmu_init(struct arm_ccn *ccn) if (err) goto error_pmu_register; - mutex_lock(&arm_ccn_mutex); - list_add(&ccn->dt.entry, &arm_ccn_list); - mutex_unlock(&arm_ccn_mutex); + cpuhp_state_add_instance_nocalls(CPUHP_AP_PERF_ARM_CCN_ONLINE, + &ccn->dt.node); return 0; error_pmu_register: @@ -1296,10 +1286,8 @@ static void arm_ccn_pmu_cleanup(struct arm_ccn *ccn) { int i; - mutex_lock(&arm_ccn_mutex); - list_del(&ccn->dt.entry); - mutex_unlock(&arm_ccn_mutex); - + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_CCN_ONLINE, + &ccn->dt.node); if (ccn->irq) irq_set_affinity_hint(ccn->irq, NULL); for (i = 0; i < ccn->num_xps; i++) @@ -1527,9 +1515,9 @@ static int __init arm_ccn_init(void) { int i, ret; - ret = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_ARM_CCN_ONLINE, - "AP_PERF_ARM_CCN_ONLINE", NULL, - arm_ccn_pmu_offline_cpu); + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_CCN_ONLINE, + "AP_PERF_ARM_CCN_ONLINE", NULL, + arm_ccn_pmu_offline_cpu); if (ret) return ret; @@ -1541,7 +1529,7 @@ static int __init arm_ccn_init(void) static void __exit arm_ccn_exit(void) { - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_ARM_CCN_ONLINE); + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_CCN_ONLINE); platform_driver_unregister(&arm_ccn_driver); } From 8017c279196ab29174bafc104ac4ebbd42c7ca7f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 12 Aug 2016 19:49:43 +0200 Subject: [PATCH 143/538] net/virtio-net: Convert to hotplug state machine Install the callbacks via the state machine. The driver supports multiple instances and therefore the new cpuhp_state_add_instance_nocalls() infrastrucure is used. The driver currently uses get_online_cpus() to avoid missing a CPU hotplug event while invoking virtnet_set_affinity(). This could be avoided by using cpuhp_state_add_instance() variant which holds the hotplug lock and invokes callback during registration. This is more or less a 1:1 conversion of the current code. Signed-off-by: Sebastian Andrzej Siewior Cc: Mark Rutland Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: netdev@vger.kernel.org Cc: Will Deacon Cc: virtualization@lists.linux-foundation.org Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/1471024183-12666-7-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/net/virtio_net.c | 110 +++++++++++++++++++++++++++++-------- include/linux/cpuhotplug.h | 1 + 2 files changed, 87 insertions(+), 24 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 1b5f531eeb25..fad84f3f4109 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -138,8 +138,9 @@ struct virtnet_info { /* Does the affinity hint is set for virtqueues? 
*/ bool affinity_hint_set; - /* CPU hot plug notifier */ - struct notifier_block nb; + /* CPU hotplug instances for online & dead */ + struct hlist_node node; + struct hlist_node node_dead; /* Control VQ buffers: protected by the rtnl lock */ struct virtio_net_ctrl_hdr ctrl_hdr; @@ -1237,25 +1238,53 @@ static void virtnet_set_affinity(struct virtnet_info *vi) vi->affinity_hint_set = true; } -static int virtnet_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node) { - struct virtnet_info *vi = container_of(nfb, struct virtnet_info, nb); + struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info, + node); + virtnet_set_affinity(vi); + return 0; +} - switch(action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - case CPU_DEAD: - virtnet_set_affinity(vi); - break; - case CPU_DOWN_PREPARE: - virtnet_clean_affinity(vi, (long)hcpu); - break; - default: - break; - } +static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node) +{ + struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info, + node_dead); + virtnet_set_affinity(vi); + return 0; +} - return NOTIFY_OK; +static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node) +{ + struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info, + node); + + virtnet_clean_affinity(vi, cpu); + return 0; +} + +static enum cpuhp_state virtionet_online; + +static int virtnet_cpu_notif_add(struct virtnet_info *vi) +{ + int ret; + + ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node); + if (ret) + return ret; + ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD, + &vi->node_dead); + if (!ret) + return ret; + cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node); + return ret; +} + +static void virtnet_cpu_notif_remove(struct virtnet_info *vi) +{ + cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node); + cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD, + &vi->node_dead); } static void virtnet_get_ringparam(struct net_device *dev, @@ -1879,8 +1908,7 @@ static int virtnet_probe(struct virtio_device *vdev) virtio_device_ready(vdev); - vi->nb.notifier_call = &virtnet_cpu_callback; - err = register_hotcpu_notifier(&vi->nb); + err = virtnet_cpu_notif_add(vi); if (err) { pr_debug("virtio_net: registering cpu notifier failed\n"); goto free_unregister_netdev; @@ -1934,7 +1962,7 @@ static void virtnet_remove(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; - unregister_hotcpu_notifier(&vi->nb); + virtnet_cpu_notif_remove(vi); /* Make sure no work handler is accessing the device. 
*/ flush_work(&vi->config_work); @@ -1953,7 +1981,7 @@ static int virtnet_freeze(struct virtio_device *vdev) struct virtnet_info *vi = vdev->priv; int i; - unregister_hotcpu_notifier(&vi->nb); + virtnet_cpu_notif_remove(vi); /* Make sure no work handler is accessing the device */ flush_work(&vi->config_work); @@ -1997,7 +2025,7 @@ static int virtnet_restore(struct virtio_device *vdev) virtnet_set_queues(vi, vi->curr_queue_pairs); rtnl_unlock(); - err = register_hotcpu_notifier(&vi->nb); + err = virtnet_cpu_notif_add(vi); if (err) return err; @@ -2039,7 +2067,41 @@ static struct virtio_driver virtio_net_driver = { #endif }; -module_virtio_driver(virtio_net_driver); +static __init int virtio_net_driver_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "AP_VIRT_NET_ONLINE", + virtnet_cpu_online, + virtnet_cpu_down_prep); + if (ret < 0) + goto out; + virtionet_online = ret; + ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "VIRT_NET_DEAD", + NULL, virtnet_cpu_dead); + if (ret) + goto err_dead; + + ret = register_virtio_driver(&virtio_net_driver); + if (ret) + goto err_virtio; + return 0; +err_virtio: + cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); +err_dead: + cpuhp_remove_multi_state(virtionet_online); +out: + return ret; +} +module_init(virtio_net_driver_init); + +static __exit void virtio_net_driver_exit(void) +{ + cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); + cpuhp_remove_multi_state(virtionet_online); + unregister_virtio_driver(&virtio_net_driver); +} +module_exit(virtio_net_driver_exit); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_DESCRIPTION("Virtio network driver"); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index dcfe619171b4..b95f7adfbf8b 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -14,6 +14,7 @@ enum cpuhp_state { CPUHP_PERF_SUPERH, CPUHP_X86_HPET_DEAD, CPUHP_X86_APB_DEAD, + CPUHP_VIRT_NET_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, From f88eecfe2f22b2790e7527c0aaec14ea175919de Mon Sep 17 00:00:00 2001 From: Sebastian Frias Date: Tue, 16 Aug 2016 16:05:08 +0200 Subject: [PATCH 144/538] genirq/generic_chip: Verify irqs_per_chip <= 32 Most (if not all) code here implicitly assumes that the maximum number of IRQs per chip will be 32, and thus uses 'u32' or 'unsigned long' for many tasks (for example "struct irq_data" declares its 'mask' field as 'u32', and "struct irq_chip_generic" declares its 'installed' field as 'unsigned long') However, there is no check to verify that irqs_per_chip is <= 32. Hence, calling irq_alloc_domain_generic_chips() with a bigger value will result in unexpected results. Provide a wrapper with a MAYBE_BUILD_BUG_ON(nrirqs >= 32) to catch such cases. 
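For illustration only (not part of the original patch): a hypothetical caller sketch of what the wrapper now enforces. The driver name and init function below are made up; only irq_alloc_domain_generic_chips() and its argument list come from the patch itself.

  #include <linux/irq.h>
  #include <linux/irqdomain.h>

  static int hyp_intc_init(struct irq_domain *domain)
  {
          /* OK: 32 lines per chip fit in the 32-bit mask words. */
          return irq_alloc_domain_generic_chips(domain, 32, 1, "hyp-intc",
                                                handle_level_irq, 0, 0,
                                                IRQ_GC_INIT_MASK_CACHE);
          /*
           * A constant irqs_per_chip of, say, 64 would now be rejected at
           * build time by MAYBE_BUILD_BUG_ON() instead of silently
           * overflowing the per-chip u32 mask bookkeeping at runtime.
           */
  }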
[ tglx: Reduced changelog to the essential information ] Signed-off-by: Sebastian Frias Cc: Marc Zyngier Cc: Mason Cc: Jason Cooper Link: http://lkml.kernel.org/r/57B31D94.5040701@laposte.net Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 18 +++++++++++++----- kernel/irq/generic-chip.c | 16 ++++++++-------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index b52424eaa0ed..603986741f2c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -916,12 +916,20 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set); struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); -int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, - int num_ct, const char *name, - irq_flow_handler_t handler, - unsigned int clr, unsigned int set, - enum irq_gc_flags flags); +int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, + int num_ct, const char *name, + irq_flow_handler_t handler, + unsigned int clr, unsigned int set, + enum irq_gc_flags flags); + +#define irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name, \ + handler, clr, set, flags) \ +({ \ + MAYBE_BUILD_BUG_ON(irqs_per_chip > 32); \ + __irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name,\ + handler, clr, set, flags); \ +}) static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) { diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index a3a392097804..ee32870079c9 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -260,9 +260,9 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } /** - * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain + * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain * @d: irq domain for which to allocate chips - * @irqs_per_chip: Number of interrupts each chip handles + * @irqs_per_chip: Number of interrupts each chip handles (max 32) * @num_ct: Number of irq_chip_type instances associated with this * @name: Name of the irq chip * @handler: Default flow handler associated with these chips @@ -270,11 +270,11 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) * @set: IRQ_* bits to set in the mapping function * @gcflags: Generic chip specific setup flags */ -int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, - int num_ct, const char *name, - irq_flow_handler_t handler, - unsigned int clr, unsigned int set, - enum irq_gc_flags gcflags) +int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, + int num_ct, const char *name, + irq_flow_handler_t handler, + unsigned int clr, unsigned int set, + enum irq_gc_flags gcflags) { struct irq_domain_chip_generic *dgc; struct irq_chip_generic *gc; @@ -326,7 +326,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, d->name = name; return 0; } -EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); +EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); static struct irq_chip_generic * __irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) From 01b41159066531cc8d664362ff0cd89dd137bbfa Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Thu, 9 Jun 2016 23:43:28 -0700 Subject: [PATCH 145/538] cpu/hotplug: Handle unbalanced hotplug enable/disable When cpu_hotplug_enable() is called unbalanced w/o a preceeding 
cpu_hotplug_disable() the code emits a warning, but happily decrements the disabled counter. This causes the next operations to malfunction. Prevent the decrement and just emit a warning. Signed-off-by: Lianwei Wang Cc: peterz@infradead.org Cc: linux-pm@vger.kernel.org Cc: oleg@redhat.com Link: http://lkml.kernel.org/r/1465541008-12476-1-git-send-email-lianwei.wang@gmail.com Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index c506485eaa75..c90f839c5b86 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -331,10 +331,17 @@ void cpu_hotplug_disable(void) } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); +static void __cpu_hotplug_enable(void) +{ + if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) + return; + cpu_hotplug_disabled--; +} + void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); @@ -1160,7 +1167,7 @@ void enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; From b8fb03785d4de097507d0cf45873525e0ac4d2b2 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 1 Sep 2016 11:39:33 -0700 Subject: [PATCH 146/538] locking/static_keys: Provide DECLARE and well as DEFINE macros We will need to provide declarations of static keys in header files. Provide DECLARE_STATIC_KEY_{TRUE,FALSE} macros. Signed-off-by: Tony Luck Acked-by: Borislav Petkov Cc: Peter Zijlstra Cc: Dan Williams Cc: Linus Torvalds Link: http://lkml.kernel.org/r/816881cf85bd3cf13385d212882618f38a3b5d33.1472754711.git.tony.luck@intel.com Signed-off-by: Thomas Gleixner --- include/linux/jump_label.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 661af564fae8..595fb46213fc 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -267,9 +267,15 @@ struct static_key_false { #define DEFINE_STATIC_KEY_TRUE(name) \ struct static_key_true name = STATIC_KEY_TRUE_INIT +#define DECLARE_STATIC_KEY_TRUE(name) \ + extern struct static_key_true name + #define DEFINE_STATIC_KEY_FALSE(name) \ struct static_key_false name = STATIC_KEY_FALSE_INIT +#define DECLARE_STATIC_KEY_FALSE(name) \ + extern struct static_key_false name + extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ From 3637efb00864f465baebd49464e58319fd295b65 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 1 Sep 2016 11:39:33 -0700 Subject: [PATCH 147/538] x86/mce: Add PCI quirks to identify Xeons with machine check recovery Each Xeon includes a number of capability registers in PCI space that describe some features not enumerated by CPUID. Use these to determine that we are running on a model that can recover from machine checks. Hooks for Ivybridge ... Skylake provided. 
Signed-off-by: Tony Luck Acked-by: Borislav Petkov Cc: Peter Zijlstra Cc: Dan Williams Cc: Boris Petkov Cc: Linus Torvalds Link: http://lkml.kernel.org/r/abf331dc4a3e2a2d17444129bc51127437bcf4ba.1472754711.git.tony.luck@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/string_64.h | 3 +++ arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++++++ arch/x86/kernel/quirks.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 90dbbd9666d4..877a1dfbf770 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -2,6 +2,7 @@ #define _ASM_X86_STRING_64_H #ifdef __KERNEL__ +#include /* Written 2002 by Andi Kleen */ @@ -78,6 +79,8 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +DECLARE_STATIC_KEY_FALSE(mcsafe_key); + /** * memcpy_mcsafe - copy memory with indication if a machine check happened * diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 79d8ec849468..acccebcc836d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -2080,6 +2081,7 @@ void mce_disable_bank(int bank) * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold + * mce=recovery force enable memcpy_mcsafe() */ static int __init mcheck_enable(char *str) { @@ -2676,8 +2678,14 @@ static int __init mcheck_debugfs_init(void) static int __init mcheck_debugfs_init(void) { return -EINVAL; } #endif +DEFINE_STATIC_KEY_FALSE(mcsafe_key); +EXPORT_SYMBOL_GPL(mcsafe_key); + static int __init mcheck_late_init(void) { + if (mca_cfg.recovery) + static_branch_inc(&mcsafe_key); + mcheck_debugfs_init(); /* diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index cc457ff818ad..51402a7e4ca6 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, amd_disable_seq_and_redirect_scrub); #endif + +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#include +#include + +/* Ivy Bridge, Haswell, Broadwell */ +static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if (capid0 & 0x10) + static_branch_inc(&mcsafe_key); +} + +/* Skylake */ +static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if ((capid0 & 0xc0) == 0xc0) + static_branch_inc(&mcsafe_key); +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); +#endif From 9a6fb28a355d2609ace4dab4e6425442c647894d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 1 Sep 2016 11:39:33 -0700 Subject: [PATCH 148/538] x86/mce: Improve memcpy_mcsafe() Use the mcsafe_key defined in the previous patch to make decisions on which copy function to use. We can't use the FEATURE bit any more because PCI quirks run too late to affect the patching of code. So we use a static key. 
Turn memcpy_mcsafe() into an inline function to make life easier for callers. The assembly code that actually does the copy is now named memcpy_mcsafe_unrolled() Signed-off-by: Tony Luck Acked-by: Borislav Petkov Cc: Peter Zijlstra Cc: Dan Williams Cc: Boris Petkov Cc: Linus Torvalds Link: http://lkml.kernel.org/r/bfde2fc774e94f53d91b70a4321c85a0d33e7118.1472754712.git.tony.luck@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pmem.h | 5 +---- arch/x86/include/asm/string_64.h | 16 +++++++++++++++- arch/x86/kernel/x8664_ksyms_64.c | 2 +- arch/x86/lib/memcpy_64.S | 6 +++--- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 643eba42d620..2c1ebeb4d737 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -46,10 +46,7 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) { - if (static_cpu_has(X86_FEATURE_MCE_RECOVERY)) - return memcpy_mcsafe(dst, src, n); - memcpy(dst, src, n); - return 0; + return memcpy_mcsafe(dst, src, n); } /** diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 877a1dfbf770..a164862d77e3 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -79,6 +79,7 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt); DECLARE_STATIC_KEY_FALSE(mcsafe_key); /** @@ -89,10 +90,23 @@ DECLARE_STATIC_KEY_FALSE(mcsafe_key); * @cnt: number of bytes to copy * * Low level memory copy function that catches machine checks + * We only call into the "safe" function on systems that can + * actually do machine check recovery. Everyone else can just + * use memcpy(). * * Return 0 for success, -EFAULT for fail */ -int memcpy_mcsafe(void *dst, const void *src, size_t cnt); +static __always_inline __must_check int +memcpy_mcsafe(void *dst, const void *src, size_t cnt) +{ +#ifdef CONFIG_X86_MCE + if (static_branch_unlikely(&mcsafe_key)) + return memcpy_mcsafe_unrolled(dst, src, cnt); + else +#endif + memcpy(dst, src, cnt); + return 0; +} #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 95e49f6e4fc3..b2cee3d19477 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -38,7 +38,7 @@ EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); -EXPORT_SYMBOL_GPL(memcpy_mcsafe); +EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 2ec0b0abbfaa..49e6ebac7e73 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -181,11 +181,11 @@ ENDPROC(memcpy_orig) #ifndef CONFIG_UML /* - * memcpy_mcsafe - memory copy with machine check exception handling + * memcpy_mcsafe_unrolled - memory copy with machine check exception handling * Note that we only catch machine checks when reading the source addresses. * Writes to target are posted and don't generate machine checks. */ -ENTRY(memcpy_mcsafe) +ENTRY(memcpy_mcsafe_unrolled) cmpl $8, %edx /* Less than 8 bytes? 
Go to byte copy loop */ jb .L_no_whole_words @@ -273,7 +273,7 @@ ENTRY(memcpy_mcsafe) .L_done_memcpy_trap: xorq %rax, %rax ret -ENDPROC(memcpy_mcsafe) +ENDPROC(memcpy_mcsafe_unrolled) .section .fixup, "ax" /* Return -EFAULT for any failure */ From ffb173e657fa8123bffa2a169e124b4bca0b5bc4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 1 Sep 2016 11:39:33 -0700 Subject: [PATCH 149/538] x86/mce: Drop X86_FEATURE_MCE_RECOVERY and the related model string test We now have a better way to determine if we are running on a cpu that supports machine check recovery. Free up this feature bit. Signed-off-by: Tony Luck Acked-by: Borislav Petkov Cc: Peter Zijlstra Cc: Dan Williams Cc: Boris Petkov Cc: Linus Torvalds Link: http://lkml.kernel.org/r/d5db39e08d46cf1012d94d3902275d08ba931926.1472754712.git.tony.luck@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/cpufeatures.h | 1 - arch/x86/kernel/cpu/mcheck/mce.c | 11 ----------- 2 files changed, 12 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 92a8308b96f6..1188bc849ee3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -106,7 +106,6 @@ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ #define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index acccebcc836d..7f3f0e147242 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1634,17 +1634,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; - /* - * MCG_CAP.MCG_SER_P is necessary but not sufficient to know - * whether this processor will actually generate recoverable - * machine checks. Check to see if this is an E7 model Xeon. - * We can't do a model number check because E5 and E7 use the - * same model number. E5 doesn't support recovery, E7 does. - */ - if (mca_cfg.recovery || (mca_cfg.ser && - !strncmp(c->x86_model_id, - "Intel(R) Xeon(R) CPU E7-", 24))) - set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY); } if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; From 97a7142f157a6361a659ff3eec2c3cf636bd7490 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Sun, 5 Jul 2015 18:33:48 +0900 Subject: [PATCH 150/538] sched/fair: Make update_min_vruntime() more readable The update_min_vruntime() control flow can be simplified. 
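In effect, the simplified version computes the candidate in two plain steps and then clamps it; roughly (a sketch of the resulting logic, abbreviated from the hunk that follows):

        u64 vruntime = cfs_rq->min_vruntime;

        if (leftmost_se)                        /* smallest queued vruntime, if any   */
                vruntime = leftmost_se->vruntime;
        if (cfs_rq->curr)                       /* the running entity may be smaller  */
                vruntime = min_vruntime(vruntime, cfs_rq->curr->vruntime);

        /* never allow min_vruntime to move backwards */
        cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);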
Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: minchan.kim@lge.com Link: http://lkml.kernel.org/r/1436088829-25768-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 61d485421bed..9a18aae0b0ad 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -464,20 +464,17 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = cfs_rq->min_vruntime; - if (cfs_rq->curr) - vruntime = cfs_rq->curr->vruntime; - if (cfs_rq->rb_leftmost) { struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, struct sched_entity, run_node); - if (!cfs_rq->curr) - vruntime = se->vruntime; - else - vruntime = min_vruntime(vruntime, se->vruntime); + vruntime = se->vruntime; } + if (cfs_rq->curr) + vruntime = min_vruntime(vruntime, cfs_rq->curr->vruntime); + /* ensure we never gain time by being placed backwards. */ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); #ifndef CONFIG_64BIT @@ -5988,7 +5985,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * The adjacency matrix of the resulting graph is given by: * - * log_2 n + * log_2 n * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) * k = 0 * @@ -6034,7 +6031,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * [XXX write more on how we solve this.. _after_ merging pjt's patches that * rewrite all of this once again.] - */ + */ static unsigned long __read_mostly max_load_balance_interval = HZ/10; @@ -6696,7 +6693,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) /* * !SD_OVERLAP domains can assume that child groups * span the current group. - */ + */ group = child->groups; do { From 126b3b6842cc848fc9880e7816e0a8d743be51f1 Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Sun, 14 Aug 2016 16:27:06 +0200 Subject: [PATCH 151/538] sched/deadline: Refactor CPU heap code 1. heapify up factored out in new dedicated function heapify_up() (avoids repetition of same code) 2. call to cpudl_change_key() replaced with heapify_up() when cpudl_set actually inserts a new node in the heap 3. cpudl_change_key() replaced with heapify() that heapifies up or down as needed. 
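For reference, the two sift operations and the combined fix-up look roughly like this on a plain array-backed max-heap (a generic user-space sketch with uint64_t keys, illustrative only; the cpudl code additionally maintains a cpu<->index mapping):

#include <stdint.h>

/* Swap-based sift toward the root of an array-backed max-heap. */
static void sift_up(uint64_t *key, int idx)
{
        while (idx > 0 && key[(idx - 1) / 2] < key[idx]) {
                uint64_t tmp = key[idx];                /* swap with parent           */
                key[idx] = key[(idx - 1) / 2];
                key[(idx - 1) / 2] = tmp;
                idx = (idx - 1) / 2;
        }
}

/* Swap-based sift toward the leaves. */
static void sift_down(uint64_t *key, int size, int idx)
{
        for (;;) {
                int l = 2 * idx + 1, r = 2 * idx + 2, largest = idx;
                uint64_t tmp;

                if (l < size && key[l] > key[largest])
                        largest = l;
                if (r < size && key[r] > key[largest])
                        largest = r;
                if (largest == idx)
                        break;
                tmp = key[idx];                         /* swap with larger child     */
                key[idx] = key[largest];
                key[largest] = tmp;
                idx = largest;
        }
}

/* After the key at idx changes, one call restores the heap in either direction: */
static void heap_fix(uint64_t *key, int size, int idx)
{
        if (idx > 0 && key[(idx - 1) / 2] < key[idx])
                sift_up(key, idx);
        else
                sift_down(key, size, idx);
}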
Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Luca Abeni Reviewed-by: Juri Lelli Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-dl@retis.sssup.it Link: http://lkml.kernel.org/r/1471184828-12644-2-git-send-email-tommaso.cucinotta@sssup.it Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 50 ++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index d4184498c9f5..0acb0d4e2fb7 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -41,7 +41,7 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b) swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); } -static void cpudl_heapify(struct cpudl *cp, int idx) +static void cpudl_heapify_down(struct cpudl *cp, int idx) { int l, r, largest; @@ -66,23 +66,24 @@ static void cpudl_heapify(struct cpudl *cp, int idx) } } -static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +static void cpudl_heapify_up(struct cpudl *cp, int idx) { - WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); - - if (dl_time_before(new_dl, cp->elements[idx].dl)) { - cp->elements[idx].dl = new_dl; - cpudl_heapify(cp, idx); - } else { - cp->elements[idx].dl = new_dl; - while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, - cp->elements[idx].dl)) { - cpudl_exchange(cp, idx, parent(idx)); - idx = parent(idx); - } + while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) { + cpudl_exchange(cp, idx, parent(idx)); + idx = parent(idx); } } +static void cpudl_heapify(struct cpudl *cp, int idx) +{ + if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) + cpudl_heapify_up(cp, idx); + else + cpudl_heapify_down(cp, idx); +} + static inline int cpudl_maximum(struct cpudl *cp) { return cp->elements[0].cpu; @@ -154,27 +155,22 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->size--; cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; - while (old_idx > 0 && dl_time_before( - cp->elements[parent(old_idx)].dl, - cp->elements[old_idx].dl)) { - cpudl_exchange(cp, old_idx, parent(old_idx)); - old_idx = parent(old_idx); - } + cpudl_heapify(cp, old_idx); cpumask_set_cpu(cpu, cp->free_cpus); - cpudl_heapify(cp, old_idx); goto out; } if (old_idx == IDX_INVALID) { - cp->size++; - cp->elements[cp->size - 1].dl = dl; - cp->elements[cp->size - 1].cpu = cpu; - cp->elements[cpu].idx = cp->size - 1; - cpudl_change_key(cp, cp->size - 1, dl); + int new_idx = cp->size++; + cp->elements[new_idx].dl = dl; + cp->elements[new_idx].cpu = cpu; + cp->elements[cpu].idx = new_idx; + cpudl_heapify_up(cp, new_idx); cpumask_clear_cpu(cpu, cp->free_cpus); } else { - cpudl_change_key(cp, old_idx, dl); + cp->elements[old_idx].dl = dl; + cpudl_heapify(cp, old_idx); } out: From 8e1bc301aaf9f9a2d731bf8d50d549ac2dcfdab2 Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Sun, 14 Aug 2016 16:27:07 +0200 Subject: [PATCH 152/538] sched/deadline: Make CPU heap faster avoiding real swaps on heapify This change goes from heapify() ops done by swapping with parent/child so that the item to fix moves along, to heapify() ops done by just pulling the parent/child chain by 1 pos, then storing the item to fix just at the end. On a non-trivial heapify(), this performs roughly half stores wrt swaps. 
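Contrasted with the swap-based sift sketched earlier, the hole-moving variant stores each displaced element once and writes the saved element back only at its final position (again a generic uint64_t sketch, illustrative only; the real code also updates the cpu<->index map at every step):

#include <stdint.h>

/* Sift-down that moves a "hole" instead of swapping. */
static void sift_down_hole(uint64_t *key, int size, int idx)
{
        uint64_t saved = key[idx];              /* element being repositioned         */

        for (;;) {
                int l = 2 * idx + 1, r = 2 * idx + 2, largest = idx;
                uint64_t largest_key = saved;

                if (l < size && key[l] > largest_key) {
                        largest = l;
                        largest_key = key[l];
                }
                if (r < size && key[r] > largest_key)
                        largest = r;
                if (largest == idx)
                        break;
                key[idx] = key[largest];        /* pull child up: one store, no swap  */
                idx = largest;
        }
        key[idx] = saved;                       /* single store at the final position */
}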
This has been measured to achieve up to 10% of speed-up for cpudl_set() calls, with a randomly generated workload of 1K,10K,100K random heap insertions and deletions (75% cpudl_set() calls with is_valid=1 and 25% with is_valid=0), and randomly generated cpu IDs, with up to 256 CPUs, as measured on an Intel Core2 Duo. Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Luca Abeni Reviewed-by: Juri Lelli Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-dl@retis.sssup.it Link: http://lkml.kernel.org/r/1471184828-12644-3-git-send-email-tommaso.cucinotta@sssup.it Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 66 ++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 0acb0d4e2fb7..0ace75a7a87b 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -31,48 +31,72 @@ static inline int right_child(int i) return (i << 1) + 2; } -static void cpudl_exchange(struct cpudl *cp, int a, int b) -{ - int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; - - swap(cp->elements[a].cpu, cp->elements[b].cpu); - swap(cp->elements[a].dl , cp->elements[b].dl ); - - swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); -} - static void cpudl_heapify_down(struct cpudl *cp, int idx) { int l, r, largest; + int orig_cpu = cp->elements[idx].cpu; + u64 orig_dl = cp->elements[idx].dl; + + if (left_child(idx) >= cp->size) + return; + /* adapted from lib/prio_heap.c */ while(1) { + u64 largest_dl; l = left_child(idx); r = right_child(idx); largest = idx; + largest_dl = orig_dl; - if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, - cp->elements[l].dl)) + if ((l < cp->size) && dl_time_before(orig_dl, + cp->elements[l].dl)) { largest = l; - if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, - cp->elements[r].dl)) + largest_dl = cp->elements[l].dl; + } + if ((r < cp->size) && dl_time_before(largest_dl, + cp->elements[r].dl)) largest = r; + if (largest == idx) break; - /* Push idx down the heap one level and bump one up */ - cpudl_exchange(cp, largest, idx); + /* pull largest child onto idx */ + cp->elements[idx].cpu = cp->elements[largest].cpu; + cp->elements[idx].dl = cp->elements[largest].dl; + cp->elements[cp->elements[idx].cpu].idx = idx; idx = largest; } + /* actual push down of saved original values orig_* */ + cp->elements[idx].cpu = orig_cpu; + cp->elements[idx].dl = orig_dl; + cp->elements[cp->elements[idx].cpu].idx = idx; } static void cpudl_heapify_up(struct cpudl *cp, int idx) { - while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, - cp->elements[idx].dl)) { - cpudl_exchange(cp, idx, parent(idx)); - idx = parent(idx); - } + int p; + + int orig_cpu = cp->elements[idx].cpu; + u64 orig_dl = cp->elements[idx].dl; + + if (idx == 0) + return; + + do { + p = parent(idx); + if (dl_time_before(orig_dl, cp->elements[p].dl)) + break; + /* pull parent onto idx */ + cp->elements[idx].cpu = cp->elements[p].cpu; + cp->elements[idx].dl = cp->elements[p].dl; + cp->elements[cp->elements[idx].cpu].idx = idx; + idx = p; + } while (idx != 0); + /* actual push up of saved original values orig_* */ + cp->elements[idx].cpu = orig_cpu; + cp->elements[idx].dl = orig_dl; + cp->elements[cp->elements[idx].cpu].idx = idx; } static void cpudl_heapify(struct cpudl *cp, int idx) From d8206bb3ffe0eaee03abfad46fd44d8b17142e88 Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Sun, 14 Aug 2016 16:27:08 
+0200 Subject: [PATCH 153/538] sched/deadline: Split cpudl_set() into cpudl_set() and cpudl_clear() These 2 exercise independent code paths and need different arguments. After this change, you call: cpudl_clear(cp, cpu); cpudl_set(cp, cpu, dl); instead of: cpudl_set(cp, cpu, 0 /* dl */, 0 /* is_valid */); cpudl_set(cp, cpu, dl, 1 /* is_valid */); Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Luca Abeni Reviewed-by: Juri Lelli Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-dl@retis.sssup.it Link: http://lkml.kernel.org/r/1471184828-12644-4-git-send-email-tommaso.cucinotta@sssup.it Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 49 +++++++++++++++++++++++++------------- kernel/sched/cpudeadline.h | 3 ++- kernel/sched/deadline.c | 10 ++++---- 3 files changed, 40 insertions(+), 22 deletions(-) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 0ace75a7a87b..e73119013c53 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -145,16 +145,15 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } /* - * cpudl_set - update the cpudl max-heap + * cpudl_clear - remove a cpu from the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target cpu - * @dl: the new earliest deadline for this cpu * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +void cpudl_clear(struct cpudl *cp, int cpu) { int old_idx, new_cpu; unsigned long flags; @@ -162,17 +161,15 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); + old_idx = cp->elements[cpu].idx; - if (!is_valid) { - /* remove item */ - if (old_idx == IDX_INVALID) { - /* - * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is - * called for a CPU without -dl tasks running. - */ - goto out; - } + if (old_idx == IDX_INVALID) { + /* + * Nothing to remove if old_idx was invalid. + * This could happen if a rq_offline_dl is + * called for a CPU without -dl tasks running. 
+ */ + } else { new_cpu = cp->elements[cp->size - 1].cpu; cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].cpu = new_cpu; @@ -180,11 +177,32 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; cpudl_heapify(cp, old_idx); - cpumask_set_cpu(cpu, cp->free_cpus); - goto out; + cpumask_set_cpu(cpu, cp->free_cpus); } + raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl) +{ + int old_idx; + unsigned long flags; + + WARN_ON(!cpu_present(cpu)); + raw_spin_lock_irqsave(&cp->lock, flags); + + old_idx = cp->elements[cpu].idx; if (old_idx == IDX_INVALID) { int new_idx = cp->size++; cp->elements[new_idx].dl = dl; @@ -197,7 +215,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cpudl_heapify(cp, old_idx); } -out: raw_spin_unlock_irqrestore(&cp->lock, flags); } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index fcbdf83fed7e..f7da8c55bba0 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -23,7 +23,8 @@ struct cpudl { #ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); -void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl); +void cpudl_clear(struct cpudl *cp, int cpu); int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d091f4a95416..18fb0b8fc911 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -798,7 +798,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { dl_rq->earliest_dl.curr = deadline; - cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } } @@ -813,14 +813,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear(&rq->rd->cpudl, rq->cpu); } else { struct rb_node *leftmost = dl_rq->rb_leftmost; struct sched_dl_entity *entry; entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; - cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); } } @@ -1671,7 +1671,7 @@ static void rq_online_dl(struct rq *rq) cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) - cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); } /* Assumes rq->lock is held */ @@ -1680,7 +1680,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear(&rq->rd->cpudl, rq->cpu); cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } From 2665621506e178a1f62e59200403c359c463ea5e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 10 Aug 2016 11:27:27 +0100 Subject: [PATCH 154/538] sched/fair: Fix 
load_above_capacity fixed point arithmetic width Since commit: 2159197d6677 ("sched/core: Enable increased load resolution on 64-bit kernels") we now have two different fixed point units for load. load_above_capacity has to have 10 bits fixed point unit like PELT, whereas NICE_0_LOAD has 20 bit fixed point unit on 64-bit kernels. Fix this by scaling down NICE_0_LOAD when multiplying load_above_capacity with it. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Acked-by: Morten Rasmussen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Link: http://lkml.kernel.org/r/1470824847-5316-1-git-send-email-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9a18aae0b0ad..6011bfe81665 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7193,7 +7193,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; if (load_above_capacity > busiest->group_capacity) { load_above_capacity -= busiest->group_capacity; - load_above_capacity *= NICE_0_LOAD; + load_above_capacity *= scale_load_down(NICE_0_LOAD); load_above_capacity /= busiest->group_capacity; } else load_above_capacity = ~0UL; From efca03ecbe29a46c2c5ae539563b6326af9dcba7 Mon Sep 17 00:00:00 2001 From: "seokhoon.yoon" Date: Tue, 16 Aug 2016 18:26:08 +0900 Subject: [PATCH 155/538] schedcore: Remove duplicated init_task's preempt_notifiers init init_task's preempt_notifiers is initialized twice: 1) sched_init() -> INIT_HLIST_HEAD(&init_task.preempt_notifiers) 2) sched_init() -> init_idle(current,) <--- current task is init_task at this time -> __sched_fork(,current) -> INIT_HLIST_HEAD(&p->preempt_notifiers) I think the first one is unnecessary, so remove it. Signed-off-by: seokhoon.yoon Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471339568-5790-1-git-send-email-iamyooon@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d602f508ca1..90b1961f6ea5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7554,10 +7554,6 @@ void __init sched_init(void) set_load_weight(&init_task); -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&init_task.preempt_notifiers); -#endif - /* * The boot idle thread does lazy MMU switching as well: */ From 61c7aca695b6fabe85d0fc424fe8ae2f66f267dd Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 31 Aug 2016 18:27:44 +0800 Subject: [PATCH 156/538] sched/deadline: Fix the intention to re-evalute tick dependency for offline CPU The dl task will be replenished after dl task timer fire and start a new period. It will be enqueued and to re-evaluate its dependency on the tick in order to restart it. However, if the CPU is hot-unplugged, irq_work_queue will splash since the target CPU is offline. As a result we get: WARNING: CPU: 2 PID: 0 at kernel/irq_work.c:69 irq_work_queue_on+0xad/0xe0 Call Trace: dump_stack+0x99/0xd0 __warn+0xd1/0xf0 warn_slowpath_null+0x1d/0x20 irq_work_queue_on+0xad/0xe0 tick_nohz_full_kick_cpu+0x44/0x50 tick_nohz_dep_set_cpu+0x74/0xb0 enqueue_task_dl+0x226/0x480 activate_task+0x5c/0xa0 dl_task_timer+0x19b/0x2c0 ? 
push_dl_task.part.31+0x190/0x190 This can be triggered by hot-unplugging the full dynticks CPU which dl task is running on. We enqueue the dl task on the offline CPU, because we need to do replenish for start_dl_timer(). So, as Juri pointed out, we would need to do is calling replenish_dl_entity() directly, instead of enqueue_task_dl(). pi_se shouldn't be a problem as the task shouldn't be boosted if it was throttled. This patch fixes it by avoiding the whole enqueue+dequeue+enqueue story, by first migrating (set_task_cpu()) and then doing 1 enqueue. Suggested-by: Peter Zijlstra Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1472639264-3932-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 46 ++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 18fb0b8fc911..0c75bc656178 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) { struct rq *later_rq = NULL; - bool fallback = false; later_rq = find_lock_later_rq(p, rq); - if (!later_rq) { int cpu; @@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * If we cannot preempt any rq, fall back to pick any * online cpu. */ - fallback = true; cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); if (cpu >= nr_cpu_ids) { /* @@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p double_lock_balance(rq, later_rq); } - /* - * By now the task is replenished and enqueued; migrate it. - */ - deactivate_task(rq, p, 0); set_task_cpu(p, later_rq->cpu); - activate_task(later_rq, p, 0); - - if (!fallback) - resched_curr(later_rq); - double_unlock_balance(later_rq, rq); return later_rq; @@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) goto unlock; } - enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_curr(rq); - #ifdef CONFIG_SMP - /* - * Perform balancing operations here; after the replenishments. We - * cannot drop rq->lock before this, otherwise the assertion in - * start_dl_timer() about not missing updates is not true. - * - * If we find that the rq the task was on is no longer available, we - * need to select a new rq. - * - * XXX figure out if select_task_rq_dl() deals with offline cpus. - */ if (unlikely(!rq->online)) { + /* + * If the runqueue is no longer available, migrate the + * task elsewhere. This necessarily changes rq. + */ lockdep_unpin_lock(&rq->lock, rf.cookie); rq = dl_task_offline_migration(rq, p); rf.cookie = lockdep_pin_lock(&rq->lock); + + /* + * Now that the task has been migrated to the new RQ and we + * have that locked, proceed as normal and enqueue the task + * there. + */ } +#endif + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); + +#ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, check if we need * to kick someone away. 
From 1a3d027c5a6847e5d349c8527f99aada47e5467a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jun 2016 12:43:23 -0500 Subject: [PATCH 157/538] sched/debug: Rename and move enqueue_sleeper() enqueue_sleeper() doesn't actually enqueue, it just handles some statistics and tracepoints. Rename it to update_stats_enqueue_sleeper() and call it from update_stats_enqueue(). Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/fb20b7159dc4d028c406c0e8d5f8c439b741615b.1466184592.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 142 +++++++++++++++++++++++--------------------- 1 file changed, 73 insertions(+), 69 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6011bfe81665..479639f6dc80 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -862,11 +862,72 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) se->statistics.wait_start = 0; } +static void +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *tsk = NULL; + + if (entity_is_task(se)) + tsk = task_of(se); + + if (se->statistics.sleep_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->statistics.sleep_max)) + se->statistics.sleep_max = delta; + + se->statistics.sleep_start = 0; + se->statistics.sum_sleep_runtime += delta; + + if (tsk) { + account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } + } + if (se->statistics.block_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->statistics.block_max)) + se->statistics.block_max = delta; + + se->statistics.block_start = 0; + se->statistics.sum_sleep_runtime += delta; + + if (tsk) { + if (tsk->in_iowait) { + se->statistics.iowait_sum += delta; + se->statistics.iowait_count++; + trace_sched_stat_iowait(tsk, delta); + } + + trace_sched_stat_blocked(tsk, delta); + + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + account_scheduler_latency(tsk, delta >> 10, 0); + } + } +} + /* * Task is being enqueued - update stats: */ static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Are we enqueueing a waiting task? 
(for current tasks @@ -874,6 +935,9 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper(cfs_rq, se); } static inline void @@ -910,7 +974,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { } @@ -3197,68 +3266,6 @@ static inline int idle_balance(struct rq *rq) #endif /* CONFIG_SMP */ -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHEDSTATS - struct task_struct *tsk = NULL; - - if (entity_is_task(se)) - tsk = task_of(se); - - if (se->statistics.sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.sleep_max)) - se->statistics.sleep_max = delta; - - se->statistics.sleep_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (se->statistics.block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.block_max)) - se->statistics.block_max = delta; - - se->statistics.block_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - if (tsk->in_iowait) { - se->statistics.iowait_sum += delta; - se->statistics.iowait_count++; - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } -#endif -} - static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHED_DEBUG @@ -3385,15 +3392,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); - if (flags & ENQUEUE_WAKEUP) { + if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); - if (schedstat_enabled()) - enqueue_sleeper(cfs_rq, se); - } check_schedstat_required(); if (schedstat_enabled()) { - update_stats_enqueue(cfs_rq, se); + update_stats_enqueue(cfs_rq, se, flags); check_spread(cfs_rq, se); } if (!curr) From ae92882e5646d8661a3ca182ba988752fe4b773f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jun 2016 12:43:24 -0500 Subject: [PATCH 158/538] sched/debug: Clean up schedstat macros The schedstat_*() macros are inconsistent: most of them take a pointer and a field which the macro combines, whereas schedstat_set() takes the already combined ptr->field. The already combined ptr->field argument is actually more intuitive and easier to use, and there's no reason to require the user to split the variable up, so convert the macros to use the combined argument. 
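A tiny user-space analogue of the combined-lvalue form (hypothetical struct and macro names; only meant to show that one macro now covers fields at any nesting depth):

#include <stdio.h>

static int stats_enabled = 1;                   /* stand-in for schedstat_enabled()   */

#define stat_inc(var)           do { if (stats_enabled) { (var)++; } } while (0)
#define stat_add(var, amt)      do { if (stats_enabled) { (var) += (amt); } } while (0)

struct stats  { unsigned long nr_wakeups; };
struct entity { struct stats statistics; };
struct rq     { unsigned long yld_count; struct entity se; };

int main(void)
{
        struct rq rq = { 0 };

        stat_inc(rq.yld_count);                         /* shallow field              */
        stat_add(rq.se.statistics.nr_wakeups, 2);       /* deeply nested field        */
        printf("%lu %lu\n", rq.yld_count, rq.se.statistics.nr_wakeups);
        return 0;
}

With the old two-argument form the macro itself inserted the '->' between its arguments, so a nested field such as p->se.statistics.nr_wakeups had to be split by the caller at an arbitrary point; passing the full lvalue lets the same macro serve rq, sched_domain and per-task statistics uniformly.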
Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/54953ca25bb579f3a5946432dee409b0e05222c6.1466184592.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 ++++++++++----------- kernel/sched/debug.c | 4 ++-- kernel/sched/fair.c | 42 ++++++++++++++++++++-------------------- kernel/sched/idle_task.c | 2 +- kernel/sched/stats.h | 22 ++++++++++----------- 5 files changed, 46 insertions(+), 46 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 90b1961f6ea5..850677049d4a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1636,16 +1636,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) int this_cpu = smp_processor_id(); if (cpu == this_cpu) { - schedstat_inc(rq, ttwu_local); - schedstat_inc(p, se.statistics.nr_wakeups_local); + schedstat_inc(rq->ttwu_local); + schedstat_inc(p->se.statistics.nr_wakeups_local); } else { struct sched_domain *sd; - schedstat_inc(p, se.statistics.nr_wakeups_remote); + schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); for_each_domain(this_cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); + schedstat_inc(sd->ttwu_wake_remote); break; } } @@ -1653,15 +1653,15 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); + schedstat_inc(p->se.statistics.nr_wakeups_migrate); #endif /* CONFIG_SMP */ - schedstat_inc(rq, ttwu_count); - schedstat_inc(p, se.statistics.nr_wakeups); + schedstat_inc(rq->ttwu_count); + schedstat_inc(p->se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); + schedstat_inc(p->se.statistics.nr_wakeups_sync); #endif /* CONFIG_SCHEDSTATS */ } @@ -3237,7 +3237,7 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_count); + schedstat_inc(this_rq()->sched_count); } /* @@ -4849,7 +4849,7 @@ SYSCALL_DEFINE0(sched_yield) { struct rq *rq = this_rq_lock(); - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); /* @@ -5000,7 +5000,7 @@ int __sched yield_to(struct task_struct *p, bool preempt) yielded = curr->sched_class->yield_to_task(rq, p, preempt); if (yielded) { - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); /* * Make p's CPU reschedule; pick_next_entity takes care of * fairness. 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..92fa53457b72 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -429,9 +429,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) p->prio); SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), + SPLIT_NS(schedstat_val(p->se.statistics.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); + SPLIT_NS(schedstat_val(p->se.statistics.sum_sleep_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 479639f6dc80..157d741cec34 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -800,7 +800,7 @@ static void update_curr(struct cfs_rq *cfs_rq) max(delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); + schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); update_min_vruntime(cfs_rq); @@ -3275,7 +3275,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) d = -d; if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq, nr_spread_over); + schedstat_inc(cfs_rq->nr_spread_over); #endif } @@ -5164,13 +5164,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, balanced = this_eff_load <= prev_eff_load; - schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); + schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (!balanced) return 0; - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); return 1; } @@ -6183,7 +6183,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { int cpu; - schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + schedstat_inc(p->se.statistics.nr_failed_migrations_affine); env->flags |= LBF_SOME_PINNED; @@ -6214,7 +6214,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { - schedstat_inc(p, se.statistics.nr_failed_migrations_running); + schedstat_inc(p->se.statistics.nr_failed_migrations_running); return 0; } @@ -6231,13 +6231,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->se.statistics.nr_forced_migrations); } return 1; } - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + schedstat_inc(p->se.statistics.nr_failed_migrations_hot); return 0; } @@ -6277,7 +6277,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) * so we can safely collect stats here rather than * inside detach_tasks(). */ - schedstat_inc(env->sd, lb_gained[env->idle]); + schedstat_inc(env->sd->lb_gained[env->idle]); return p; } return NULL; @@ -6369,7 +6369,7 @@ static int detach_tasks(struct lb_env *env) * so we can safely collect detach_one_task() stats here rather * than inside detach_one_task(). 
*/ - schedstat_add(env->sd, lb_gained[env->idle], detached); + schedstat_add(env->sd->lb_gained[env->idle], detached); return detached; } @@ -7510,7 +7510,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - schedstat_inc(sd, lb_count[idle]); + schedstat_inc(sd->lb_count[idle]); redo: if (!should_we_balance(&env)) { @@ -7520,19 +7520,19 @@ static int load_balance(int this_cpu, struct rq *this_rq, group = find_busiest_group(&env); if (!group) { - schedstat_inc(sd, lb_nobusyg[idle]); + schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; } busiest = find_busiest_queue(&env, group); if (!busiest) { - schedstat_inc(sd, lb_nobusyq[idle]); + schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; } BUG_ON(busiest == env.dst_rq); - schedstat_add(sd, lb_imbalance[idle], env.imbalance); + schedstat_add(sd->lb_imbalance[idle], env.imbalance); env.src_cpu = busiest->cpu; env.src_rq = busiest; @@ -7639,7 +7639,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, } if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); + schedstat_inc(sd->lb_failed[idle]); /* * Increment the failure counter only on periodic balance. * We do not want newidle balance, which can be very @@ -7722,7 +7722,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, * we can't migrate them. Let the imbalance flag set so parent level * can try to migrate them. */ - schedstat_inc(sd, lb_balanced[idle]); + schedstat_inc(sd->lb_balanced[idle]); sd->nr_balance_failed = 0; @@ -7915,15 +7915,15 @@ static int active_load_balance_cpu_stop(void *data) .idle = CPU_IDLE, }; - schedstat_inc(sd, alb_count); + schedstat_inc(sd->alb_count); p = detach_one_task(&env); if (p) { - schedstat_inc(sd, alb_pushed); + schedstat_inc(sd->alb_pushed); /* Active balancing done, reset the failure counter. */ sd->nr_balance_failed = 0; } else { - schedstat_inc(sd, alb_failed); + schedstat_inc(sd->alb_failed); } } rcu_read_unlock(); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 2ce5458bbe1d..dedc81ecbb2e 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -28,7 +28,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie c { put_prev_task(rq, prev); - schedstat_inc(rq, sched_goidle); + schedstat_inc(rq->sched_goidle); return rq->idle; } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 78955cbea31c..fc0542576a4a 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,11 +29,11 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) -# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) -# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) -# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) -# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) ((schedstat_enabled()) ? 
(var) : 0) #else /* !CONFIG_SCHEDSTATS */ static inline void @@ -45,12 +45,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} -# define schedstat_enabled() 0 -# define schedstat_inc(rq, field) do { } while (0) -# define schedstat_add(rq, field, amt) do { } while (0) -# define schedstat_set(var, val) do { } while (0) -# define schedstat_val(rq, field) 0 -#endif +#define schedstat_enabled() 0 +#define schedstat_inc(var) do { } while (0) +#define schedstat_add(var, amt) do { } while (0) +#define schedstat_set(var, val) do { } while (0) +#define schedstat_val(var) 0 +#endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) From 20e1d4863bfa7152e98f94e5bcdda3e7db41d899 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jun 2016 12:43:25 -0500 Subject: [PATCH 159/538] sched/debug: Rename 'schedstat_val()' -> 'schedstat_val_or_zero()' The schedstat_val() macro's behavior is kind of surprising: when schedstat is runtime disabled, it returns zero. Rename it to schedstat_val_or_zero(). There's also a need for a similar macro which doesn't have the 'if (schedstat_enable())' check, to avoid doing the check twice. Create a new 'schedstat_val()' macro for that. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3bb1d2367d041fee333b0dde17171e709395b675.1466184592.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 4 ++-- kernel/sched/stats.h | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 92fa53457b72..63ffcaa5d57c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -429,9 +429,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) p->prio); SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(schedstat_val(p->se.statistics.wait_sum)), + SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val(p->se.statistics.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index fc0542576a4a..34659a853505 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -33,7 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) -#define schedstat_val(var) ((schedstat_enabled()) ? (var) : 0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) #else /* !CONFIG_SCHEDSTATS */ static inline void @@ -50,6 +51,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) #define schedstat_add(var, amt) do { } while (0) #define schedstat_set(var, val) do { } while (0) #define schedstat_val(var) 0 +#define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SCHED_INFO From 4fa8d299b43a91f871f6d5b00dd5ab33d43bbc2c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jun 2016 12:43:26 -0500 Subject: [PATCH 160/538] sched/debug: Remove several CONFIG_SCHEDSTATS guards Clean up the sched code by removing several of the CONFIG_SCHEDSTATS guards, using schedstat_*() macros where needed. Code size: !CONFIG_SCHEDSTATS defconfig: text data bss dec hex filename 10209818 4368184 1105920 15683922 ef5152 vmlinux.before.nostats 10209818 4368184 1105920 15683922 ef5152 vmlinux.after.nostats CONFIG_SCHEDSTATS defconfig: text data bss dec hex filename 10214210 4370040 1105920 15690170 ef69ba vmlinux.before.stats 10214210 4370680 1105920 15690810 ef6c3a vmlinux.after.stats Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e51e0ebe5af95ac295de720dd252e7c0d2142e4a.1466184592.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 33 ++++------ kernel/sched/debug.c | 99 +++++++++++++++-------------- kernel/sched/fair.c | 148 ++++++++++++++++++++----------------------- 3 files changed, 136 insertions(+), 144 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 850677049d4a..860070fba814 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1629,13 +1629,15 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { -#ifdef CONFIG_SCHEDSTATS - struct rq *rq = this_rq(); + struct rq *rq; -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); + if (!schedstat_enabled()) + return; + + rq = this_rq(); - if (cpu == this_cpu) { +#ifdef CONFIG_SMP + if (cpu == rq->cpu) { schedstat_inc(rq->ttwu_local); schedstat_inc(p->se.statistics.nr_wakeups_local); } else { @@ -1643,7 +1645,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); - for_each_domain(this_cpu, sd) { + for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd->ttwu_wake_remote); break; @@ -1654,7 +1656,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_MIGRATED) schedstat_inc(p->se.statistics.nr_wakeups_migrate); - #endif /* CONFIG_SMP */ schedstat_inc(rq->ttwu_count); @@ -1662,8 +1663,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_SYNC) schedstat_inc(p->se.statistics.nr_wakeups_sync); - -#endif /* CONFIG_SCHEDSTATS */ } static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) @@ -2084,8 +2083,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu, wake_flags); stat: - if (schedstat_enabled()) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2134,8 +2132,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0, cookie); - if (schedstat_enabled()) - 
ttwu_stat(p, smp_processor_id(), 0); + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -7675,12 +7672,10 @@ void normalize_rt_tasks(void) if (p->flags & PF_KTHREAD) continue; - p->se.exec_start = 0; -#ifdef CONFIG_SCHEDSTATS - p->se.statistics.wait_start = 0; - p->se.statistics.sleep_start = 0; - p->se.statistics.block_start = 0; -#endif + p->se.exec_start = 0; + schedstat_set(p->se.statistics.wait_start, 0); + schedstat_set(p->se.statistics.sleep_start, 0); + schedstat_set(p->se.statistics.block_start, 0); if (!dl_task(p) && !rt_task(p)) { /* diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 63ffcaa5d57c..13935886a471 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define P_SCHEDSTAT(F) \ + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN_SCHEDSTAT(F) \ + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) if (!se) return; @@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); -#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + PN_SCHEDSTAT(se->statistics.wait_start); + PN_SCHEDSTAT(se->statistics.sleep_start); + PN_SCHEDSTAT(se->statistics.block_start); + PN_SCHEDSTAT(se->statistics.sleep_max); + PN_SCHEDSTAT(se->statistics.block_max); + PN_SCHEDSTAT(se->statistics.exec_max); + PN_SCHEDSTAT(se->statistics.slice_max); + PN_SCHEDSTAT(se->statistics.wait_max); + PN_SCHEDSTAT(se->statistics.wait_sum); + P_SCHEDSTAT(se->statistics.wait_count); } -#endif P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); #endif + +#undef PN_SCHEDSTAT #undef PN +#undef P_SCHEDSTAT #undef P } #endif @@ -626,9 +631,7 @@ do { \ #undef P64 #endif -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); - +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { P(yld_count); P(sched_count); @@ -636,9 +639,8 @@ do { \ P(ttwu_count); P(ttwu_local); } - #undef P -#endif + spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); @@ -868,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) #define P(F) \ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define P_SCHEDSTAT(F) \ + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) #define __PN(F) \ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) #define PN(F) \ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define PN_SCHEDSTAT(F) \ + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) PN(se.exec_start); PN(se.vruntime); @@ -881,37 +887,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.nr_migrations); -#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { u64 
avg_atom, avg_per_cpu; - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); + PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); + PN_SCHEDSTAT(se.statistics.wait_start); + PN_SCHEDSTAT(se.statistics.sleep_start); + PN_SCHEDSTAT(se.statistics.block_start); + PN_SCHEDSTAT(se.statistics.sleep_max); + PN_SCHEDSTAT(se.statistics.block_max); + PN_SCHEDSTAT(se.statistics.exec_max); + PN_SCHEDSTAT(se.statistics.slice_max); + PN_SCHEDSTAT(se.statistics.wait_max); + PN_SCHEDSTAT(se.statistics.wait_sum); + P_SCHEDSTAT(se.statistics.wait_count); + PN_SCHEDSTAT(se.statistics.iowait_sum); + P_SCHEDSTAT(se.statistics.iowait_count); + P_SCHEDSTAT(se.statistics.nr_migrations_cold); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); + P_SCHEDSTAT(se.statistics.nr_forced_migrations); + P_SCHEDSTAT(se.statistics.nr_wakeups); + P_SCHEDSTAT(se.statistics.nr_wakeups_sync); + P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); + P_SCHEDSTAT(se.statistics.nr_wakeups_local); + P_SCHEDSTAT(se.statistics.nr_wakeups_remote); + P_SCHEDSTAT(se.statistics.nr_wakeups_affine); + P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); + P_SCHEDSTAT(se.statistics.nr_wakeups_passive); + P_SCHEDSTAT(se.statistics.nr_wakeups_idle); avg_atom = p->se.sum_exec_runtime; if (nr_switches) @@ -930,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) __PN(avg_atom); __PN(avg_per_cpu); } -#endif + __P(nr_switches); SEQ_printf(m, "%-45s:%21Ld\n", "nr_voluntary_switches", (long long)p->nvcsw); @@ -947,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #endif P(policy); P(prio); +#undef PN_SCHEDSTAT #undef PN #undef __PN +#undef P_SCHEDSTAT #undef P #undef __P diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 157d741cec34..a6820b3771e2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -821,26 +821,34 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } -#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 wait_start = rq_clock(rq_of(cfs_rq)); + u64 wait_start, prev_wait_start; + + if (!schedstat_enabled()) + return; + + wait_start = rq_clock(rq_of(cfs_rq)); + prev_wait_start = schedstat_val(se->statistics.wait_start); if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > se->statistics.wait_start)) - wait_start -= se->statistics.wait_start; + likely(wait_start > prev_wait_start)) + wait_start -= 
prev_wait_start; - se->statistics.wait_start = wait_start; + schedstat_set(se->statistics.wait_start, wait_start); } -static void +static inline void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *p; u64 delta; - delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + if (!schedstat_enabled()) + return; + + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); if (entity_is_task(se)) { p = task_of(se); @@ -850,59 +858,67 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * time stamp can be adjusted to accumulate wait time * prior to migration. */ - se->statistics.wait_start = delta; + schedstat_set(se->statistics.wait_start, delta); return; } trace_sched_stat_wait(p, delta); } - se->statistics.wait_max = max(se->statistics.wait_max, delta); - se->statistics.wait_count++; - se->statistics.wait_sum += delta; - se->statistics.wait_start = 0; + schedstat_set(se->statistics.wait_max, + max(schedstat_val(se->statistics.wait_max), delta)); + schedstat_inc(se->statistics.wait_count); + schedstat_add(se->statistics.wait_sum, delta); + schedstat_set(se->statistics.wait_start, 0); } -static void +static inline void update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *tsk = NULL; + u64 sleep_start, block_start; + + if (!schedstat_enabled()) + return; + + sleep_start = schedstat_val(se->statistics.sleep_start); + block_start = schedstat_val(se->statistics.block_start); if (entity_is_task(se)) tsk = task_of(se); - if (se->statistics.sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; + if (sleep_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->statistics.sleep_max)) - se->statistics.sleep_max = delta; + if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) + schedstat_set(se->statistics.sleep_max, delta); - se->statistics.sleep_start = 0; - se->statistics.sum_sleep_runtime += delta; + schedstat_set(se->statistics.sleep_start, 0); + schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); trace_sched_stat_sleep(tsk, delta); } } - if (se->statistics.block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; + if (block_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->statistics.block_max)) - se->statistics.block_max = delta; + if (unlikely(delta > schedstat_val(se->statistics.block_max))) + schedstat_set(se->statistics.block_max, delta); - se->statistics.block_start = 0; - se->statistics.sum_sleep_runtime += delta; + schedstat_set(se->statistics.block_start, 0); + schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { if (tsk->in_iowait) { - se->statistics.iowait_sum += delta; - se->statistics.iowait_count++; + schedstat_add(se->statistics.iowait_sum, delta); + schedstat_inc(se->statistics.iowait_count); trace_sched_stat_iowait(tsk, delta); } @@ -929,6 +945,9 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + if (!schedstat_enabled()) + return; + /* * Are we enqueueing a waiting task? 
(for current tasks * a dequeue/enqueue event is a NOP) @@ -943,6 +962,10 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + + if (!schedstat_enabled()) + return; + /* * Mark the end of the wait period if dequeueing a * waiting task: @@ -950,45 +973,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + struct task_struct *tsk = task_of(se); - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } + if (tsk->state & TASK_INTERRUPTIBLE) + schedstat_set(se->statistics.sleep_start, + rq_clock(rq_of(cfs_rq))); + if (tsk->state & TASK_UNINTERRUPTIBLE) + schedstat_set(se->statistics.block_start, + rq_clock(rq_of(cfs_rq))); } - -} -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ } -static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ -} - -static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ -} -#endif - /* * We are picking a new current task - update its stats: */ @@ -3396,10 +3392,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) place_entity(cfs_rq, se, 0); check_schedstat_required(); - if (schedstat_enabled()) { - update_stats_enqueue(cfs_rq, se, flags); - check_spread(cfs_rq, se); - } + update_stats_enqueue(cfs_rq, se, flags); + check_spread(cfs_rq, se); if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3466,8 +3460,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - if (schedstat_enabled()) - update_stats_dequeue(cfs_rq, se, flags); + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3541,25 +3534,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - if (schedstat_enabled()) - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; -#ifdef CONFIG_SCHEDSTATS + /* * Track our maximum slice length, if the CPU's load is at * least twice that of our own weight (i.e. 
dont track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->statistics.slice_max = max(se->statistics.slice_max, - se->sum_exec_runtime - se->prev_sum_exec_runtime); + schedstat_set(se->statistics.slice_max, + max((u64)schedstat_val(se->statistics.slice_max), + se->sum_exec_runtime - se->prev_sum_exec_runtime)); } -#endif + se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -3638,13 +3631,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - if (schedstat_enabled()) { - check_spread(cfs_rq, prev); - if (prev->on_rq) - update_stats_wait_start(cfs_rq, prev); - } + check_spread(cfs_rq, prev); if (prev->on_rq) { + update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ From 3260ab5616b4cd049c79c342617525456a2391b2 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Aug 2016 08:48:42 +0200 Subject: [PATCH 161/538] xen: Sync xen header Import the actual version of include/xen/interface/sched.h from Xen. Signed-off-by: Juergen Gross Signed-off-by: Peter Zijlstra (Intel) Acked-by: David Vrabel Cc: Douglas_Warzecha@dell.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: boris.ostrovsky@oracle.com Cc: chrisw@sous-sol.org Cc: hpa@zytor.com Cc: jdelvare@suse.com Cc: jeremy@goop.org Cc: linux@roeck-us.net Cc: pali.rohar@gmail.com Cc: rusty@rustcorp.com.au Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1472453327-19050-2-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- include/xen/interface/sched.h | 100 ++++++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 18 deletions(-) diff --git a/include/xen/interface/sched.h b/include/xen/interface/sched.h index f18490985fc8..a4c4d735d781 100644 --- a/include/xen/interface/sched.h +++ b/include/xen/interface/sched.h @@ -3,6 +3,24 @@ * * Scheduler state interactions * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2005, Keir Fraser */ @@ -11,19 +29,31 @@ #include +/* + * Guest Scheduler Operations + * + * The SCHEDOP interface provides mechanisms for a guest to interact + * with the scheduler, including yield, blocking and shutting itself + * down. 
+ */ + /* * The prototype for this hypercall is: - * long sched_op_new(int cmd, void *arg) + * long HYPERVISOR_sched_op(enum sched_op cmd, void *arg, ...) + * * @cmd == SCHEDOP_??? (scheduler operation). * @arg == Operation-specific extra argument(s), as described below. + * ... == Additional Operation-specific extra arguments, described below. * - * **NOTE**: - * Versions of Xen prior to 3.0.2 provide only the following legacy version + * Versions of Xen prior to 3.0.2 provided only the following legacy version * of this hypercall, supporting only the commands yield, block and shutdown: * long sched_op(int cmd, unsigned long arg) * @cmd == SCHEDOP_??? (scheduler operation). * @arg == 0 (SCHEDOP_yield and SCHEDOP_block) * == SHUTDOWN_* code (SCHEDOP_shutdown) + * + * This legacy version is available to new guests as: + * long HYPERVISOR_sched_op_compat(enum sched_op cmd, unsigned long arg) */ /* @@ -44,12 +74,17 @@ /* * Halt execution of this domain (all VCPUs) and notify the system controller. * @arg == pointer to sched_shutdown structure. + * + * If the sched_shutdown_t reason is SHUTDOWN_suspend then + * x86 PV guests must also set RDX (EDX for 32-bit guests) to the MFN + * of the guest's start info page. RDX/EDX is the third hypercall + * argument. + * + * In addition, which reason is SHUTDOWN_suspend this hypercall + * returns 1 if suspend was cancelled or the domain was merely + * checkpointed, and 0 if it is resuming in a new domain. */ #define SCHEDOP_shutdown 2 -struct sched_shutdown { - unsigned int reason; /* SHUTDOWN_* */ -}; -DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown); /* * Poll a set of event-channel ports. Return when one or more are pending. An @@ -57,12 +92,6 @@ DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown); * @arg == pointer to sched_poll structure. */ #define SCHEDOP_poll 3 -struct sched_poll { - GUEST_HANDLE(evtchn_port_t) ports; - unsigned int nr_ports; - uint64_t timeout; -}; -DEFINE_GUEST_HANDLE_STRUCT(sched_poll); /* * Declare a shutdown for another domain. The main use of this function is @@ -71,15 +100,11 @@ DEFINE_GUEST_HANDLE_STRUCT(sched_poll); * @arg == pointer to sched_remote_shutdown structure. */ #define SCHEDOP_remote_shutdown 4 -struct sched_remote_shutdown { - domid_t domain_id; /* Remote domain ID */ - unsigned int reason; /* SHUTDOWN_xxx reason */ -}; /* * Latch a shutdown code, so that when the domain later shuts down it * reports this code to the control tools. - * @arg == as for SCHEDOP_shutdown. + * @arg == sched_shutdown, as for SCHEDOP_shutdown. */ #define SCHEDOP_shutdown_code 5 @@ -92,10 +117,47 @@ struct sched_remote_shutdown { * With id != 0 and timeout != 0, poke watchdog timer and set new timeout. */ #define SCHEDOP_watchdog 6 + +/* + * Override the current vcpu affinity by pinning it to one physical cpu or + * undo this override restoring the previous affinity. + * @arg == pointer to sched_pin_override structure. + * + * A negative pcpu value will undo a previous pin override and restore the + * previous cpu affinity. + * This call is allowed for the hardware domain only and requires the cpu + * to be part of the domain's cpupool. 
+ */ +#define SCHEDOP_pin_override 7 + +struct sched_shutdown { + unsigned int reason; /* SHUTDOWN_* => shutdown reason */ +}; +DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown); + +struct sched_poll { + GUEST_HANDLE(evtchn_port_t) ports; + unsigned int nr_ports; + uint64_t timeout; +}; +DEFINE_GUEST_HANDLE_STRUCT(sched_poll); + +struct sched_remote_shutdown { + domid_t domain_id; /* Remote domain ID */ + unsigned int reason; /* SHUTDOWN_* => shutdown reason */ +}; +DEFINE_GUEST_HANDLE_STRUCT(sched_remote_shutdown); + struct sched_watchdog { uint32_t id; /* watchdog ID */ uint32_t timeout; /* timeout */ }; +DEFINE_GUEST_HANDLE_STRUCT(sched_watchdog); + +struct sched_pin_override { + int32_t pcpu; +}; +DEFINE_GUEST_HANDLE_STRUCT(sched_pin_override); /* * Reason codes for SCHEDOP_shutdown. These may be interpreted by control @@ -107,6 +169,7 @@ struct sched_watchdog { #define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */ #define SHUTDOWN_crash 3 /* Tell controller we've crashed. */ #define SHUTDOWN_watchdog 4 /* Restart because watchdog time expired. */ + /* * Domain asked to perform 'soft reset' for it. The expected behavior is to * reset internal Xen state for the domain returning it to the point where it @@ -115,5 +178,6 @@ struct sched_watchdog { * interfaces again. */ #define SHUTDOWN_soft_reset 5 +#define SHUTDOWN_MAX 5 /* Maximum valid shutdown reason. */ #endif /* __XEN_PUBLIC_SCHED_H__ */ From 47ae4b05d0fa2f2a998ebaf34d2dcbffca56a9db Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Aug 2016 08:48:43 +0200 Subject: [PATCH 162/538] virt, sched: Add generic vCPU pinning support Add generic virtualization support for pinning the current vCPU to a specified physical CPU. As this operation isn't performance critical (a very limited set of operations like BIOS calls and SMIs is expected to need this) just add a hypervisor specific indirection. 
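A minimal usage sketch of the new indirection; do_firmware_call() is a hypothetical CPU-0-only operation and not part of this patch, the real users are wired up later in this series via smp_call_on_cpu():

	hypervisor_pin_vcpu(0);		/* pin the current vCPU to pCPU 0 */
	ret = do_firmware_call();	/* hypothetical firmware access */
	hypervisor_pin_vcpu(-1);	/* a negative value removes the override */

On bare metal both calls are no-ops; if the hypervisor provides no pin_vcpu callback, a warning is emitted once.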
Signed-off-by: Juergen Gross Signed-off-by: Peter Zijlstra (Intel) Cc: Douglas_Warzecha@dell.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: boris.ostrovsky@oracle.com Cc: chrisw@sous-sol.org Cc: david.vrabel@citrix.com Cc: hpa@zytor.com Cc: jdelvare@suse.com Cc: jeremy@goop.org Cc: linux@roeck-us.net Cc: pali.rohar@gmail.com Cc: rusty@rustcorp.com.au Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1472453327-19050-3-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- MAINTAINERS | 1 + arch/x86/include/asm/hypervisor.h | 4 ++++ arch/x86/kernel/cpu/hypervisor.c | 11 +++++++++++ include/linux/hypervisor.h | 17 +++++++++++++++++ kernel/smp.c | 1 + kernel/up.c | 1 + 6 files changed, 35 insertions(+) create mode 100644 include/linux/hypervisor.h diff --git a/MAINTAINERS b/MAINTAINERS index db814a89599c..95151aab3445 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8845,6 +8845,7 @@ S: Supported F: Documentation/virtual/paravirt_ops.txt F: arch/*/kernel/paravirt* F: arch/*/include/asm/paravirt.h +F: include/linux/hypervisor.h PARIDE DRIVERS FOR PARALLEL PORT IDE DEVICES M: Tim Waugh diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 055ea9941dd5..67942b6ad4b7 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -43,6 +43,9 @@ struct hypervisor_x86 { /* X2APIC detection (run once per boot) */ bool (*x2apic_available)(void); + + /* pin current vcpu to specified physical cpu (run rarely) */ + void (*pin_vcpu)(int); }; extern const struct hypervisor_x86 *x86_hyper; @@ -56,6 +59,7 @@ extern const struct hypervisor_x86 x86_hyper_kvm; extern void init_hypervisor(struct cpuinfo_x86 *c); extern void init_hypervisor_platform(void); extern bool hypervisor_x2apic_available(void); +extern void hypervisor_pin_vcpu(int cpu); #else static inline void init_hypervisor(struct cpuinfo_x86 *c) { } static inline void init_hypervisor_platform(void) { } diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 27e46658ebe3..35691a6b0d32 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -86,3 +86,14 @@ bool __init hypervisor_x2apic_available(void) x86_hyper->x2apic_available && x86_hyper->x2apic_available(); } + +void hypervisor_pin_vcpu(int cpu) +{ + if (!x86_hyper) + return; + + if (x86_hyper->pin_vcpu) + x86_hyper->pin_vcpu(cpu); + else + WARN_ONCE(1, "vcpu pinning requested but not supported!\n"); +} diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h new file mode 100644 index 000000000000..3fa5ef2b3759 --- /dev/null +++ b/include/linux/hypervisor.h @@ -0,0 +1,17 @@ +#ifndef __LINUX_HYPEVISOR_H +#define __LINUX_HYPEVISOR_H + +/* + * Generic Hypervisor support + * Juergen Gross + */ + +#ifdef CONFIG_HYPERVISOR_GUEST +#include +#else +static inline void hypervisor_pin_vcpu(int cpu) +{ +} +#endif + +#endif /* __LINUX_HYPEVISOR_H */ diff --git a/kernel/smp.c b/kernel/smp.c index 3aa642d39c03..4274ca5f3bbc 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "smpboot.h" diff --git a/kernel/up.c b/kernel/up.c index 1760bf3d1463..3ccee2bd13ba 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -6,6 +6,7 @@ #include #include #include +#include int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) From df8ce9d78a4e7fbe7ddfd8ccee3ecaaa0013e883 Mon Sep 17 00:00:00 2001 From: 
Juergen Gross Date: Mon, 29 Aug 2016 08:48:44 +0200 Subject: [PATCH 163/538] smp: Add function to execute a function synchronously on a CPU On some hardware models (e.g. Dell Studio 1555 laptop) some hardware related functions (e.g. SMIs) are to be executed on physical CPU 0 only. Instead of open coding such a functionality multiple times in the kernel add a service function for this purpose. This will enable the possibility to take special measures in virtualized environments like Xen, too. Signed-off-by: Juergen Gross Signed-off-by: Peter Zijlstra (Intel) Cc: Douglas_Warzecha@dell.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: boris.ostrovsky@oracle.com Cc: chrisw@sous-sol.org Cc: david.vrabel@citrix.com Cc: hpa@zytor.com Cc: jdelvare@suse.com Cc: jeremy@goop.org Cc: linux@roeck-us.net Cc: pali.rohar@gmail.com Cc: rusty@rustcorp.com.au Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1472453327-19050-4-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- include/linux/smp.h | 3 +++ kernel/smp.c | 50 +++++++++++++++++++++++++++++++++++++++++++++ kernel/up.c | 17 +++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/include/linux/smp.h b/include/linux/smp.h index eccae4690f41..8e0cb7a0f836 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -196,6 +196,9 @@ extern void arch_enable_nonboot_cpus_end(void); void smp_setup_processor_id(void); +int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, + bool phys); + /* SMP core functions */ int smpcfd_prepare_cpu(unsigned int cpu); int smpcfd_dead_cpu(unsigned int cpu); diff --git a/kernel/smp.c b/kernel/smp.c index 4274ca5f3bbc..f4f6137941cb 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -725,3 +725,53 @@ void wake_up_all_idle_cpus(void) preempt_enable(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); + +/** + * smp_call_on_cpu - Call a function on a specific cpu + * + * Used to call a function on a specific cpu and wait for it to return. + * Optionally make sure the call is done on a specified physical cpu via vcpu + * pinning in order to support virtualized environments. + */ +struct smp_call_on_cpu_struct { + struct work_struct work; + struct completion done; + int (*func)(void *); + void *data; + int ret; + int cpu; +}; + +static void smp_call_on_cpu_callback(struct work_struct *work) +{ + struct smp_call_on_cpu_struct *sscs; + + sscs = container_of(work, struct smp_call_on_cpu_struct, work); + if (sscs->cpu >= 0) + hypervisor_pin_vcpu(sscs->cpu); + sscs->ret = sscs->func(sscs->data); + if (sscs->cpu >= 0) + hypervisor_pin_vcpu(-1); + + complete(&sscs->done); +} + +int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) +{ + struct smp_call_on_cpu_struct sscs = { + .work = __WORK_INITIALIZER(sscs.work, smp_call_on_cpu_callback), + .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), + .func = func, + .data = par, + .cpu = phys ? 
cpu : -1, + }; + + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; + + queue_work_on(cpu, system_wq, &sscs.work); + wait_for_completion(&sscs.done); + + return sscs.ret; +} +EXPORT_SYMBOL_GPL(smp_call_on_cpu); diff --git a/kernel/up.c b/kernel/up.c index 3ccee2bd13ba..ee81ac9af4ca 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -83,3 +83,20 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), preempt_enable(); } EXPORT_SYMBOL(on_each_cpu_cond); + +int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) +{ + int ret; + + if (cpu != 0) + return -ENXIO; + + if (phys) + hypervisor_pin_vcpu(0); + ret = func(par); + if (phys) + hypervisor_pin_vcpu(-1); + + return ret; +} +EXPORT_SYMBOL_GPL(smp_call_on_cpu); From 99bc67536d04bcb9133546284acfb15d033b8bfe Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Aug 2016 08:48:45 +0200 Subject: [PATCH 164/538] xen: Add xen_pin_vcpu() to support calling functions on a dedicated pCPU Some hardware models (e.g. Dell Studio 1555 laptops) require calls to the firmware to be issued on CPU 0 only. As Dom0 might have to use these calls, add xen_pin_vcpu() to achieve this functionality. In case either the domain doesn't have the privilege to make the related hypercall or the hypervisor isn't supporting it, issue a warning once and disable further pinning attempts. Signed-off-by: Juergen Gross Signed-off-by: Peter Zijlstra (Intel) Acked-by: David Vrabel Cc: Douglas_Warzecha@dell.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: boris.ostrovsky@oracle.com Cc: chrisw@sous-sol.org Cc: hpa@zytor.com Cc: jdelvare@suse.com Cc: jeremy@goop.org Cc: linux@roeck-us.net Cc: pali.rohar@gmail.com Cc: rusty@rustcorp.com.au Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1472453327-19050-5-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b86ebb1a9a7f..bc9aaba01a22 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1925,6 +1925,45 @@ static void xen_set_cpu_features(struct cpuinfo_x86 *c) } } +static void xen_pin_vcpu(int cpu) +{ + static bool disable_pinning; + struct sched_pin_override pin_override; + int ret; + + if (disable_pinning) + return; + + pin_override.pcpu = cpu; + ret = HYPERVISOR_sched_op(SCHEDOP_pin_override, &pin_override); + + /* Ignore errors when removing override. */ + if (cpu < 0) + return; + + switch (ret) { + case -ENOSYS: + pr_warn("Unable to pin on physical cpu %d. In case of problems consider vcpu pinning.\n", + cpu); + disable_pinning = true; + break; + case -EPERM: + WARN(1, "Trying to pin vcpu without having privilege to do so\n"); + disable_pinning = true; + break; + case -EINVAL: + case -EBUSY: + pr_warn("Physical cpu %d not available for pinning. 
Check Xen cpu configuration.\n", + cpu); + break; + case 0: + break; + default: + WARN(1, "rc %d while trying to pin vcpu\n", ret); + disable_pinning = true; + } +} + const struct hypervisor_x86 x86_hyper_xen = { .name = "Xen", .detect = xen_platform, @@ -1933,6 +1972,7 @@ const struct hypervisor_x86 x86_hyper_xen = { #endif .x2apic_available = xen_x2apic_para_available, .set_cpu_features = xen_set_cpu_features, + .pin_vcpu = xen_pin_vcpu, }; EXPORT_SYMBOL(x86_hyper_xen); From e23f22b5cb9e44da24cb8494707536211adff8d1 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Aug 2016 08:48:46 +0200 Subject: [PATCH 165/538] dcdbas: Make use of smp_call_on_cpu() Use smp_call_on_cpu() to raise SMI on CPU 0. Make call secure by adding get_online_cpus() to avoid e.g. suspend resume cycles in between. Signed-off-by: Juergen Gross Signed-off-by: Peter Zijlstra (Intel) Cc: Douglas_Warzecha@dell.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: boris.ostrovsky@oracle.com Cc: chrisw@sous-sol.org Cc: david.vrabel@citrix.com Cc: hpa@zytor.com Cc: jdelvare@suse.com Cc: jeremy@goop.org Cc: linux@roeck-us.net Cc: pali.rohar@gmail.com Cc: rusty@rustcorp.com.au Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1472453327-19050-6-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- drivers/firmware/dcdbas.c | 51 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c index 829eec8959f2..2fe1a130189f 100644 --- a/drivers/firmware/dcdbas.c +++ b/drivers/firmware/dcdbas.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -238,33 +239,14 @@ static ssize_t host_control_on_shutdown_store(struct device *dev, return count; } -/** - * dcdbas_smi_request: generate SMI request - * - * Called with smi_data_lock. - */ -int dcdbas_smi_request(struct smi_cmd *smi_cmd) +static int raise_smi(void *par) { - cpumask_var_t old_mask; - int ret = 0; - - if (smi_cmd->magic != SMI_CMD_MAGIC) { - dev_info(&dcdbas_pdev->dev, "%s: invalid magic value\n", - __func__); - return -EBADR; - } + struct smi_cmd *smi_cmd = par; - /* SMI requires CPU 0 */ - if (!alloc_cpumask_var(&old_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(old_mask, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, cpumask_of(0)); if (smp_processor_id() != 0) { dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n", __func__); - ret = -EBUSY; - goto out; + return -EBUSY; } /* generate SMI */ @@ -280,9 +262,28 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd) : "memory" ); -out: - set_cpus_allowed_ptr(current, old_mask); - free_cpumask_var(old_mask); + return 0; +} +/** + * dcdbas_smi_request: generate SMI request + * + * Called with smi_data_lock. + */ +int dcdbas_smi_request(struct smi_cmd *smi_cmd) +{ + int ret; + + if (smi_cmd->magic != SMI_CMD_MAGIC) { + dev_info(&dcdbas_pdev->dev, "%s: invalid magic value\n", + __func__); + return -EBADR; + } + + /* SMI requires CPU 0 */ + get_online_cpus(); + ret = smp_call_on_cpu(0, raise_smi, smi_cmd, true); + put_online_cpus(); + return ret; } From 27046a3ffbb01ba715e6236c170701c84759b61d Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Aug 2016 08:48:47 +0200 Subject: [PATCH 166/538] hwmon: Use smp_call_on_cpu() for dell-smm i8k Use the smp_call_on_cpu() function to call system management mode on CPU 0. Make call secure by adding get_online_cpus() to avoid e.g. 
suspend resume cycles in between. Signed-off-by: Juergen Gross Signed-off-by: Peter Zijlstra (Intel) Cc: Douglas_Warzecha@dell.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: boris.ostrovsky@oracle.com Cc: chrisw@sous-sol.org Cc: david.vrabel@citrix.com Cc: hpa@zytor.com Cc: jdelvare@suse.com Cc: jeremy@goop.org Cc: linux@roeck-us.net Cc: pali.rohar@gmail.com Cc: rusty@rustcorp.com.au Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1472453327-19050-7-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- drivers/hwmon/dell-smm-hwmon.c | 36 +++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index acf9c0361d9f..34704b0451b4 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -21,6 +21,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -36,6 +37,7 @@ #include #include #include +#include #include @@ -134,11 +136,11 @@ static inline const char *i8k_get_dmi_data(int field) /* * Call the System Management Mode BIOS. Code provided by Jonathan Buzzard. */ -static int i8k_smm(struct smm_regs *regs) +static int i8k_smm_func(void *par) { int rc; + struct smm_regs *regs = par; int eax = regs->eax; - cpumask_var_t old_mask; #ifdef DEBUG int ebx = regs->ebx; @@ -149,16 +151,8 @@ static int i8k_smm(struct smm_regs *regs) #endif /* SMM requires CPU 0 */ - if (!alloc_cpumask_var(&old_mask, GFP_KERNEL)) - return -ENOMEM; - cpumask_copy(old_mask, ¤t->cpus_allowed); - rc = set_cpus_allowed_ptr(current, cpumask_of(0)); - if (rc) - goto out; - if (smp_processor_id() != 0) { - rc = -EBUSY; - goto out; - } + if (smp_processor_id() != 0) + return -EBUSY; #if defined(CONFIG_X86_64) asm volatile("pushq %%rax\n\t" @@ -216,10 +210,6 @@ static int i8k_smm(struct smm_regs *regs) if (rc != 0 || (regs->eax & 0xffff) == 0xffff || regs->eax == eax) rc = -EINVAL; -out: - set_cpus_allowed_ptr(current, old_mask); - free_cpumask_var(old_mask); - #ifdef DEBUG rettime = ktime_get(); delta = ktime_sub(rettime, calltime); @@ -231,6 +221,20 @@ static int i8k_smm(struct smm_regs *regs) return rc; } +/* + * Call the System Management Mode BIOS. + */ +static int i8k_smm(struct smm_regs *regs) +{ + int ret; + + get_online_cpus(); + ret = smp_call_on_cpu(0, i8k_smm_func, regs, true); + put_online_cpus(); + + return ret; +} + /* * Read the fan status. */ From 3c1627e999e45e292d5d7ea3751ed86a6383ee2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Sep 2016 15:28:36 +0200 Subject: [PATCH 167/538] cpu/hotplug: Replace anon union Some compilers are unhappy with the anon union in the state array. Replace it with a named union. While at it align the state array initializers proper and add the missing name tags. 
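For illustration, a state table entry now spells out the union member in its designated initializer; the state and callbacks below are made-up examples, not part of this patch:

	[CPUHP_EXAMPLE_PREPARE] = {
		.name			= "example:prepare",
		.startup.single		= example_prepare_cpu,	/* was .startup */
		.teardown.single	= example_dead_cpu,	/* was .teardown */
	},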
Fixes: cf392d10b69e "cpu/hotplug: Add multi instance support" Reported-by: Ingo Molnar Reported-by: Fenguang Wu Signed-off-by: Thomas Gleixner Cc: rt@linutronix.de --- kernel/cpu.c | 145 ++++++++++++++++++++++++++------------------------- 1 file changed, 74 insertions(+), 71 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index c90f839c5b86..2409ed717a3f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -73,15 +73,15 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); struct cpuhp_step { const char *name; union { - int (*startup)(unsigned int cpu); - int (*startup_multi)(unsigned int cpu, - struct hlist_node *node); - }; + int (*single)(unsigned int cpu); + int (*multi)(unsigned int cpu, + struct hlist_node *node); + } startup; union { - int (*teardown)(unsigned int cpu); - int (*teardown_multi)(unsigned int cpu, - struct hlist_node *node); - }; + int (*single)(unsigned int cpu); + int (*multi)(unsigned int cpu, + struct hlist_node *node); + } teardown; struct hlist_head list; bool skip_onerr; bool cant_stop; @@ -127,7 +127,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int ret, cnt; if (!step->multi_instance) { - cb = bringup ? step->startup : step->teardown; + cb = bringup ? step->startup.single : step->teardown.single; if (!cb) return 0; trace_cpuhp_enter(cpu, st->target, state, cb); @@ -135,7 +135,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, trace_cpuhp_exit(cpu, st->state, state, ret); return ret; } - cbm = bringup ? step->startup_multi : step->teardown_multi; + cbm = bringup ? step->startup.multi : step->teardown.multi; if (!cbm) return 0; @@ -160,7 +160,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, return 0; err: /* Rollback the instances if one failed */ - cbm = !bringup ? step->startup_multi : step->teardown_multi; + cbm = !bringup ? 
step->startup.multi : step->teardown.multi; if (!cbm) return ret; @@ -1256,40 +1256,40 @@ core_initcall(cpu_hotplug_pm_sync_init); static struct cpuhp_step cpuhp_bp_states[] = { [CPUHP_OFFLINE] = { .name = "offline", - .startup = NULL, - .teardown = NULL, + .startup.single = NULL, + .teardown.single = NULL, }, #ifdef CONFIG_SMP [CPUHP_CREATE_THREADS]= { .name = "threads:create", - .startup = smpboot_create_threads, - .teardown = NULL, + .startup.single = smpboot_create_threads, + .teardown.single = NULL, .cant_stop = true, }, [CPUHP_PERF_PREPARE] = { - .name = "perf prepare", - .startup = perf_event_init_cpu, - .teardown = perf_event_exit_cpu, + .name = "perf:prepare", + .startup.single = perf_event_init_cpu, + .teardown.single = perf_event_exit_cpu, }, [CPUHP_WORKQUEUE_PREP] = { - .name = "workqueue prepare", - .startup = workqueue_prepare_cpu, - .teardown = NULL, + .name = "workqueue:prepare", + .startup.single = workqueue_prepare_cpu, + .teardown.single = NULL, }, [CPUHP_HRTIMERS_PREPARE] = { - .name = "hrtimers prepare", - .startup = hrtimers_prepare_cpu, - .teardown = hrtimers_dead_cpu, + .name = "hrtimers:prepare", + .startup.single = hrtimers_prepare_cpu, + .teardown.single = hrtimers_dead_cpu, }, [CPUHP_SMPCFD_PREPARE] = { - .name = "SMPCFD prepare", - .startup = smpcfd_prepare_cpu, - .teardown = smpcfd_dead_cpu, + .name = "SMPCFD:prepare", + .startup.single = smpcfd_prepare_cpu, + .teardown.single = smpcfd_dead_cpu, }, [CPUHP_RCUTREE_PREP] = { - .name = "RCU-tree prepare", - .startup = rcutree_prepare_cpu, - .teardown = rcutree_dead_cpu, + .name = "RCU-tree:prepare", + .startup.single = rcutree_prepare_cpu, + .teardown.single = rcutree_dead_cpu, }, /* * Preparatory and dead notifiers. Will be replaced once the notifiers @@ -1297,8 +1297,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { */ [CPUHP_NOTIFY_PREPARE] = { .name = "notify:prepare", - .startup = notify_prepare, - .teardown = notify_dead, + .startup.single = notify_prepare, + .teardown.single = notify_dead, .skip_onerr = true, .cant_stop = true, }, @@ -1308,20 +1308,21 @@ static struct cpuhp_step cpuhp_bp_states[] = { * otherwise a RCU stall occurs. */ [CPUHP_TIMERS_DEAD] = { - .name = "timers dead", - .startup = NULL, - .teardown = timers_dead_cpu, + .name = "timers:dead", + .startup.single = NULL, + .teardown.single = timers_dead_cpu, }, /* Kicks the plugged cpu into life */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", - .startup = bringup_cpu, - .teardown = NULL, + .startup.single = bringup_cpu, + .teardown.single = NULL, .cant_stop = true, }, [CPUHP_AP_SMPCFD_DYING] = { - .startup = NULL, - .teardown = smpcfd_dying_cpu, + .name = "SMPCFD:dying", + .startup.single = NULL, + .teardown.single = smpcfd_dying_cpu, }, /* * Handled on controll processor until the plugged processor manages @@ -1329,8 +1330,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { */ [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", - .startup = NULL, - .teardown = takedown_cpu, + .startup.single = NULL, + .teardown.single = takedown_cpu, .cant_stop = true, }, #else @@ -1356,22 +1357,23 @@ static struct cpuhp_step cpuhp_ap_states[] = { /* First state is scheduler control. 
Interrupts are disabled */ [CPUHP_AP_SCHED_STARTING] = { .name = "sched:starting", - .startup = sched_cpu_starting, - .teardown = sched_cpu_dying, + .startup.single = sched_cpu_starting, + .teardown.single = sched_cpu_dying, }, [CPUHP_AP_RCUTREE_DYING] = { - .startup = NULL, - .teardown = rcutree_dying_cpu, + .name = "RCU-tree:dying", + .startup.single = NULL, + .teardown.single = rcutree_dying_cpu, }, /* - * Low level startup/teardown notifiers. Run with interrupts + * Low level startup.single/teardown notifiers. Run with interrupts * disabled. Will be removed once the notifiers are converted to * states. */ [CPUHP_AP_NOTIFY_STARTING] = { .name = "notify:starting", - .startup = notify_starting, - .teardown = notify_dying, + .startup.single = notify_starting, + .teardown.single = notify_dying, .skip_onerr = true, .cant_stop = true, }, @@ -1383,23 +1385,23 @@ static struct cpuhp_step cpuhp_ap_states[] = { /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot:threads", - .startup = smpboot_unpark_threads, - .teardown = NULL, + .startup.single = smpboot_unpark_threads, + .teardown.single = NULL, }, [CPUHP_AP_PERF_ONLINE] = { - .name = "perf online", - .startup = perf_event_init_cpu, - .teardown = perf_event_exit_cpu, + .name = "perf:online", + .startup.single = perf_event_init_cpu, + .teardown.single = perf_event_exit_cpu, }, [CPUHP_AP_WORKQUEUE_ONLINE] = { - .name = "workqueue online", - .startup = workqueue_online_cpu, - .teardown = workqueue_offline_cpu, + .name = "workqueue:online", + .startup.single = workqueue_online_cpu, + .teardown.single = workqueue_offline_cpu, }, [CPUHP_AP_RCUTREE_ONLINE] = { - .name = "RCU-tree online", - .startup = rcutree_online_cpu, - .teardown = rcutree_offline_cpu, + .name = "RCU-tree:online", + .startup.single = rcutree_online_cpu, + .teardown.single = rcutree_offline_cpu, }, /* @@ -1408,8 +1410,8 @@ static struct cpuhp_step cpuhp_ap_states[] = { */ [CPUHP_AP_NOTIFY_ONLINE] = { .name = "notify:online", - .startup = notify_online, - .teardown = notify_down_prepare, + .startup.single = notify_online, + .teardown.single = notify_down_prepare, .skip_onerr = true, }, #endif @@ -1421,16 +1423,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { /* Last state is scheduler control setting the cpu active */ [CPUHP_AP_ACTIVE] = { .name = "sched:active", - .startup = sched_cpu_activate, - .teardown = sched_cpu_deactivate, + .startup.single = sched_cpu_activate, + .teardown.single = sched_cpu_deactivate, }, #endif /* CPU is fully up and running. 
*/ [CPUHP_ONLINE] = { .name = "online", - .startup = NULL, - .teardown = NULL, + .startup.single = NULL, + .teardown.single = NULL, }, }; @@ -1453,8 +1455,8 @@ static void cpuhp_store_callbacks(enum cpuhp_state state, mutex_lock(&cpuhp_state_mutex); sp = cpuhp_get_step(state); - sp->startup = startup; - sp->teardown = teardown; + sp->startup.single = startup; + sp->teardown.single = teardown; sp->name = name; sp->multi_instance = multi_instance; INIT_HLIST_HEAD(&sp->list); @@ -1463,7 +1465,7 @@ static void cpuhp_store_callbacks(enum cpuhp_state state, static void *cpuhp_get_teardown_cb(enum cpuhp_state state) { - return cpuhp_get_step(state)->teardown; + return cpuhp_get_step(state)->teardown.single; } /* @@ -1476,7 +1478,8 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, struct cpuhp_step *sp = cpuhp_get_step(state); int ret; - if ((bringup && !sp->startup) || (!bringup && !sp->teardown)) + if ((bringup && !sp->startup.single) || + (!bringup && !sp->teardown.single)) return 0; /* * The non AP bound callbacks can fail on bringup. On teardown @@ -1554,7 +1557,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, get_online_cpus(); - if (!invoke || !sp->startup_multi) + if (!invoke || !sp->startup.multi) goto add_node; /* @@ -1570,7 +1573,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, ret = cpuhp_issue_call(cpu, state, true, node); if (ret) { - if (sp->teardown_multi) + if (sp->teardown.multi) cpuhp_rollback_install(cpu, state, node); goto err; } From 545d5d657b720e9c4dc773265bb7e9d88e34b269 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 31 May 2016 13:56:48 +0100 Subject: [PATCH 168/538] genirq: Update stale comment for __irq_domain_add Commit 1bf4ddc46c5d ("irqdomain: Introduce irq_domain_create_{linear, tree}") introduced the use of fwnode_handle to identify the interrupt controller when calling __irq_domain_add but missed updating the kernel doc parameters for the function. Update this comment. While we are touching this code, also consolidate the declaration and assignment of of_node. 
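For reference, a caller identifies its interrupt controller by fwnode when creating a domain; a sketch with placeholder size, ops and host data:

	domain = irq_domain_create_linear(fwnode, 64, &example_domain_ops, priv);
	if (!domain)
		return -ENOMEM;

__irq_domain_add() then derives the of_node from the fwnode internally, as the hunk below shows.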
Signed-off-by: Punit Agrawal Acked-by: Marc Zygnier Link: http://lkml.kernel.org/r/1464699409-23113-1-git-send-email-punit.agrawal@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f10cffe8aefb..8c0a0ae43521 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); /** * __irq_domain_add() - Allocate a new irq_domain data structure - * @of_node: optional device-tree node of the interrupt controller + * @fwnode: firmware node for the interrupt controller * @size: Size of linear map; 0 for radix mapping only * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no @@ -96,10 +96,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, const struct irq_domain_ops *ops, void *host_data) { + struct device_node *of_node = to_of_node(fwnode); struct irq_domain *domain; - struct device_node *of_node; - - of_node = to_of_node(fwnode); domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); From e8b61b3f2c5d3ee7804766621c91f38737d38105 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 1 Jun 2016 10:43:29 +0200 Subject: [PATCH 169/538] futex: Add some more function commentry Add some more comments and reformat existing ones to kernel doc style. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Darren Hart Link: http://lkml.kernel.org/r/1464770609-30168-1-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 46cb3a301bc1..2c4be467fecd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -381,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) #endif } -/* - * We hash on the keys returned from get_futex_key (see below). +/** + * hash_futex - Return the hash bucket in the global hash + * @key: Pointer to the futex key for which the hash is calculated + * + * We hash on the keys returned from get_futex_key (see below) and return the + * corresponding hash bucket in the global hash. */ static struct futex_hash_bucket *hash_futex(union futex_key *key) { @@ -392,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) return &futex_queues[hash & (futex_hashsize - 1)]; } -/* + +/** + * match_futex - Check whether two futex keys are equal + * @key1: Pointer to key1 + * @key2: Pointer to key2 + * * Return 1 if two futex_keys are equal, 0 otherwise. */ static inline int match_futex(union futex_key *key1, union futex_key *key2) From 5210d393ef84e5d2a4854671a9af2d97fd1b8dd4 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Fri, 2 Sep 2016 20:49:12 +0800 Subject: [PATCH 170/538] netfilter: nf_tables_trace: fix endiness when dump chain policy NFTA_TRACE_POLICY attribute is big endian, but we forget to call htonl to convert it. Fortunately, this attribute is parsed as big endian in libnftnl. 
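The essence of the fix, condensed (the local variable is only for illustration): nla_put_be32() expects a big-endian (__be32) value, so the host-order policy needs an explicit conversion:

	u32 policy = info->basechain->policy;	/* host byte order */

	if (nla_put_be32(skb, NFTA_TRACE_POLICY, htonl(policy)))
		goto nla_put_failure;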
Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 39eb1cc62e91..fa24a5b398b1 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -237,7 +237,7 @@ void nft_trace_notify(struct nft_traceinfo *info) break; case NFT_TRACETYPE_POLICY: if (nla_put_be32(skb, NFTA_TRACE_POLICY, - info->basechain->policy)) + htonl(info->basechain->policy))) goto nla_put_failure; break; } From 00b992deaa08495ab958da5950c9ebbba27d0ddc Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 19 Jul 2016 15:54:08 +0600 Subject: [PATCH 171/538] genirq: No need to mask non trigger mode flags before __irq_set_trigger() Some callers of __irq_set_trigger() mask all flags except the trigger mode flags. This is unnecessary, as __irq_set_trigger() already does this before using the flags. [ tglx: Moved the flag mask and adjusted comment. Removed the hunk in enable_percpu_irq() as it is required there ] Signed-off-by: Alexander Kuleshov Link: http://lkml.kernel.org/r/20160719095408.13778-1-kuleshovmail@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 1 - kernel/irq/manage.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 93c373a8b12b..e11e8afcf209 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -76,7 +76,6 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) if (!desc) return -EINVAL; - type &= IRQ_TYPE_SENSE_MASK; ret = __irq_set_trigger(desc, type); irq_put_desc_busunlock(desc, flags); return ret; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 73a2b786b5e9..4908617dee28 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -669,8 +669,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) return 0; } - flags &= IRQ_TYPE_SENSE_MASK; - if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); @@ -678,7 +676,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) unmask = 1; } - /* caller masked out all except trigger mode flags */ + /* Mask all flags except trigger mode */ + flags &= IRQ_TYPE_SENSE_MASK; ret = chip->irq_set_type(&desc->irq_data, flags); switch (ret) { From 23299b8c64f8c50e83eb345b835077f3c29588b8 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 5 Sep 2016 18:05:13 +0300 Subject: [PATCH 172/538] dt-bindings: mvebu-odmi: Fix example typo Make the example compatible string match its definition. 
Signed-off-by: Baruch Siach Reviewed-by: Thomas Petazzoni Link: https://lkml.kernel.org/r/0743fef6fe390bc4ae7cabd15c4836bbed98f7cf.1473087913.git.baruch@tkos.co.il Signed-off-by: Jason Cooper --- .../bindings/interrupt-controller/marvell,odmi-controller.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt index 8af0a8e613ab..3f6442c7f867 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt @@ -31,7 +31,7 @@ Required properties: Example: odmi: odmi@300000 { - compatible = "marvell,ap806-odm-controller", + compatible = "marvell,ap806-odmi-controller", "marvell,odmi-controller"; interrupt-controller; msi-controller; From 677f6646533d701c8609b8bcb9304173c11cc194 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 6 Sep 2016 16:13:48 +0200 Subject: [PATCH 173/538] cpu/hotplug: Make state names consistent We should have all names in the scheme "[subsys/]facility:state]". Fix the core to comply. Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 2409ed717a3f..32eef273a0b9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1261,7 +1261,7 @@ static struct cpuhp_step cpuhp_bp_states[] = { }, #ifdef CONFIG_SMP [CPUHP_CREATE_THREADS]= { - .name = "threads:create", + .name = "threads:prepare", .startup.single = smpboot_create_threads, .teardown.single = NULL, .cant_stop = true, @@ -1282,12 +1282,12 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown.single = hrtimers_dead_cpu, }, [CPUHP_SMPCFD_PREPARE] = { - .name = "SMPCFD:prepare", + .name = "smpcfd:prepare", .startup.single = smpcfd_prepare_cpu, .teardown.single = smpcfd_dead_cpu, }, [CPUHP_RCUTREE_PREP] = { - .name = "RCU-tree:prepare", + .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, .teardown.single = rcutree_dead_cpu, }, @@ -1320,7 +1320,7 @@ static struct cpuhp_step cpuhp_bp_states[] = { .cant_stop = true, }, [CPUHP_AP_SMPCFD_DYING] = { - .name = "SMPCFD:dying", + .name = "smpcfd:dying", .startup.single = NULL, .teardown.single = smpcfd_dying_cpu, }, @@ -1361,7 +1361,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { .teardown.single = sched_cpu_dying, }, [CPUHP_AP_RCUTREE_DYING] = { - .name = "RCU-tree:dying", + .name = "RCU/tree:dying", .startup.single = NULL, .teardown.single = rcutree_dying_cpu, }, @@ -1384,7 +1384,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { }, /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { - .name = "smpboot:threads", + .name = "smpboot/threads:online", .startup.single = smpboot_unpark_threads, .teardown.single = NULL, }, @@ -1399,7 +1399,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { .teardown.single = workqueue_offline_cpu, }, [CPUHP_AP_RCUTREE_ONLINE] = { - .name = "RCU-tree:online", + .name = "RCU/tree:online", .startup.single = rcutree_online_cpu, .teardown.single = rcutree_offline_cpu, }, From d1a6cba576fc7c43e476538fe5aa72fe04bd80e1 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Tue, 6 Sep 2016 22:31:02 +0800 Subject: [PATCH 174/538] netfilter: nft_chain_route: re-route before skb is queued to userspace Imagine such situation, user add the following nft rules, and queue the packets to userspace for further check: # ip rule 
add fwmark 0x0/0x1 lookup eth0 # ip rule add fwmark 0x1/0x1 lookup eth1 # nft add table filter # nft add chain filter output {type route hook output priority 0 \;} # nft add rule filter output mark set 0x1 # nft add rule filter output queue num 0 But after we reinject the skbuff, the packet will be sent via the wrong route, i.e. in this case, the packet will be routed via the eth0 table, not the eth1 table. This is because we skip re-routing when the verdict is NF_QUEUE, even if the mark was changed. Actually, we should not touch the sk_buff if the verdict is NF_DROP or NF_STOLEN, and when re-routing fails, return NF_DROP with the error code. This is consistent with the mangle table in iptables. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_chain_route_ipv4.c | 11 +++++++---- net/ipv6/netfilter/nft_chain_route_ipv6.c | 10 +++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 2375b0a8be46..30493beb611a 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv, __be32 saddr, daddr; u_int8_t tos; const struct iphdr *iph; + int err; /* root is playing with raw sockets. */ if (skb->len < sizeof(struct iphdr) || @@ -46,15 +47,17 @@ static unsigned int nf_route_table_hook(void *priv, tos = iph->tos; ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_QUEUE) { + if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || - iph->tos != tos) - if (ip_route_me_harder(state->net, skb, RTN_UNSPEC)) - ret = NF_DROP; + iph->tos != tos) { + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } } return ret; } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 71d995ff3108..2535223ba956 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv, struct in6_addr saddr, daddr; u_int8_t hop_limit; u32 mark, flowlabel; + int err; /* malformed packet, drop it */ if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) @@ -46,13 +47,16 @@ static unsigned int nf_route_table_hook(void *priv, flowlabel = *((u32 *)ipv6_hdr(skb)); ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_QUEUE && + if (ret != NF_DROP && ret != NF_STOLEN && (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || - flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) - return ip6_route_me_harder(state->net, skb) == 0 ? ret : NF_DROP; + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { + err = ip6_route_me_harder(state->net, skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } return ret; } From ee1e714b94521b0bb27b04dfd1728ec51b19d4f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Aug 2016 14:57:16 +0200 Subject: [PATCH 175/538] cpu/hotplug: Remove CPU_STARTING and CPU_DYING notifier All users are converted to the state machine, so remove CPU_STARTING and the corresponding CPU_DYING. 
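For context, a driver that used a CPU_STARTING/CPU_DYING notifier pair now registers a state in the STARTING section of the hotplug state machine instead; a sketch with hypothetical state and callback names:

	ret = cpuhp_setup_state(CPUHP_AP_EXAMPLE_STARTING, "example:starting",
				example_starting_cpu,	/* ran on CPU_STARTING */
				example_dying_cpu);	/* ran on CPU_DYING */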
Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160818125731.27256-2-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/sparc/kernel/smp_32.c | 2 -- include/linux/cpu.h | 12 ----------- include/linux/cpuhotplug.h | 1 - kernel/cpu.c | 30 ++-------------------------- tools/testing/radix-tree/linux/cpu.h | 13 ------------ 5 files changed, 2 insertions(+), 56 deletions(-) diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index fb30e7c6a5b1..e80e6ba3d500 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -352,9 +352,7 @@ static void sparc_start_secondary(void *arg) preempt_disable(); cpu = smp_processor_id(); - /* Invoke the CPU_STARTING notifier callbacks */ notify_cpu_starting(cpu); - arch_cpu_pre_online(arg); /* Set the CPU in the cpu_online_mask */ diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 797d9c8e9a1b..6bf1992fe638 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -61,17 +61,8 @@ struct notifier_block; #define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ -#define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task, - * not handling interrupts, soon dead. - * Called on the dying cpu, interrupts - * are already disabled. Must not - * sleep, must not fail */ #define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug * lock is dropped */ -#define CPU_STARTING 0x000A /* CPU (unsigned)v soon running. - * Called on the new cpu, just before - * enabling interrupts. Must not sleep, - * must not fail */ #define CPU_BROKEN 0x000B /* CPU (unsigned)v did not die properly, * perhaps due to preemption. */ @@ -86,9 +77,6 @@ struct notifier_block; #define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN) #define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) -#define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN) -#define CPU_STARTING_FROZEN (CPU_STARTING | CPU_TASKS_FROZEN) - #ifdef CONFIG_SMP extern bool cpuhp_tasks_frozen; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index b95f7adfbf8b..9e6d10786e29 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -69,7 +69,6 @@ enum cpuhp_state { CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_SMPCFD_DYING, CPUHP_AP_X86_TBOOT_DYING, - CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_AP_ONLINE_IDLE, diff --git a/kernel/cpu.c b/kernel/cpu.c index 32eef273a0b9..d14ae4438e8e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -408,12 +408,6 @@ static int notify_online(unsigned int cpu) return 0; } -static int notify_starting(unsigned int cpu) -{ - cpu_notify(CPU_STARTING, cpu); - return 0; -} - static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -759,12 +753,6 @@ static int notify_down_prepare(unsigned int cpu) return err; } -static int notify_dying(unsigned int cpu) -{ - cpu_notify(CPU_DYING, cpu); - return 0; -} - /* Take this CPU down. 
*/ static int take_cpu_down(void *_param) { @@ -823,7 +811,7 @@ static int takedown_cpu(unsigned int cpu) BUG_ON(cpu_online(cpu)); /* - * The migration_call() CPU_DYING callback will have removed all + * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all * runnable tasks from the cpu, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. * @@ -876,7 +864,6 @@ void cpuhp_report_idle_dead(void) #define notify_down_prepare NULL #define takedown_cpu NULL #define notify_dead NULL -#define notify_dying NULL #endif #ifdef CONFIG_HOTPLUG_CPU @@ -966,10 +953,9 @@ EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ /** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU * @cpu: cpu that just started * - * This function calls the cpu_chain notifiers with CPU_STARTING. * It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ @@ -1365,18 +1351,6 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup.single = NULL, .teardown.single = rcutree_dying_cpu, }, - /* - * Low level startup.single/teardown notifiers. Run with interrupts - * disabled. Will be removed once the notifiers are converted to - * states. - */ - [CPUHP_AP_NOTIFY_STARTING] = { - .name = "notify:starting", - .startup.single = notify_starting, - .teardown.single = notify_dying, - .skip_onerr = true, - .cant_stop = true, - }, /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { diff --git a/tools/testing/radix-tree/linux/cpu.h b/tools/testing/radix-tree/linux/cpu.h index 60a40459f269..7cf412103205 100644 --- a/tools/testing/radix-tree/linux/cpu.h +++ b/tools/testing/radix-tree/linux/cpu.h @@ -7,19 +7,8 @@ #define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ -#define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task, - * not handling interrupts, soon dead. - * Called on the dying cpu, interrupts - * are already disabled. Must not - * sleep, must not fail */ #define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug * lock is dropped */ -#define CPU_STARTING 0x000A /* CPU (unsigned)v soon running. - * Called on the new cpu, just before - * enabling interrupts. Must not sleep, - * must not fail */ -#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached - * idle loop. */ #define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly, * perhaps due to preemption. */ #define CPU_TASKS_FROZEN 0x0010 @@ -30,5 +19,3 @@ #define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN) #define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) -#define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN) -#define CPU_STARTING_FROZEN (CPU_STARTING | CPU_TASKS_FROZEN) From 017c59c042d01fc84cae7a8ea475861e702c77ab Mon Sep 17 00:00:00 2001 From: Akash Goel Date: Fri, 2 Sep 2016 21:47:38 +0200 Subject: [PATCH 176/538] relay: Use per CPU constructs for the relay channel buffer pointers relay essentially needs to maintain a per CPU array of channel buffer pointers but it manually creates that array. Instead its better to use the per CPU constructs, provided by the kernel, to allocate & access the array of pointer to channel buffers. 
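The new access pattern, condensed from the diff that follows:

	chan->buf = alloc_percpu(struct rchan_buf *);	/* replaces buf[NR_CPUS] */

	*per_cpu_ptr(chan->buf, cpu) = buf;	/* store a given CPU's buffer */
	buf = *this_cpu_ptr(chan->buf);		/* fast path on the local CPU */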
Signed-off-by: Akash Goel Reviewed-by: Chris Wilson Link: http://lkml.kernel.org/r/1470909140-25919-1-git-send-email-akash.goel@intel.com Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/relay.h | 17 ++++++---- kernel/relay.c | 74 ++++++++++++++++++++++++------------------- 2 files changed, 52 insertions(+), 39 deletions(-) diff --git a/include/linux/relay.h b/include/linux/relay.h index d7c8359693c6..eb295e373b90 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -19,6 +19,7 @@ #include #include #include +#include /* * Tracks changes to rchan/rchan_buf structs @@ -63,7 +64,7 @@ struct rchan struct kref kref; /* channel refcount */ void *private_data; /* for user-defined data */ size_t last_toobig; /* tried to log event > subbuf size */ - struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */ + struct rchan_buf ** __percpu buf; /* per-cpu channel buffers */ int is_global; /* One global buffer ? */ struct list_head list; /* for channel list */ struct dentry *parent; /* parent dentry passed to open */ @@ -204,7 +205,7 @@ static inline void relay_write(struct rchan *chan, struct rchan_buf *buf; local_irq_save(flags); - buf = chan->buf[smp_processor_id()]; + buf = *this_cpu_ptr(chan->buf); if (unlikely(buf->offset + length > chan->subbuf_size)) length = relay_switch_subbuf(buf, length); memcpy(buf->data + buf->offset, data, length); @@ -230,12 +231,12 @@ static inline void __relay_write(struct rchan *chan, { struct rchan_buf *buf; - buf = chan->buf[get_cpu()]; + buf = *get_cpu_ptr(chan->buf); if (unlikely(buf->offset + length > buf->chan->subbuf_size)) length = relay_switch_subbuf(buf, length); memcpy(buf->data + buf->offset, data, length); buf->offset += length; - put_cpu(); + put_cpu_ptr(chan->buf); } /** @@ -251,17 +252,19 @@ static inline void __relay_write(struct rchan *chan, */ static inline void *relay_reserve(struct rchan *chan, size_t length) { - void *reserved; - struct rchan_buf *buf = chan->buf[smp_processor_id()]; + void *reserved = NULL; + struct rchan_buf *buf = *get_cpu_ptr(chan->buf); if (unlikely(buf->offset + length > buf->chan->subbuf_size)) { length = relay_switch_subbuf(buf, length); if (!length) - return NULL; + goto end; } reserved = buf->data + buf->offset; buf->offset += length; +end: + put_cpu_ptr(chan->buf); return reserved; } diff --git a/kernel/relay.c b/kernel/relay.c index d797502140b9..ed157378f6cb 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -214,7 +214,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) __free_page(buf->page_array[i]); relay_free_page_array(buf->page_array); } - chan->buf[buf->cpu] = NULL; + *per_cpu_ptr(chan->buf, buf->cpu) = NULL; kfree(buf->padding); kfree(buf); kref_put(&chan->kref, relay_destroy_channel); @@ -382,20 +382,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) */ void relay_reset(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; - if (chan->is_global && chan->buf[0]) { - __relay_reset(chan->buf[0], 0); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { + __relay_reset(buf, 0); return; } mutex_lock(&relay_channels_mutex); for_each_possible_cpu(i) - if (chan->buf[i]) - __relay_reset(chan->buf[i], 0); + if ((buf = *per_cpu_ptr(chan->buf, i))) + __relay_reset(buf, 0); mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_reset); @@ -440,7 +441,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) struct dentry *dentry; if (chan->is_global) - return chan->buf[0]; + 
return *per_cpu_ptr(chan->buf, 0); buf = relay_create_buf(chan); if (!buf) @@ -464,7 +465,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) __relay_reset(buf, 1); if(chan->is_global) { - chan->buf[0] = buf; + *per_cpu_ptr(chan->buf, 0) = buf; buf->cpu = 0; } @@ -526,22 +527,24 @@ static int relay_hotcpu_callback(struct notifier_block *nb, { unsigned int hotcpu = (unsigned long)hcpu; struct rchan *chan; + struct rchan_buf *buf; switch(action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&relay_channels_mutex); list_for_each_entry(chan, &relay_channels, list) { - if (chan->buf[hotcpu]) + if ((buf = *per_cpu_ptr(chan->buf, hotcpu))) continue; - chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); - if(!chan->buf[hotcpu]) { + buf = relay_open_buf(chan, hotcpu); + if (!buf) { printk(KERN_ERR "relay_hotcpu_callback: cpu %d buffer " "creation failed\n", hotcpu); mutex_unlock(&relay_channels_mutex); return notifier_from_errno(-ENOMEM); } + *per_cpu_ptr(chan->buf, hotcpu) = buf; } mutex_unlock(&relay_channels_mutex); break; @@ -583,6 +586,7 @@ struct rchan *relay_open(const char *base_filename, { unsigned int i; struct rchan *chan; + struct rchan_buf *buf; if (!(subbuf_size && n_subbufs)) return NULL; @@ -593,6 +597,7 @@ struct rchan *relay_open(const char *base_filename, if (!chan) return NULL; + chan->buf = alloc_percpu(struct rchan_buf *); chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; @@ -608,9 +613,10 @@ struct rchan *relay_open(const char *base_filename, mutex_lock(&relay_channels_mutex); for_each_online_cpu(i) { - chan->buf[i] = relay_open_buf(chan, i); - if (!chan->buf[i]) + buf = relay_open_buf(chan, i); + if (!buf) goto free_bufs; + *per_cpu_ptr(chan->buf, i) = buf; } list_add(&chan->list, &relay_channels); mutex_unlock(&relay_channels_mutex); @@ -619,8 +625,8 @@ struct rchan *relay_open(const char *base_filename, free_bufs: for_each_possible_cpu(i) { - if (chan->buf[i]) - relay_close_buf(chan->buf[i]); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_close_buf(buf); } kref_put(&chan->kref, relay_destroy_channel); @@ -666,6 +672,7 @@ int relay_late_setup_files(struct rchan *chan, unsigned int i, curr_cpu; unsigned long flags; struct dentry *dentry; + struct rchan_buf *buf; struct rchan_percpu_buf_dispatcher disp; if (!chan || !base_filename) @@ -684,10 +691,11 @@ int relay_late_setup_files(struct rchan *chan, if (chan->is_global) { err = -EINVAL; - if (!WARN_ON_ONCE(!chan->buf[0])) { - dentry = relay_create_buf_file(chan, chan->buf[0], 0); + buf = *per_cpu_ptr(chan->buf, 0); + if (!WARN_ON_ONCE(!buf)) { + dentry = relay_create_buf_file(chan, buf, 0); if (dentry && !WARN_ON_ONCE(!chan->is_global)) { - relay_set_buf_dentry(chan->buf[0], dentry); + relay_set_buf_dentry(buf, dentry); err = 0; } } @@ -702,13 +710,14 @@ int relay_late_setup_files(struct rchan *chan, * on all currently online CPUs. 
*/ for_each_online_cpu(i) { - if (unlikely(!chan->buf[i])) { + buf = *per_cpu_ptr(chan->buf, i); + if (unlikely(!buf)) { WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); err = -EINVAL; break; } - dentry = relay_create_buf_file(chan, chan->buf[i], i); + dentry = relay_create_buf_file(chan, buf, i); if (unlikely(!dentry)) { err = -EINVAL; break; @@ -716,10 +725,10 @@ int relay_late_setup_files(struct rchan *chan, if (curr_cpu == i) { local_irq_save(flags); - relay_set_buf_dentry(chan->buf[i], dentry); + relay_set_buf_dentry(buf, dentry); local_irq_restore(flags); } else { - disp.buf = chan->buf[i]; + disp.buf = buf; disp.dentry = dentry; smp_mb(); /* relay_channels_mutex must be held, so wait. */ @@ -822,11 +831,10 @@ void relay_subbufs_consumed(struct rchan *chan, if (!chan) return; - if (cpu >= NR_CPUS || !chan->buf[cpu] || - subbufs_consumed > chan->n_subbufs) + buf = *per_cpu_ptr(chan->buf, cpu); + if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) return; - buf = chan->buf[cpu]; if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) buf->subbufs_consumed = buf->subbufs_produced; else @@ -842,18 +850,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed); */ void relay_close(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; mutex_lock(&relay_channels_mutex); - if (chan->is_global && chan->buf[0]) - relay_close_buf(chan->buf[0]); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) + relay_close_buf(buf); else for_each_possible_cpu(i) - if (chan->buf[i]) - relay_close_buf(chan->buf[i]); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_close_buf(buf); if (chan->last_toobig) printk(KERN_WARNING "relay: one or more items not logged " @@ -874,20 +883,21 @@ EXPORT_SYMBOL_GPL(relay_close); */ void relay_flush(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; - if (chan->is_global && chan->buf[0]) { - relay_switch_subbuf(chan->buf[0], 0); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { + relay_switch_subbuf(buf, 0); return; } mutex_lock(&relay_channels_mutex); for_each_possible_cpu(i) - if (chan->buf[i]) - relay_switch_subbuf(chan->buf[i], 0); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_switch_subbuf(buf, 0); mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_flush); From e6d4989a9ad1ccc343f29578a461612ed80fc6c5 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 18 Aug 2016 14:57:17 +0200 Subject: [PATCH 177/538] relayfs: Convert to hotplug state machine Install the callbacks via the state machine. They are installed at run time but relay_prepare_cpu() does not need to be invoked by the boot CPU because relay_open() was not yet invoked and there are no pools that need to be created. 
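For reference, a prepare-stage callback such as relay_prepare_cpu() runs on a control CPU before the incoming CPU starts executing, and returning a negative errno aborts the bring-up. A minimal sketch of that shape, with purely illustrative names (not part of this patch):

static DEFINE_PER_CPU(void *, example_buf);

static int example_prepare_cpu(unsigned int cpu)
{
        /* May sleep: runs in process context on a control CPU */
        void *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);

        if (!buf)
                return -ENOMEM;         /* abort bringing this CPU online */
        per_cpu(example_buf, cpu) = buf;
        return 0;
}

The relay state is wired into the static cpuhp_bp_states[] table below with a NULL teardown callback, matching the old notifier which ignored CPU_DEAD and left the buffers to be flushed by the final relay_flush() call.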
Signed-off-by: Richard Weinberger Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: rt@linutronix.de Cc: Andrew Morton Link: http://lkml.kernel.org/r/20160818125731.27256-3-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + include/linux/relay.h | 6 ++++ kernel/cpu.c | 6 ++++ kernel/relay.c | 58 +++++++++----------------------------- 4 files changed, 26 insertions(+), 45 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 9e6d10786e29..4c79f40fcebc 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -21,6 +21,7 @@ enum cpuhp_state { CPUHP_PROFILE_PREPARE, CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, + CPUHP_RELAY_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_NOTIFY_PREPARE, CPUHP_TIMERS_DEAD, diff --git a/include/linux/relay.h b/include/linux/relay.h index eb295e373b90..ecbb34a382b8 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -288,5 +288,11 @@ static inline void subbuf_start_reserve(struct rchan_buf *buf, */ extern const struct file_operations relay_file_operations; +#ifdef CONFIG_RELAY +int relay_prepare_cpu(unsigned int cpu); +#else +#define relay_prepare_cpu NULL +#endif + #endif /* _LINUX_RELAY_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index d14ae4438e8e..0c0d4b2ddd1c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -1272,6 +1273,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup.single = smpcfd_prepare_cpu, .teardown.single = smpcfd_dead_cpu, }, + [CPUHP_RELAY_PREPARE] = { + .name = "relay:prepare", + .startup.single = relay_prepare_cpu, + .teardown.single = NULL, + }, [CPUHP_RCUTREE_PREP] = { .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, diff --git a/kernel/relay.c b/kernel/relay.c index ed157378f6cb..fc9b4a4af463 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -513,48 +513,25 @@ static void setup_callbacks(struct rchan *chan, chan->cb = cb; } -/** - * relay_hotcpu_callback - CPU hotplug callback - * @nb: notifier block - * @action: hotplug action to take - * @hcpu: CPU number - * - * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) - */ -static int relay_hotcpu_callback(struct notifier_block *nb, - unsigned long action, - void *hcpu) +int relay_prepare_cpu(unsigned int cpu) { - unsigned int hotcpu = (unsigned long)hcpu; struct rchan *chan; struct rchan_buf *buf; - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - mutex_lock(&relay_channels_mutex); - list_for_each_entry(chan, &relay_channels, list) { - if ((buf = *per_cpu_ptr(chan->buf, hotcpu))) - continue; - buf = relay_open_buf(chan, hotcpu); - if (!buf) { - printk(KERN_ERR - "relay_hotcpu_callback: cpu %d buffer " - "creation failed\n", hotcpu); - mutex_unlock(&relay_channels_mutex); - return notifier_from_errno(-ENOMEM); - } - *per_cpu_ptr(chan->buf, hotcpu) = buf; + mutex_lock(&relay_channels_mutex); + list_for_each_entry(chan, &relay_channels, list) { + if ((buf = *per_cpu_ptr(chan->buf, cpu))) + continue; + buf = relay_open_buf(chan, cpu); + if (!buf) { + pr_err("relay: cpu %d buffer creation failed\n", cpu); + mutex_unlock(&relay_channels_mutex); + return -ENOMEM; } - mutex_unlock(&relay_channels_mutex); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* No need to flush the cpu : will be flushed upon - * final relay_flush() call. 
*/ - break; + *per_cpu_ptr(chan->buf, cpu) = buf; } - return NOTIFY_OK; + mutex_unlock(&relay_channels_mutex); + return 0; } /** @@ -1387,12 +1364,3 @@ const struct file_operations relay_file_operations = { .splice_read = relay_file_splice_read, }; EXPORT_SYMBOL_GPL(relay_file_operations); - -static __init int relay_init(void) -{ - - hotcpu_notifier(relay_hotcpu_callback, 0); - return 0; -} - -early_initcall(relay_init); From 6731d4f12315aed5f7eefc52dac30428e382d7d0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 23 Aug 2016 14:53:19 +0200 Subject: [PATCH 178/538] slab: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Richard Weinberger Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: rt@linutronix.de Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Christoph Lameter Link: http://lkml.kernel.org/r/20160823125319.abeapfjapf2kfezp@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + include/linux/slab.h | 8 +++ kernel/cpu.c | 6 ++ mm/slab.c | 114 +++++++++++++++++-------------------- 4 files changed, 66 insertions(+), 63 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4c79f40fcebc..c2cf14953abc 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -22,6 +22,7 @@ enum cpuhp_state { CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, + CPUHP_SLAB_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_NOTIFY_PREPARE, CPUHP_TIMERS_DEAD, diff --git a/include/linux/slab.h b/include/linux/slab.h index 4293808d8cfb..084b12bad198 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -650,4 +650,12 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node) unsigned int kmem_cache_size(struct kmem_cache *s); void __init kmem_cache_init_late(void); +#if defined(CONFIG_SMP) && defined(CONFIG_SLAB) +int slab_prepare_cpu(unsigned int cpu); +int slab_dead_cpu(unsigned int cpu); +#else +#define slab_prepare_cpu NULL +#define slab_dead_cpu NULL +#endif + #endif /* _LINUX_SLAB_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 0c0d4b2ddd1c..7c783876cbcb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -1278,6 +1279,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup.single = relay_prepare_cpu, .teardown.single = NULL, }, + [CPUHP_SLAB_PREPARE] = { + .name = "slab:prepare", + .startup.single = slab_prepare_cpu, + .teardown.single = slab_dead_cpu, + }, [CPUHP_RCUTREE_PREP] = { .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, diff --git a/mm/slab.c b/mm/slab.c index b67271024135..090fb26b3a39 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -886,6 +886,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) return 0; } +#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP) /* * Allocates and initializes node for a node on each slab cache, used for * either memory or cpu hotplug. 
If memory is being hot-added, the kmem_cache_node @@ -908,6 +909,7 @@ static int init_cache_node_node(int node) return 0; } +#endif static int setup_kmem_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp, bool force_change) @@ -975,6 +977,8 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, return ret; } +#ifdef CONFIG_SMP + static void cpuup_canceled(long cpu) { struct kmem_cache *cachep; @@ -1075,65 +1079,54 @@ static int cpuup_prepare(long cpu) return -ENOMEM; } -static int cpuup_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +int slab_prepare_cpu(unsigned int cpu) { - long cpu = (long)hcpu; - int err = 0; + int err; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - mutex_lock(&slab_mutex); - err = cpuup_prepare(cpu); - mutex_unlock(&slab_mutex); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - start_cpu_timer(cpu); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - /* - * Shutdown cache reaper. Note that the slab_mutex is - * held so that if cache_reap() is invoked it cannot do - * anything expensive but will only modify reap_work - * and reschedule the timer. - */ - cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); - /* Now the cache_reaper is guaranteed to be not running. */ - per_cpu(slab_reap_work, cpu).work.func = NULL; - break; - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - start_cpu_timer(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* - * Even if all the cpus of a node are down, we don't free the - * kmem_cache_node of any cache. This to avoid a race between - * cpu_down, and a kmalloc allocation from another cpu for - * memory from the node of the cpu going down. The node - * structure is usually allocated from kmem_cache_create() and - * gets destroyed at kmem_cache_destroy(). - */ - /* fall through */ + mutex_lock(&slab_mutex); + err = cpuup_prepare(cpu); + mutex_unlock(&slab_mutex); + return err; +} + +/* + * This is called for a failed online attempt and for a successful + * offline. + * + * Even if all the cpus of a node are down, we don't free the + * kmem_list3 of any cache. This to avoid a race between cpu_down, and + * a kmalloc allocation from another cpu for memory from the node of + * the cpu going down. The list3 structure is usually allocated from + * kmem_cache_create() and gets destroyed at kmem_cache_destroy(). + */ +int slab_dead_cpu(unsigned int cpu) +{ + mutex_lock(&slab_mutex); + cpuup_canceled(cpu); + mutex_unlock(&slab_mutex); + return 0; +} #endif - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - mutex_lock(&slab_mutex); - cpuup_canceled(cpu); - mutex_unlock(&slab_mutex); - break; - } - return notifier_from_errno(err); + +static int slab_online_cpu(unsigned int cpu) +{ + start_cpu_timer(cpu); + return 0; } -static struct notifier_block cpucache_notifier = { - &cpuup_callback, NULL, 0 -}; +static int slab_offline_cpu(unsigned int cpu) +{ + /* + * Shutdown cache reaper. Note that the slab_mutex is held so + * that if cache_reap() is invoked it cannot do anything + * expensive but will only modify reap_work and reschedule the + * timer. + */ + cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); + /* Now the cache_reaper is guaranteed to be not running. */ + per_cpu(slab_reap_work, cpu).work.func = NULL; + return 0; +} #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) /* @@ -1336,12 +1329,6 @@ void __init kmem_cache_init_late(void) /* Done! 
*/ slab_state = FULL; - /* - * Register a cpu startup notifier callback that initializes - * cpu_cache_get for all new cpus - */ - register_cpu_notifier(&cpucache_notifier); - #ifdef CONFIG_NUMA /* * Register a memory hotplug callback that initializes and frees @@ -1358,13 +1345,14 @@ void __init kmem_cache_init_late(void) static int __init cpucache_init(void) { - int cpu; + int ret; /* * Register the timers that return unneeded pages to the page allocator */ - for_each_online_cpu(cpu) - start_cpu_timer(cpu); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online", + slab_online_cpu, slab_offline_cpu); + WARN_ON(ret < 0); /* Done! */ slab_state = FULL; From a96a87bf949d249039cdf532bb5f5d06622cc5e2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:19 +0200 Subject: [PATCH 179/538] slub: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Sebastian Andrzej Siewior Cc: Andrew Morton Cc: Peter Zijlstra Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: rt@linutronix.de Cc: David Rientjes Cc: Christoph Lameter Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20160818125731.27256-5-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + mm/slub.c | 65 ++++++++++++-------------------------- 2 files changed, 22 insertions(+), 44 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index c2cf14953abc..82ee32107dff 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -15,6 +15,7 @@ enum cpuhp_state { CPUHP_X86_HPET_DEAD, CPUHP_X86_APB_DEAD, CPUHP_VIRT_NET_DEAD, + CPUHP_SLUB_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, diff --git a/mm/slub.c b/mm/slub.c index 9adae58462f8..2b3e740609e9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -194,10 +194,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define __OBJECT_POISON 0x80000000UL /* Poison object */ #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ -#ifdef CONFIG_SMP -static struct notifier_block slab_notifier; -#endif - /* * Tracking user of a slab. */ @@ -2304,6 +2300,25 @@ static void flush_all(struct kmem_cache *s) on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); } +/* + * Use the cpu notifier to insure that the cpu slabs are flushed when + * necessary. + */ +static int slub_cpu_dead(unsigned int cpu) +{ + struct kmem_cache *s; + unsigned long flags; + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + local_irq_save(flags); + __flush_cpu_slab(s, cpu); + local_irq_restore(flags); + } + mutex_unlock(&slab_mutex); + return 0; +} + /* * Check if the objects in a per cpu structure fit numa * locality expectations. @@ -4144,9 +4159,8 @@ void __init kmem_cache_init(void) /* Setup random freelists for each cache */ init_freelist_randomization(); -#ifdef CONFIG_SMP - register_cpu_notifier(&slab_notifier); -#endif + cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, + slub_cpu_dead); pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", cache_line_size(), @@ -4210,43 +4224,6 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) return err; } -#ifdef CONFIG_SMP -/* - * Use the cpu notifier to insure that the cpu slabs are flushed when - * necessary. 
- */ -static int slab_cpuup_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - struct kmem_cache *s; - unsigned long flags; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - local_irq_save(flags); - __flush_cpu_slab(s, cpu); - local_irq_restore(flags); - } - mutex_unlock(&slab_mutex); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block slab_notifier = { - .notifier_call = slab_cpuup_callback -}; - -#endif - void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; From 1d7ac6aec947d222042b6d22b3cec109db4fd19e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:20 +0200 Subject: [PATCH 180/538] mm/writeback: Convert to hotplug state machine Install the callbacks via the state machine and let the core invoke the callbacks on the already online CPUs. Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Jens Axboe Cc: linux-mm@kvack.org Cc: rt@linutronix.de Cc: Tejun Heo Link: http://lkml.kernel.org/r/20160818125731.27256-6-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + mm/page-writeback.c | 26 +++++++------------------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 82ee32107dff..854e59a426d4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -16,6 +16,7 @@ enum cpuhp_state { CPUHP_X86_APB_DEAD, CPUHP_VIRT_NET_DEAD, CPUHP_SLUB_DEAD, + CPUHP_MM_WRITEBACK_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f4cd7d8005c9..28d6f36a2d79 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2080,26 +2080,12 @@ void writeback_set_ratelimit(void) ratelimit_pages = 16; } -static int -ratelimit_handler(struct notifier_block *self, unsigned long action, - void *hcpu) +static int page_writeback_cpu_online(unsigned int cpu) { - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DEAD: - writeback_set_ratelimit(); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } + writeback_set_ratelimit(); + return 0; } -static struct notifier_block ratelimit_nb = { - .notifier_call = ratelimit_handler, - .next = NULL, -}; - /* * Called early on to tune the page writeback dirty limits. * @@ -2122,8 +2108,10 @@ void __init page_writeback_init(void) { BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); - writeback_set_ratelimit(); - register_cpu_notifier(&ratelimit_nb); + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online", + page_writeback_cpu_online, NULL); + cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, + page_writeback_cpu_online); } /** From c4544dbc7a9bce3da6fa2361cd68cadb34e9221f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:21 +0200 Subject: [PATCH 181/538] kernel/softirq: Convert to hotplug state machine Install the callbacks via the state machine. 
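The only hotplug-relevant work here is taking over the pending tasklets of a dead CPU, so the state is registered with a NULL startup callback, and when CONFIG_HOTPLUG_CPU is disabled takeover_tasklets is defined to NULL so the registration below still compiles and the state simply has no callback. The same compile-out idiom, condensed with illustrative names (a sketch, not part of this patch):

#ifdef CONFIG_HOTPLUG_CPU
static int example_dead_cpu(unsigned int cpu)
{
        /* Migrate the dead CPU's pending work to the current CPU */
        return 0;
}
#else
#define example_dead_cpu        NULL    /* state ends up with no callback */
#endif

        /* at init time: */
        cpuhp_setup_state_nocalls(CPUHP_EXAMPLE_DEAD, "example:dead", NULL,
                                  example_dead_cpu);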
Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160818125731.27256-7-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + kernel/softirq.c | 27 ++++++--------------------- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 854e59a426d4..a421407a317f 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -17,6 +17,7 @@ enum cpuhp_state { CPUHP_VIRT_NET_DEAD, CPUHP_SLUB_DEAD, CPUHP_MM_WRITEBACK_DEAD, + CPUHP_SOFTIRQ_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..c372114494f5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -700,7 +700,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) BUG(); } -static void takeover_tasklets(unsigned int cpu) +static int takeover_tasklets(unsigned int cpu) { /* CPU is dead, so no lock needed. */ local_irq_disable(); @@ -723,27 +723,12 @@ static void takeover_tasklets(unsigned int cpu) raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); + return 0; } +#else +#define takeover_tasklets NULL #endif /* CONFIG_HOTPLUG_CPU */ -static int cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) -{ - switch (action) { -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - takeover_tasklets((unsigned long)hcpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_nfb = { - .notifier_call = cpu_callback -}; - static struct smp_hotplug_thread softirq_threads = { .store = &ksoftirqd, .thread_should_run = ksoftirqd_should_run, @@ -753,8 +738,8 @@ static struct smp_hotplug_thread softirq_threads = { static __init int spawn_ksoftirqd(void) { - register_cpu_notifier(&cpu_nfb); - + cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, + takeover_tasklets); BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); return 0; From 84a3f4db039e7c4bfe8ae9bebdebdf2a4e09bf86 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:23 +0200 Subject: [PATCH 182/538] net/mvneta: Convert to hotplug state machine Install the callbacks via the state machine and let the core invoke the callbacks on the already online CPUs. Signed-off-by: Sebastian Andrzej Siewior Cc: Thomas Petazzoni Cc: Peter Zijlstra Cc: netdev@vger.kernel.org Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160818125731.27256-9-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/net/ethernet/marvell/mvneta.c | 232 ++++++++++++++++---------- include/linux/cpuhotplug.h | 1 + 2 files changed, 144 insertions(+), 89 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index d41c28d00b57..b74548728fb5 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -382,7 +382,8 @@ struct mvneta_port { struct mvneta_rx_queue *rxqs; struct mvneta_tx_queue *txqs; struct net_device *dev; - struct notifier_block cpu_notifier; + struct hlist_node node_online; + struct hlist_node node_dead; int rxq_def; /* Protect the access to the percpu interrupt registers, * ensuring that the configuration remains coherent. 
@@ -574,6 +575,7 @@ struct mvneta_rx_queue { int next_desc_to_proc; }; +static enum cpuhp_state online_hpstate; /* The hardware supports eight (8) rx queues, but we are only allowing * the first one to be used. Therefore, let's just allocate one queue. */ @@ -3311,101 +3313,104 @@ static void mvneta_percpu_elect(struct mvneta_port *pp) } }; -static int mvneta_percpu_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int mvneta_cpu_online(unsigned int cpu, struct hlist_node *node) { - struct mvneta_port *pp = container_of(nfb, struct mvneta_port, - cpu_notifier); - int cpu = (unsigned long)hcpu, other_cpu; + int other_cpu; + struct mvneta_port *pp = hlist_entry_safe(node, struct mvneta_port, + node_online); struct mvneta_pcpu_port *port = per_cpu_ptr(pp->ports, cpu); - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - spin_lock(&pp->lock); - /* Configuring the driver for a new CPU while the - * driver is stopping is racy, so just avoid it. - */ - if (pp->is_stopped) { - spin_unlock(&pp->lock); - break; - } - netif_tx_stop_all_queues(pp->dev); - /* We have to synchronise on tha napi of each CPU - * except the one just being waked up - */ - for_each_online_cpu(other_cpu) { - if (other_cpu != cpu) { - struct mvneta_pcpu_port *other_port = - per_cpu_ptr(pp->ports, other_cpu); + spin_lock(&pp->lock); + /* + * Configuring the driver for a new CPU while the driver is + * stopping is racy, so just avoid it. + */ + if (pp->is_stopped) { + spin_unlock(&pp->lock); + return 0; + } + netif_tx_stop_all_queues(pp->dev); - napi_synchronize(&other_port->napi); - } + /* + * We have to synchronise on tha napi of each CPU except the one + * just being woken up + */ + for_each_online_cpu(other_cpu) { + if (other_cpu != cpu) { + struct mvneta_pcpu_port *other_port = + per_cpu_ptr(pp->ports, other_cpu); + + napi_synchronize(&other_port->napi); } + } - /* Mask all ethernet port interrupts */ - on_each_cpu(mvneta_percpu_mask_interrupt, pp, true); - napi_enable(&port->napi); + /* Mask all ethernet port interrupts */ + on_each_cpu(mvneta_percpu_mask_interrupt, pp, true); + napi_enable(&port->napi); + /* + * Enable per-CPU interrupts on the CPU that is + * brought up. + */ + mvneta_percpu_enable(pp); - /* Enable per-CPU interrupts on the CPU that is - * brought up. - */ - mvneta_percpu_enable(pp); + /* + * Enable per-CPU interrupt on the one CPU we care + * about. + */ + mvneta_percpu_elect(pp); - /* Enable per-CPU interrupt on the one CPU we care - * about. 
- */ - mvneta_percpu_elect(pp); - - /* Unmask all ethernet port interrupts */ - on_each_cpu(mvneta_percpu_unmask_interrupt, pp, true); - mvreg_write(pp, MVNETA_INTR_MISC_MASK, - MVNETA_CAUSE_PHY_STATUS_CHANGE | - MVNETA_CAUSE_LINK_CHANGE | - MVNETA_CAUSE_PSC_SYNC_CHANGE); - netif_tx_start_all_queues(pp->dev); - spin_unlock(&pp->lock); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - netif_tx_stop_all_queues(pp->dev); - /* Thanks to this lock we are sure that any pending - * cpu election is done - */ - spin_lock(&pp->lock); - /* Mask all ethernet port interrupts */ - on_each_cpu(mvneta_percpu_mask_interrupt, pp, true); - spin_unlock(&pp->lock); + /* Unmask all ethernet port interrupts */ + on_each_cpu(mvneta_percpu_unmask_interrupt, pp, true); + mvreg_write(pp, MVNETA_INTR_MISC_MASK, + MVNETA_CAUSE_PHY_STATUS_CHANGE | + MVNETA_CAUSE_LINK_CHANGE | + MVNETA_CAUSE_PSC_SYNC_CHANGE); + netif_tx_start_all_queues(pp->dev); + spin_unlock(&pp->lock); + return 0; +} - napi_synchronize(&port->napi); - napi_disable(&port->napi); - /* Disable per-CPU interrupts on the CPU that is - * brought down. - */ - mvneta_percpu_disable(pp); +static int mvneta_cpu_down_prepare(unsigned int cpu, struct hlist_node *node) +{ + struct mvneta_port *pp = hlist_entry_safe(node, struct mvneta_port, + node_online); + struct mvneta_pcpu_port *port = per_cpu_ptr(pp->ports, cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* Check if a new CPU must be elected now this on is down */ - spin_lock(&pp->lock); - mvneta_percpu_elect(pp); - spin_unlock(&pp->lock); - /* Unmask all ethernet port interrupts */ - on_each_cpu(mvneta_percpu_unmask_interrupt, pp, true); - mvreg_write(pp, MVNETA_INTR_MISC_MASK, - MVNETA_CAUSE_PHY_STATUS_CHANGE | - MVNETA_CAUSE_LINK_CHANGE | - MVNETA_CAUSE_PSC_SYNC_CHANGE); - netif_tx_start_all_queues(pp->dev); - break; - } + /* + * Thanks to this lock we are sure that any pending cpu election is + * done. + */ + spin_lock(&pp->lock); + /* Mask all ethernet port interrupts */ + on_each_cpu(mvneta_percpu_mask_interrupt, pp, true); + spin_unlock(&pp->lock); - return NOTIFY_OK; + napi_synchronize(&port->napi); + napi_disable(&port->napi); + /* Disable per-CPU interrupts on the CPU that is brought down. */ + mvneta_percpu_disable(pp); + return 0; +} + +static int mvneta_cpu_dead(unsigned int cpu, struct hlist_node *node) +{ + struct mvneta_port *pp = hlist_entry_safe(node, struct mvneta_port, + node_dead); + + /* Check if a new CPU must be elected now this on is down */ + spin_lock(&pp->lock); + mvneta_percpu_elect(pp); + spin_unlock(&pp->lock); + /* Unmask all ethernet port interrupts */ + on_each_cpu(mvneta_percpu_unmask_interrupt, pp, true); + mvreg_write(pp, MVNETA_INTR_MISC_MASK, + MVNETA_CAUSE_PHY_STATUS_CHANGE | + MVNETA_CAUSE_LINK_CHANGE | + MVNETA_CAUSE_PSC_SYNC_CHANGE); + netif_tx_start_all_queues(pp->dev); + return 0; } static int mvneta_open(struct net_device *dev) @@ -3442,7 +3447,15 @@ static int mvneta_open(struct net_device *dev) /* Register a CPU notifier to handle the case where our CPU * might be taken offline. 
*/ - register_cpu_notifier(&pp->cpu_notifier); + ret = cpuhp_state_add_instance_nocalls(online_hpstate, + &pp->node_online); + if (ret) + goto err_free_irq; + + ret = cpuhp_state_add_instance_nocalls(CPUHP_NET_MVNETA_DEAD, + &pp->node_dead); + if (ret) + goto err_free_online_hp; /* In default link is down */ netif_carrier_off(pp->dev); @@ -3450,15 +3463,19 @@ static int mvneta_open(struct net_device *dev) ret = mvneta_mdio_probe(pp); if (ret < 0) { netdev_err(dev, "cannot probe MDIO bus\n"); - goto err_free_irq; + goto err_free_dead_hp; } mvneta_start_dev(pp); return 0; +err_free_dead_hp: + cpuhp_state_remove_instance_nocalls(CPUHP_NET_MVNETA_DEAD, + &pp->node_dead); +err_free_online_hp: + cpuhp_state_remove_instance_nocalls(online_hpstate, &pp->node_online); err_free_irq: - unregister_cpu_notifier(&pp->cpu_notifier); on_each_cpu(mvneta_percpu_disable, pp, true); free_percpu_irq(pp->dev->irq, pp->ports); err_cleanup_txqs: @@ -3484,7 +3501,10 @@ static int mvneta_stop(struct net_device *dev) mvneta_stop_dev(pp); mvneta_mdio_remove(pp); - unregister_cpu_notifier(&pp->cpu_notifier); + + cpuhp_state_remove_instance_nocalls(online_hpstate, &pp->node_online); + cpuhp_state_remove_instance_nocalls(CPUHP_NET_MVNETA_DEAD, + &pp->node_dead); on_each_cpu(mvneta_percpu_disable, pp, true); free_percpu_irq(dev->irq, pp->ports); mvneta_cleanup_rxqs(pp); @@ -4024,7 +4044,6 @@ static int mvneta_probe(struct platform_device *pdev) err = of_property_read_string(dn, "managed", &managed); pp->use_inband_status = (err == 0 && strcmp(managed, "in-band-status") == 0); - pp->cpu_notifier.notifier_call = mvneta_percpu_notifier; pp->rxq_def = rxq_def; @@ -4227,7 +4246,42 @@ static struct platform_driver mvneta_driver = { }, }; -module_platform_driver(mvneta_driver); +static int __init mvneta_driver_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "net/mvmeta:online", + mvneta_cpu_online, + mvneta_cpu_down_prepare); + if (ret < 0) + goto out; + online_hpstate = ret; + ret = cpuhp_setup_state_multi(CPUHP_NET_MVNETA_DEAD, "net/mvneta:dead", + NULL, mvneta_cpu_dead); + if (ret) + goto err_dead; + + ret = platform_driver_register(&mvneta_driver); + if (ret) + goto err; + return 0; + +err: + cpuhp_remove_multi_state(CPUHP_NET_MVNETA_DEAD); +err_dead: + cpuhp_remove_multi_state(online_hpstate); +out: + return ret; +} +module_init(mvneta_driver_init); + +static void __exit mvneta_driver_exit(void) +{ + platform_driver_unregister(&mvneta_driver); + cpuhp_remove_multi_state(CPUHP_NET_MVNETA_DEAD); + cpuhp_remove_multi_state(online_hpstate); +} +module_exit(mvneta_driver_exit); MODULE_DESCRIPTION("Marvell NETA Ethernet Driver - www.marvell.com"); MODULE_AUTHOR("Rami Rosen , Thomas Petazzoni "); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index a421407a317f..332b39c21d2e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -18,6 +18,7 @@ enum cpuhp_state { CPUHP_SLUB_DEAD, CPUHP_MM_WRITEBACK_DEAD, CPUHP_SOFTIRQ_DEAD, + CPUHP_NET_MVNETA_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, From 29c6d1bbd7a2cd88a197ea7cef171f616e198526 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:24 +0200 Subject: [PATCH 183/538] md/raid5: Convert to hotplug state machine Install the callbacks via the state machine and let the core invoke the callbacks on the already online CPUs. 
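Since each array carries its own per-CPU scratch buffers, the conversion uses the multi-instance variant of the hotplug API: the state and its two callbacks are registered once in raid5_init(), and every r5conf then adds its own hlist_node. Because cpuhp_state_add_instance() (rather than the _nocalls variant) is used, the core also invokes the prepare callback for that instance on the CPUs that are already online, replacing the explicit get_online_cpus()/for_each_present_cpu() loop removed below. Condensed, the new flow is:

        /* once, at module init */
        ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
                                      "md/raid5:prepare",
                                      raid456_cpu_up_prepare,
                                      raid456_cpu_dead);

        /* per array, while allocating its per-CPU data */
        err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);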
Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Neil Brown Cc: linux-raid@vger.kernel.org Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160818125731.27256-10-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/md/raid5.c | 84 +++++++++++++------------------------- drivers/md/raid5.h | 4 +- include/linux/cpuhotplug.h | 1 + 3 files changed, 31 insertions(+), 58 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8912407a4dd0..aae8064fd9e6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6330,22 +6330,20 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu return 0; } -static void raid5_free_percpu(struct r5conf *conf) +static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) { - unsigned long cpu; + struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); + + free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); + return 0; +} +static void raid5_free_percpu(struct r5conf *conf) +{ if (!conf->percpu) return; -#ifdef CONFIG_HOTPLUG_CPU - unregister_cpu_notifier(&conf->cpu_notify); -#endif - - get_online_cpus(); - for_each_possible_cpu(cpu) - free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); - put_online_cpus(); - + cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); free_percpu(conf->percpu); } @@ -6364,64 +6362,28 @@ static void free_conf(struct r5conf *conf) kfree(conf); } -#ifdef CONFIG_HOTPLUG_CPU -static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) { - struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); - long cpu = (long)hcpu; + struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (alloc_scratch_buffer(conf, percpu)) { - pr_err("%s: failed memory allocation for cpu%ld\n", - __func__, cpu); - return notifier_from_errno(-ENOMEM); - } - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); - break; - default: - break; + if (alloc_scratch_buffer(conf, percpu)) { + pr_err("%s: failed memory allocation for cpu%u\n", + __func__, cpu); + return -ENOMEM; } - return NOTIFY_OK; + return 0; } -#endif static int raid5_alloc_percpu(struct r5conf *conf) { - unsigned long cpu; int err = 0; conf->percpu = alloc_percpu(struct raid5_percpu); if (!conf->percpu) return -ENOMEM; -#ifdef CONFIG_HOTPLUG_CPU - conf->cpu_notify.notifier_call = raid456_cpu_notify; - conf->cpu_notify.priority = 0; - err = register_cpu_notifier(&conf->cpu_notify); - if (err) - return err; -#endif - - get_online_cpus(); - for_each_present_cpu(cpu) { - err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); - if (err) { - pr_err("%s: failed memory allocation for cpu%ld\n", - __func__, cpu); - break; - } - } - put_online_cpus(); - + err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); if (!err) { conf->scribble_disks = max(conf->raid_disks, conf->previous_raid_disks); @@ -7953,10 +7915,21 @@ static struct md_personality raid4_personality = static int __init raid5_init(void) { + int ret; + raid5_wq = alloc_workqueue("raid5wq", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); if (!raid5_wq) return -ENOMEM; + + ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, 
+ "md/raid5:prepare", + raid456_cpu_up_prepare, + raid456_cpu_dead); + if (ret) { + destroy_workqueue(raid5_wq); + return ret; + } register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); @@ -7968,6 +7941,7 @@ static void raid5_exit(void) unregister_md_personality(&raid6_personality); unregister_md_personality(&raid5_personality); unregister_md_personality(&raid4_personality); + cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); destroy_workqueue(raid5_wq); } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 517d4b68a1be..57ec49f0839e 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -512,9 +512,7 @@ struct r5conf { } __percpu *percpu; int scribble_disks; int scribble_sectors; -#ifdef CONFIG_HOTPLUG_CPU - struct notifier_block cpu_notify; -#endif + struct hlist_node node; /* * Free stripes pool diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 332b39c21d2e..4066c74bb73c 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -27,6 +27,7 @@ enum cpuhp_state { CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, CPUHP_SLAB_PREPARE, + CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_NOTIFY_PREPARE, CPUHP_TIMERS_DEAD, From 529351fd3c50215a462e5e604d7ceaaf27a8a0e5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:25 +0200 Subject: [PATCH 184/538] cpuidle/pseries: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Sebastian Andrzej Siewior Cc: linux-pm@vger.kernel.org Cc: Peter Zijlstra Cc: Daniel Lezcano Cc: "Rafael J. Wysocki" Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160818125731.27256-11-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/cpuidle/cpuidle-pseries.c | 51 +++++++++++++++---------------- include/linux/cpuhotplug.h | 1 + 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index 07135e009d8b..166ccd711ec9 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -171,40 +171,30 @@ static struct cpuidle_state shared_states[] = { .enter = &shared_cede_loop }, }; -static int pseries_cpuidle_add_cpu_notifier(struct notifier_block *n, - unsigned long action, void *hcpu) +static int pseries_cpuidle_cpu_online(unsigned int cpu) { - int hotcpu = (unsigned long)hcpu; - struct cpuidle_device *dev = - per_cpu(cpuidle_devices, hotcpu); + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); if (dev && cpuidle_get_driver()) { - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - cpuidle_pause_and_lock(); - cpuidle_enable_device(dev); - cpuidle_resume_and_unlock(); - break; + cpuidle_pause_and_lock(); + cpuidle_enable_device(dev); + cpuidle_resume_and_unlock(); + } + return 0; +} - case CPU_DEAD: - case CPU_DEAD_FROZEN: - cpuidle_pause_and_lock(); - cpuidle_disable_device(dev); - cpuidle_resume_and_unlock(); - break; +static int pseries_cpuidle_cpu_dead(unsigned int cpu) +{ + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); - default: - return NOTIFY_DONE; - } + if (dev && cpuidle_get_driver()) { + cpuidle_pause_and_lock(); + cpuidle_disable_device(dev); + cpuidle_resume_and_unlock(); } - return NOTIFY_OK; + return 0; } -static struct notifier_block setup_hotplug_notifier = { - .notifier_call = pseries_cpuidle_add_cpu_notifier, -}; - /* * pseries_cpuidle_driver_init() */ @@ -273,7 +263,14 @@ static int __init 
pseries_processor_idle_init(void) return retval; } - register_cpu_notifier(&setup_hotplug_notifier); + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "cpuidle/pseries:online", + pseries_cpuidle_cpu_online, NULL); + WARN_ON(retval < 0); + retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD, + "cpuidle/pseries:DEAD", NULL, + pseries_cpuidle_cpu_dead); + WARN_ON(retval < 0); printk(KERN_DEBUG "pseries_idle_driver registered\n"); return 0; } diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4066c74bb73c..0fb22b95649f 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -19,6 +19,7 @@ enum cpuhp_state { CPUHP_MM_WRITEBACK_DEAD, CPUHP_SOFTIRQ_DEAD, CPUHP_NET_MVNETA_DEAD, + CPUHP_CPUIDLE_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, From 10fcca9d8704a04c6e86398f930fa28e0fb03ce4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 Aug 2016 11:12:59 +0200 Subject: [PATCH 185/538] cpuidle/powernv: Convert to hotplug state machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Install the callbacks via the state machine. v1…v2: - Use only CPUHP_CPUIDLE_DEAD (requested by Daniel Lezcano) Signed-off-by: Sebastian Andrzej Siewior Cc: linux-pm@vger.kernel.org Cc: Peter Zijlstra Cc: Daniel Lezcano Cc: "Rafael J. Wysocki" Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160824091259.ozyslcopxvbfdqzy@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/cpuidle/cpuidle-powernv.c | 51 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index f7ca891b5b59..7fe442ca38f4 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -119,40 +119,30 @@ static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = { .enter = snooze_loop }, }; -static int powernv_cpuidle_add_cpu_notifier(struct notifier_block *n, - unsigned long action, void *hcpu) +static int powernv_cpuidle_cpu_online(unsigned int cpu) { - int hotcpu = (unsigned long)hcpu; - struct cpuidle_device *dev = - per_cpu(cpuidle_devices, hotcpu); + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); if (dev && cpuidle_get_driver()) { - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - cpuidle_pause_and_lock(); - cpuidle_enable_device(dev); - cpuidle_resume_and_unlock(); - break; + cpuidle_pause_and_lock(); + cpuidle_enable_device(dev); + cpuidle_resume_and_unlock(); + } + return 0; +} - case CPU_DEAD: - case CPU_DEAD_FROZEN: - cpuidle_pause_and_lock(); - cpuidle_disable_device(dev); - cpuidle_resume_and_unlock(); - break; +static int powernv_cpuidle_cpu_dead(unsigned int cpu) +{ + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); - default: - return NOTIFY_DONE; - } + if (dev && cpuidle_get_driver()) { + cpuidle_pause_and_lock(); + cpuidle_disable_device(dev); + cpuidle_resume_and_unlock(); } - return NOTIFY_OK; + return 0; } -static struct notifier_block setup_hotplug_notifier = { - .notifier_call = powernv_cpuidle_add_cpu_notifier, -}; - /* * powernv_cpuidle_driver_init() */ @@ -355,7 +345,14 @@ static int __init powernv_processor_idle_init(void) return retval; } - register_cpu_notifier(&setup_hotplug_notifier); + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "cpuidle/powernv:online", + powernv_cpuidle_cpu_online, NULL); + WARN_ON(retval < 0); + retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD, + 
"cpuidle/powernv:dead", NULL, + powernv_cpuidle_cpu_dead); + WARN_ON(retval < 0); printk(KERN_DEBUG "powernv_idle_driver registered\n"); return 0; } From dfc616d8b3df3013c579e023e67f29ada60bdd50 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 Aug 2016 11:14:44 +0200 Subject: [PATCH 186/538] cpuidle/coupled: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Sebastian Andrzej Siewior Cc: linux-pm@vger.kernel.org Cc: Peter Zijlstra Cc: Daniel Lezcano Cc: "Rafael J. Wysocki" Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160824091444.brdr5zpbxjvh6n3f@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/cpuidle/coupled.c | 75 ++++++++++++++++---------------------- include/linux/cpuhotplug.h | 1 + 2 files changed, 32 insertions(+), 44 deletions(-) diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index d5657d50ac40..71e586d7df71 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -749,65 +749,52 @@ static void cpuidle_coupled_allow_idle(struct cpuidle_coupled *coupled) put_cpu(); } -/** - * cpuidle_coupled_cpu_notify - notifier called during hotplug transitions - * @nb: notifier block - * @action: hotplug transition - * @hcpu: target cpu number - * - * Called when a cpu is brought on or offline using hotplug. Updates the - * coupled cpu set appropriately - */ -static int cpuidle_coupled_cpu_notify(struct notifier_block *nb, - unsigned long action, void *hcpu) +static int coupled_cpu_online(unsigned int cpu) { - int cpu = (unsigned long)hcpu; struct cpuidle_device *dev; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - case CPU_DOWN_PREPARE: - case CPU_ONLINE: - case CPU_DEAD: - case CPU_UP_CANCELED: - case CPU_DOWN_FAILED: - break; - default: - return NOTIFY_OK; - } - mutex_lock(&cpuidle_lock); dev = per_cpu(cpuidle_devices, cpu); - if (!dev || !dev->coupled) - goto out; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - case CPU_DOWN_PREPARE: - cpuidle_coupled_prevent_idle(dev->coupled); - break; - case CPU_ONLINE: - case CPU_DEAD: + if (dev && dev->coupled) { cpuidle_coupled_update_online_cpus(dev->coupled); - /* Fall through */ - case CPU_UP_CANCELED: - case CPU_DOWN_FAILED: cpuidle_coupled_allow_idle(dev->coupled); - break; } -out: mutex_unlock(&cpuidle_lock); - return NOTIFY_OK; + return 0; } -static struct notifier_block cpuidle_coupled_cpu_notifier = { - .notifier_call = cpuidle_coupled_cpu_notify, -}; +static int coupled_cpu_up_prepare(unsigned int cpu) +{ + struct cpuidle_device *dev; + + mutex_lock(&cpuidle_lock); + + dev = per_cpu(cpuidle_devices, cpu); + if (dev && dev->coupled) + cpuidle_coupled_prevent_idle(dev->coupled); + + mutex_unlock(&cpuidle_lock); + return 0; +} static int __init cpuidle_coupled_init(void) { - return register_cpu_notifier(&cpuidle_coupled_cpu_notifier); + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_COUPLED_PREPARE, + "cpuidle/coupled:prepare", + coupled_cpu_up_prepare, + coupled_cpu_online); + if (ret) + return ret; + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "cpuidle/coupled:online", + coupled_cpu_online, + coupled_cpu_up_prepare); + if (ret < 0) + cpuhp_remove_state_nocalls(CPUHP_CPUIDLE_COUPLED_PREPARE); + return ret; } core_initcall(cpuidle_coupled_init); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0fb22b95649f..e8608774b5da 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -30,6 +30,7 @@ enum cpuhp_state { CPUHP_SLAB_PREPARE, 
CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, + CPUHP_CPUIDLE_COUPLED_PREPARE, CPUHP_NOTIFY_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_BRINGUP_CPU, From e8483b578b229774382a95891439b2ebd9c92fc5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:28 +0200 Subject: [PATCH 187/538] MIPS/BUS/CDMM: Convert to hotplug state machine Install the callbacks via the state machine and let the core invoke the callbacks on the already online CPUs. Signed-off-by: Sebastian Andrzej Siewior Cc: James Hogan Cc: Peter Zijlstra Cc: Ralf Baechle Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160818125731.27256-14-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/bus/mips_cdmm.c | 70 +++++++---------------------------------- 1 file changed, 12 insertions(+), 58 deletions(-) diff --git a/drivers/bus/mips_cdmm.c b/drivers/bus/mips_cdmm.c index cad49bc38b3e..1b14256376d2 100644 --- a/drivers/bus/mips_cdmm.c +++ b/drivers/bus/mips_cdmm.c @@ -596,19 +596,20 @@ BUILD_PERDEV_HELPER(cpu_down) /* int mips_cdmm_cpu_down_helper(...) */ BUILD_PERDEV_HELPER(cpu_up) /* int mips_cdmm_cpu_up_helper(...) */ /** - * mips_cdmm_bus_down() - Tear down the CDMM bus. - * @data: Pointer to unsigned int CPU number. + * mips_cdmm_cpu_down_prep() - Callback for CPUHP DOWN_PREP: + * Tear down the CDMM bus. + * @cpu: unsigned int CPU number. * * This function is executed on the hotplugged CPU and calls the CDMM * driver cpu_down callback for all devices on that CPU. */ -static long mips_cdmm_bus_down(void *data) +static int mips_cdmm_cpu_down_prep(unsigned int cpu) { struct mips_cdmm_bus *bus; long ret; /* Inform all the devices on the bus */ - ret = bus_for_each_dev(&mips_cdmm_bustype, NULL, data, + ret = bus_for_each_dev(&mips_cdmm_bustype, NULL, &cpu, mips_cdmm_cpu_down_helper); /* @@ -623,8 +624,8 @@ static long mips_cdmm_bus_down(void *data) } /** - * mips_cdmm_bus_up() - Bring up the CDMM bus. - * @data: Pointer to unsigned int CPU number. + * mips_cdmm_cpu_online() - Callback for CPUHP ONLINE: Bring up the CDMM bus. + * @cpu: unsigned int CPU number. * * This work_on_cpu callback function is executed on a given CPU to discover * CDMM devices on that CPU, or to call the CDMM driver cpu_up callback for all @@ -634,7 +635,7 @@ static long mips_cdmm_bus_down(void *data) * initialisation. When CPUs are brought online the function is * invoked directly on the hotplugged CPU. */ -static long mips_cdmm_bus_up(void *data) +static int mips_cdmm_cpu_online(unsigned int cpu) { struct mips_cdmm_bus *bus; long ret; @@ -651,50 +652,12 @@ static long mips_cdmm_bus_up(void *data) mips_cdmm_bus_discover(bus); else /* Inform all the devices on the bus */ - ret = bus_for_each_dev(&mips_cdmm_bustype, NULL, data, + ret = bus_for_each_dev(&mips_cdmm_bustype, NULL, &cpu, mips_cdmm_cpu_up_helper); return ret; } -/** - * mips_cdmm_cpu_notify() - Take action when a CPU is going online or offline. - * @nb: CPU notifier block . - * @action: Event that has taken place (CPU_*). - * @data: CPU number. - * - * This notifier is used to keep the CDMM buses updated as CPUs are offlined and - * onlined. When CPUs go offline or come back online, so does their CDMM bus, so - * devices must be informed. Also when CPUs come online for the first time the - * devices on the CDMM bus need discovering. - * - * Returns: NOTIFY_OK if event was used. - * NOTIFY_DONE if we didn't care. 
- */ -static int mips_cdmm_cpu_notify(struct notifier_block *nb, - unsigned long action, void *data) -{ - unsigned int cpu = (unsigned int)data; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - mips_cdmm_bus_up(&cpu); - break; - case CPU_DOWN_PREPARE: - mips_cdmm_bus_down(&cpu); - break; - default: - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} - -static struct notifier_block mips_cdmm_cpu_nb = { - .notifier_call = mips_cdmm_cpu_notify, -}; - /** * mips_cdmm_init() - Initialise CDMM bus. * @@ -703,7 +666,6 @@ static struct notifier_block mips_cdmm_cpu_nb = { */ static int __init mips_cdmm_init(void) { - unsigned int cpu; int ret; /* Register the bus */ @@ -712,19 +674,11 @@ static int __init mips_cdmm_init(void) return ret; /* We want to be notified about new CPUs */ - ret = register_cpu_notifier(&mips_cdmm_cpu_nb); - if (ret) { + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "bus/cdmm:online", + mips_cdmm_cpu_online, mips_cdmm_cpu_down_prep); + if (ret < 0) pr_warn("cdmm: Failed to register CPU notifier\n"); - goto out; - } - - /* Discover devices on CDMM of online CPUs */ - for_each_online_cpu(cpu) - work_on_cpu(cpu, mips_cdmm_bus_up, &cpu); - return 0; -out: - bus_unregister(&mips_cdmm_bustype); return ret; } subsys_initcall(mips_cdmm_init); From 9a20ea4b4c34764416e935090d6e5ede02d1bada Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:29 +0200 Subject: [PATCH 188/538] x86/kvm: Convert to hotplug state machine Install the callbacks via the state machine. The online & down callbacks are invoked on the target CPU so we can avoid using smp_call_function_single(). local_irq_disable() is used because smp_call_function_single() used to invoke the function with interrupts disabled. 
Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paolo Bonzini Cc: kvm@vger.kernel.org Cc: Peter Zijlstra Cc: Gleb Natapov Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160818125731.27256-15-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/kvm.c | 43 ++++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1726c4c12336..1f431f362dd5 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -423,12 +423,7 @@ static void __init kvm_smp_prepare_boot_cpu(void) kvm_spinlock_init(); } -static void kvm_guest_cpu_online(void *dummy) -{ - kvm_guest_cpu_init(); -} - -static void kvm_guest_cpu_offline(void *dummy) +static void kvm_guest_cpu_offline(void) { kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -437,29 +432,21 @@ static void kvm_guest_cpu_offline(void *dummy) apf_task_wake_all(); } -static int kvm_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int kvm_cpu_online(unsigned int cpu) { - int cpu = (unsigned long)hcpu; - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - case CPU_ONLINE_FROZEN: - smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); - break; - default: - break; - } - return NOTIFY_OK; + local_irq_disable(); + kvm_guest_cpu_init(); + local_irq_enable(); + return 0; } -static struct notifier_block kvm_cpu_notifier = { - .notifier_call = kvm_cpu_notify, -}; +static int kvm_cpu_down_prepare(unsigned int cpu) +{ + local_irq_disable(); + kvm_guest_cpu_offline(); + local_irq_enable(); + return 0; +} #endif static void __init kvm_apf_trap_init(void) @@ -494,7 +481,9 @@ void __init kvm_guest_init(void) #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; - register_cpu_notifier(&kvm_cpu_notifier); + if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", + kvm_cpu_online, kvm_cpu_down_prepare) < 0) + pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); #else kvm_guest_cpu_init(); #endif From 68e694dcef246f0c8f6738b3aa628f8aa7186796 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:30 +0200 Subject: [PATCH 189/538] powerpc/powermac: Convert to hotplug state machine Install the callbacks via the state machine. I assume here that the powermac has two CPUs and so only one can go up or down at a time. The variable smp_core99_host_open is here to ensure that we do not try to open or close the i2c host twice if something goes wrong and we invoke the prepare or online callback twice due to rollback. 
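As noted above, a rollback can end up invoking the prepare or the online callback twice; the flag makes both callbacks idempotent. Reduced to its shape, with illustrative names and hypothetical helpers (a sketch, not the patch itself):

static bool example_host_open;

static int example_prepare(unsigned int cpu)
{
        if (!example_host_open) {               /* tolerate a repeated call */
                if (example_open_host())        /* hypothetical helper */
                        return -EIO;
                example_host_open = true;
        }
        return 0;
}

static int example_online(unsigned int cpu)
{
        if (example_host_open) {
                example_close_host();           /* hypothetical helper */
                example_host_open = false;
        }
        return 0;
}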
Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: rt@linutronix.de Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160818125731.27256-16-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/powerpc/platforms/powermac/smp.c | 50 +++++++++++++-------------- include/linux/cpuhotplug.h | 1 + 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index 834868b9fdc9..366e4f510fcf 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -852,37 +852,33 @@ static void smp_core99_setup_cpu(int cpu_nr) #ifdef CONFIG_PPC64 #ifdef CONFIG_HOTPLUG_CPU -static int smp_core99_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static unsigned int smp_core99_host_open; + +static int smp_core99_cpu_prepare(unsigned int cpu) { int rc; - switch(action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - /* Open i2c bus if it was used for tb sync */ - if (pmac_tb_clock_chip_host) { - rc = pmac_i2c_open(pmac_tb_clock_chip_host, 1); - if (rc) { - pr_err("Failed to open i2c bus for time sync\n"); - return notifier_from_errno(rc); - } + /* Open i2c bus if it was used for tb sync */ + if (pmac_tb_clock_chip_host && !smp_core99_host_open) { + rc = pmac_i2c_open(pmac_tb_clock_chip_host, 1); + if (rc) { + pr_err("Failed to open i2c bus for time sync\n"); + return notifier_from_errno(rc); } - break; - case CPU_ONLINE: - case CPU_UP_CANCELED: - /* Close i2c bus if it was used for tb sync */ - if (pmac_tb_clock_chip_host) - pmac_i2c_close(pmac_tb_clock_chip_host); - break; - default: - break; + smp_core99_host_open = 1; } - return NOTIFY_OK; + return 0; } -static struct notifier_block smp_core99_cpu_nb = { - .notifier_call = smp_core99_cpu_notify, -}; +static int smp_core99_cpu_online(unsigned int cpu) +{ + /* Close i2c bus if it was used for tb sync */ + if (pmac_tb_clock_chip_host && smp_core99_host_open) { + pmac_i2c_close(pmac_tb_clock_chip_host); + smp_core99_host_open = 0; + } + return 0; +} #endif /* CONFIG_HOTPLUG_CPU */ static void __init smp_core99_bringup_done(void) @@ -902,7 +898,11 @@ static void __init smp_core99_bringup_done(void) g5_phy_disable_cpu1(); } #ifdef CONFIG_HOTPLUG_CPU - register_cpu_notifier(&smp_core99_cpu_nb); + cpuhp_setup_state_nocalls(CPUHP_POWERPC_PMAC_PREPARE, + "powerpc/pmac:prepare", smp_core99_cpu_prepare, + NULL); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "powerpc/pmac:online", + smp_core99_cpu_online, NULL); #endif if (ppc_md.progress) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e8608774b5da..33fba43ad292 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -31,6 +31,7 @@ enum cpuhp_state { CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_CPUIDLE_COUPLED_PREPARE, + CPUHP_POWERPC_PMAC_PREPARE, CPUHP_NOTIFY_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_BRINGUP_CPU, From da3ed6519b19a9def0fcb966c6274946ad18d9a6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:31 +0200 Subject: [PATCH 190/538] powerpc/mmu nohash: Convert to hotplug state machine Install the callbacks via the state machine. 
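This is the usual prepare/dead pairing for per-CPU allocations; a minimal sketch with placeholder names and a placeholder state constant:

    static int foo_cpu_prepare(unsigned int cpu)
    {
            /* Runs on a control CPU before 'cpu' is brought up. */
            per_cpu_map[cpu] = kzalloc(MAP_SIZE, GFP_KERNEL);
            return per_cpu_map[cpu] ? 0 : -ENOMEM;
    }

    static int foo_cpu_dead(unsigned int cpu)
    {
            /* Runs on a control CPU after 'cpu' has gone down. */
            kfree(per_cpu_map[cpu]);
            per_cpu_map[cpu] = NULL;
            return 0;
    }

    cpuhp_setup_state_nocalls(CPUHP_FOO_PREPARE, "foo:prepare",
                              foo_cpu_prepare, foo_cpu_dead);

A fixed state in the PREPARE section of enum cpuhp_state is used here, rather than a dynamic AP state, so that the map already exists by the time the incoming CPU starts running.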
Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: rt@linutronix.de Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160818125731.27256-17-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/powerpc/mm/mmu_context_nohash.c | 56 ++++++++++++---------------- include/linux/cpuhotplug.h | 1 + 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 7d95bc402dba..c491f2c8f2b9 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -369,44 +369,34 @@ void destroy_context(struct mm_struct *mm) } #ifdef CONFIG_SMP - -static int mmu_context_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int mmu_ctx_cpu_prepare(unsigned int cpu) { - unsigned int cpu = (unsigned int)(long)hcpu; - /* We don't touch CPU 0 map, it's allocated at aboot and kept * around forever */ if (cpu == boot_cpuid) - return NOTIFY_OK; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu); - stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); - kfree(stale_map[cpu]); - stale_map[cpu] = NULL; - - /* We also clear the cpu_vm_mask bits of CPUs going away */ - clear_tasks_mm_cpumask(cpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; + return 0; + + pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu); + stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); + return 0; } -static struct notifier_block mmu_context_cpu_nb = { - .notifier_call = mmu_context_cpu_notify, -}; +static int mmu_ctx_cpu_dead(unsigned int cpu) +{ +#ifdef CONFIG_HOTPLUG_CPU + if (cpu == boot_cpuid) + return 0; + + pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); + kfree(stale_map[cpu]); + stale_map[cpu] = NULL; + + /* We also clear the cpu_vm_mask bits of CPUs going away */ + clear_tasks_mm_cpumask(cpu); +#endif + return 0; +} #endif /* CONFIG_SMP */ @@ -469,7 +459,9 @@ void __init mmu_context_init(void) #else stale_map[boot_cpuid] = memblock_virt_alloc(CTX_MAP_SIZE, 0); - register_cpu_notifier(&mmu_context_cpu_nb); + cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE, + "powerpc/mmu/ctx:prepare", + mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead); #endif printk(KERN_INFO diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 33fba43ad292..afd59e2ca4b3 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -32,6 +32,7 @@ enum cpuhp_state { CPUHP_RCUTREE_PREP, CPUHP_CPUIDLE_COUPLED_PREPARE, CPUHP_POWERPC_PMAC_PREPARE, + CPUHP_POWERPC_MMU_CTX_PREPARE, CPUHP_NOTIFY_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_BRINGUP_CPU, From bda7b072de999280ef78aaea4335ec58afc4bdb2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 7 Sep 2016 15:39:55 +0300 Subject: [PATCH 191/538] x86/platform/intel-mid: Implement power off sequence Tell SCU that we are about powering off the device. 
Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160907123955.21228-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/intel-mid.h | 2 ++ arch/x86/include/asm/intel_scu_ipc.h | 2 ++ arch/x86/platform/intel-mid/intel-mid.c | 5 +++++ arch/x86/platform/intel-mid/pwr.c | 24 +++++++++++++++++++++++- 4 files changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 9d6b097aa73d..5b6753d1f7f4 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -18,6 +18,8 @@ extern int intel_mid_pci_init(void); extern int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state); +extern void intel_mid_pwr_power_off(void); + #define INTEL_MID_PWR_LSS_OFFSET 4 #define INTEL_MID_PWR_LSS_TYPE (1 << 7) diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 925b605eb5c6..4fb1d0abef95 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -3,6 +3,8 @@ #include +#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */ + #define IPCMSG_WARM_RESET 0xF0 #define IPCMSG_COLD_RESET 0xF1 #define IPCMSG_SOFT_RESET 0xF2 diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index ce119d2ba0d0..7850128f0026 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -70,6 +70,11 @@ EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip); static void intel_mid_power_off(void) { + /* Shut down South Complex via PWRMU */ + intel_mid_pwr_power_off(); + + /* Only for Tangier, the rest will ignore this command */ + intel_scu_ipc_simple_command(IPCMSG_COLD_OFF, 1); }; static void intel_mid_reboot(void) diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index 0548741b6894..2dfe998a5afd 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -48,7 +48,15 @@ #define PM_CMD_CM_IMMEDIATE (1 << 9) #define PM_CMD_CM_DELAY (2 << 9) #define PM_CMD_CM_TRIGGER (3 << 9) -#define PM_CMD_D3cold (1 << 21) + +/* System states */ +#define PM_CMD_SYS_STATE_S5 (5 << 16) + +/* Trigger variants */ +#define PM_CMD_CFG_TRIGGER_NC (3 << 19) + +/* Message to wait for TRIGGER_NC case */ +#define TRIGGER_NC_MSG_2 (2 << 22) /* List of commands */ #define CMD_SET_CFG 0x01 @@ -264,6 +272,20 @@ int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) } EXPORT_SYMBOL_GPL(intel_mid_pci_set_power_state); +void intel_mid_pwr_power_off(void) +{ + struct mid_pwr *pwr = midpwr; + u32 cmd = PM_CMD_SYS_STATE_S5 | + PM_CMD_CMD(CMD_SET_CFG) | + PM_CMD_CM_TRIGGER | + PM_CMD_CFG_TRIGGER_NC | + TRIGGER_NC_MSG_2; + + /* Send command to SCU */ + writel(cmd, pwr->regs + PM_CMD); + mid_pwr_wait(pwr); +} + int intel_mid_pwr_get_lss_id(struct pci_dev *pdev) { int vndr; From f5fbf848303c8704d0e1a1e7cabd08fd0a49552f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 6 Sep 2016 21:42:54 +0300 Subject: [PATCH 192/538] x86/cpu: Rename Merrifield2 to Moorefield Merrifield2 is actually Moorefield. Rename it accordingly and drop tail digit from Merrifield1. 
Signed-off-by: Andy Shevchenko Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160906184254.94440-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/intel-family.h | 4 ++-- arch/x86/platform/atom/punit_atom_debug.c | 2 +- drivers/pci/pci-mid.c | 2 +- drivers/powercap/intel_rapl.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 627719475457..9ae5ab80a497 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -56,8 +56,8 @@ #define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ #define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ -#define INTEL_FAM6_ATOM_MERRIFIELD1 0x4A /* Tangier */ -#define INTEL_FAM6_ATOM_MERRIFIELD2 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ +#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Annidale */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C #define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c index 8ff7b9355416..d49d3be81953 100644 --- a/arch/x86/platform/atom/punit_atom_debug.c +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -155,7 +155,7 @@ static void punit_dbgfs_unregister(void) static const struct x86_cpu_id intel_punit_cpu_ids[] = { ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt), - ICPU(INTEL_FAM6_ATOM_MERRIFIELD1, punit_device_tng), + ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng), ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht), {} }; diff --git a/drivers/pci/pci-mid.c b/drivers/pci/pci-mid.c index c878aa71173b..b7ea64f63149 100644 --- a/drivers/pci/pci-mid.c +++ b/drivers/pci/pci-mid.c @@ -61,7 +61,7 @@ static struct pci_platform_pm_ops mid_pci_platform_pm = { #define ICPU(model) { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, } static const struct x86_cpu_id lpss_cpu_ids[] = { - ICPU(INTEL_FAM6_ATOM_MERRIFIELD1), + ICPU(INTEL_FAM6_ATOM_MERRIFIELD), {} }; diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index fbab29dfa793..243b233ff31b 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -1154,8 +1154,8 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT1, rapl_defaults_byt), RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT, rapl_defaults_cht), - RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD1, rapl_defaults_tng), - RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD2, rapl_defaults_ann), + RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD, rapl_defaults_tng), + RAPL_CPU(INTEL_FAM6_ATOM_MOOREFIELD, rapl_defaults_ann), RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT, rapl_defaults_core), RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core), From 6271cfdfc0e4731b76921ef02fdd87409d71dfdf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 30 Aug 2016 17:27:57 -0700 Subject: [PATCH 193/538] x86/mm: Improve stack-overflow #PF handling If we get a page fault indicating kernel stack overflow, invoke handle_stack_overflow(). To prevent us from overflowing the stack again while handling the overflow (because we are likely to have very little stack space left), call handle_stack_overflow() on the double-fault stack. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6d6cf96b3fb9b4c9aa303817e1dc4de0c7c36487.1472603235.git.luto@kernel.org [ Minor edit. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/traps.h | 6 ++++++ arch/x86/kernel/traps.c | 6 +++--- arch/x86/mm/fault.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c3496619740a..01fd0a7f48cd 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -117,6 +117,12 @@ extern void ist_exit(struct pt_regs *regs); extern void ist_begin_non_atomic(struct pt_regs *regs); extern void ist_end_non_atomic(void); +#ifdef CONFIG_VMAP_STACK +void __noreturn handle_stack_overflow(const char *message, + struct pt_regs *regs, + unsigned long fault_address); +#endif + /* Interrupts/Exceptions */ enum { X86_TRAP_DE = 0, /* 0, Divide-by-zero */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 907b4e4aeb5e..bd4e3d4d3625 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -293,9 +293,9 @@ DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) #ifdef CONFIG_VMAP_STACK -static void __noreturn handle_stack_overflow(const char *message, - struct pt_regs *regs, - unsigned long fault_address) +__visible void __noreturn handle_stack_overflow(const char *message, + struct pt_regs *regs, + unsigned long fault_address) { printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", (void *)fault_address, current->stack, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index dc8023060456..0b92fce3e6c0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -753,6 +753,38 @@ no_context(struct pt_regs *regs, unsigned long error_code, return; } +#ifdef CONFIG_VMAP_STACK + /* + * Stack overflow? During boot, we can fault near the initial + * stack in the direct map, but that's not an overflow -- check + * that we're in vmalloc space to avoid this. + */ + if (is_vmalloc_addr((void *)address) && + (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || + address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { + register void *__sp asm("rsp"); + unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); + /* + * We're likely to be running with very little stack space + * left. It's plausible that we'd hit this condition but + * double-fault even before we get this far, in which case + * we're fine: the double-fault handler will deal with it. + * + * We don't want to make it all the way into the oops code + * and then double-fault, though, because we're likely to + * break the console driver and lose most of the stack dump. + */ + asm volatile ("movq %[stack], %%rsp\n\t" + "call handle_stack_overflow\n\t" + "1: jmp 1b" + : "+r" (__sp) + : "D" ("kernel stack overflow (page fault)"), + "S" (regs), "d" (address), + [stack] "rm" (stack)); + unreachable(); + } +#endif + /* * 32-bit: * From 9472fe7040bba45c6200858cbe40d643cf02bccb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 30 Aug 2016 08:04:15 -0700 Subject: [PATCH 194/538] virtio_console: Stop doing DMA on the stack virtio_console uses a small DMA buffer for control requests. Move that buffer into heap memory. 
Doing virtio DMA on the stack is normally okay on non-DMA-API virtio systems (which is currently most of them), but it breaks completely if the stack is virtually mapped (CONFIG_VMAP_STACK=y). Tested by typing both directions using picocom aimed at /dev/hvc0. Signed-off-by: Andy Lutomirski Reviewed-by: Amit Shah Cc: Linus Torvalds Cc: Michael S. Tsirkin Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: virtualization@lists.linux-foundation.org Link: http://lkml.kernel.org/r/0afe68f9b4be6c95af9e7672b07acd0274c26dfe.1472569320.git.luto@kernel.org Signed-off-by: Ingo Molnar --- drivers/char/virtio_console.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index d2406fe25533..5da47e26a012 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -165,6 +165,12 @@ struct ports_device { */ struct virtqueue *c_ivq, *c_ovq; + /* + * A control packet buffer for guest->host requests, protected + * by c_ovq_lock. + */ + struct virtio_console_control cpkt; + /* Array of per-port IO virtqueues */ struct virtqueue **in_vqs, **out_vqs; @@ -560,28 +566,29 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id, unsigned int event, unsigned int value) { struct scatterlist sg[1]; - struct virtio_console_control cpkt; struct virtqueue *vq; unsigned int len; if (!use_multiport(portdev)) return 0; - cpkt.id = cpu_to_virtio32(portdev->vdev, port_id); - cpkt.event = cpu_to_virtio16(portdev->vdev, event); - cpkt.value = cpu_to_virtio16(portdev->vdev, value); - vq = portdev->c_ovq; - sg_init_one(sg, &cpkt, sizeof(cpkt)); - spin_lock(&portdev->c_ovq_lock); - if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) { + + portdev->cpkt.id = cpu_to_virtio32(portdev->vdev, port_id); + portdev->cpkt.event = cpu_to_virtio16(portdev->vdev, event); + portdev->cpkt.value = cpu_to_virtio16(portdev->vdev, value); + + sg_init_one(sg, &portdev->cpkt, sizeof(struct virtio_console_control)); + + if (virtqueue_add_outbuf(vq, sg, 1, &portdev->cpkt, GFP_ATOMIC) == 0) { virtqueue_kick(vq); while (!virtqueue_get_buf(vq, &len) && !virtqueue_is_broken(vq)) cpu_relax(); } + spin_unlock(&portdev->c_ovq_lock); return 0; } From 1d723de7396c1c028a091a37b2211ff6892c7f52 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 28 Aug 2016 11:51:06 +0100 Subject: [PATCH 195/538] selftests/x86: Fix spelling mistake "preseve" -> "preserve" Trivial fix to spelling mistakes in printf messages. 
Signed-off-by: Colin Ian King Acked-by: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: linux-kselftest@vger.kernel.org Link: http://lkml.kernel.org/r/20160828105106.9763-1-colin.king@canonical.com Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/ptrace_syscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index 421456784bc6..b037ce9cf116 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -147,7 +147,7 @@ static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *)) if (args.nr != getpid() || args.arg0 != 10 || args.arg1 != 11 || args.arg2 != 12 || args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) { - printf("[FAIL]\tgetpid() failed to preseve regs\n"); + printf("[FAIL]\tgetpid() failed to preserve regs\n"); nerrs++; } else { printf("[OK]\tgetpid() preserves regs\n"); @@ -162,7 +162,7 @@ static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *)) if (args.nr != 0 || args.arg0 != getpid() || args.arg1 != SIGUSR1 || args.arg2 != 12 || args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) { - printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preseve regs\n"); + printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preserve regs\n"); nerrs++; } else { printf("[OK]\tkill(getpid(), SIGUSR1) preserves regs\n"); From 019e579d395733d14097c2d29c8c43226dad1617 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 24 Aug 2016 11:50:14 -0500 Subject: [PATCH 196/538] perf/x86: Check perf_callchain_store() error Add a check to perf_callchain_kernel() so that it returns early if the callchain entry array is already full. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Brian Gerst Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/dce6d60bab08be2600efd90021d9b85620646161.1472057064.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index d0efb5cb1b00..c1319ac19ebb 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2277,7 +2277,8 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re return; } - perf_callchain_store(entry, regs->ip); + if (perf_callchain_store(entry, regs->ip)) + return; dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } From 3e344a0db900757caaf0beeb749de4c7b59bfd60 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 24 Aug 2016 11:50:15 -0500 Subject: [PATCH 197/538] oprofile/x86: Add regs->ip to oprofile trace dump_trace() doesn't add the interrupted instruction's address to the trace, so add it manually. This makes the profile more useful, and also makes it more consistent with what perf profiling does. 
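The resulting flow is, in essence (sketch of the fixed path):

    if (!depth)
            return;

    oprofile_add_trace(regs->ip);   /* the interrupted instruction */
    if (!--depth)
            return;

    dump_trace(NULL, regs, (unsigned long *)stack, 0, &backtrace_ops, &depth);

so one slot of the requested depth is consumed by the sample address itself and the remaining slots by its callers.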
Signed-off-by: Josh Poimboeuf Acked-by: Robert Richter Cc: Andy Lutomirski Cc: Brian Gerst Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6c745a83dbd69fc6857ef9b2f8be0f011d775936.1472057064.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/oprofile/backtrace.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index cb31a4440e58..2ef6c8b56311 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -114,9 +114,16 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) if (!user_mode(regs)) { unsigned long stack = kernel_stack_pointer(regs); - if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, 0, - &backtrace_ops, &depth); + + if (!depth) + return; + + oprofile_add_trace(regs->ip); + if (!--depth) + return; + + dump_trace(NULL, regs, (unsigned long *)stack, 0, + &backtrace_ops, &depth); return; } From d438f5fda30ec087512355e405e9c8955d8bd337 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 24 Aug 2016 11:50:16 -0500 Subject: [PATCH 198/538] x86/dumpstack: Make printk_stack_address() more generally useful Change printk_stack_address() to be useful when called by an unwinder outside the context of dump_trace(). Specifically: - printk_stack_address()'s 'data' argument is always used as the log level string. Make that explicit. - Call touch_nmi_watchdog(). Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Brian Gerst Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9fbe0db05bacf66d337c162edbf61450d0cff1e2.1472057064.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 01072e9e165e..f0ddf855957e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -26,10 +26,11 @@ int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; static void printk_stack_address(unsigned long address, int reliable, - void *data) + char *log_lvl) { + touch_nmi_watchdog(); printk("%s [<%p>] %s%pB\n", - (char *)data, (void *)address, reliable ? "" : "? ", + log_lvl, (void *)address, reliable ? "" : "? ", (void *)address); } @@ -148,7 +149,6 @@ static int print_trace_stack(void *data, char *name) */ static int print_trace_address(void *data, unsigned long addr, int reliable) { - touch_nmi_watchdog(); printk_stack_address(addr, reliable, data); return 0; } From 4b8afafbe743be1a81c96ddcd75b19c534d5e262 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 24 Aug 2016 11:50:17 -0500 Subject: [PATCH 199/538] x86/dumpstack: Add get_stack_pointer() and get_frame_pointer() The various functions involved in dumping the stack all do similar things with regard to getting the stack pointer and the frame pointer based on the regs and task arguments. Create helper functions to do that instead. 
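Both helpers encode the same precedence; a simplified sketch of the stack-pointer variant:

    static inline unsigned long *
    get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
    {
            if (regs)                       /* an interrupted context */
                    return (unsigned long *)kernel_stack_pointer(regs);

            if (!task || task == current)   /* our own, live stack */
                    return __builtin_frame_address(0);

            return (unsigned long *)task->thread.sp;  /* a sleeping task */
    }

Callers then reduce to "stack = stack ? : get_stack_pointer(task, regs);" instead of open-coding the three cases.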
Signed-off-by: Josh Poimboeuf Reviewed-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Brian Gerst Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f448914885a35f333fe04da1b97a6c2cc1f80974.1472057064.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 41 +++++++++++++++++-------------- arch/x86/kernel/dumpstack.c | 5 ++-- arch/x86/kernel/dumpstack_32.c | 25 +++---------------- arch/x86/kernel/dumpstack_64.c | 30 +++------------------- 4 files changed, 33 insertions(+), 68 deletions(-) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 7646fb2772f8..3552f5e7189e 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -50,36 +50,41 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) #else #define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif #ifdef CONFIG_FRAME_POINTER -static inline unsigned long -stack_frame(struct task_struct *task, struct pt_regs *regs) +static inline unsigned long * +get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { - unsigned long bp; - if (regs) - return regs->bp; + return (unsigned long *)regs->bp; - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - return bp; - } + if (!task || task == current) + return __builtin_frame_address(0); - return ((struct inactive_task_frame *)task->thread.sp)->bp; + return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp; } #else -static inline unsigned long -stack_frame(struct task_struct *task, struct pt_regs *regs) +static inline unsigned long * +get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { - return 0; + return NULL; +} +#endif /* CONFIG_FRAME_POINTER */ + +static inline unsigned long * +get_stack_pointer(struct task_struct *task, struct pt_regs *regs) +{ + if (regs) + return (unsigned long *)kernel_stack_pointer(regs); + + if (!task || task == current) + return __builtin_frame_address(0); + + return (unsigned long *)task->thread.sp; } -#endif extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, @@ -106,7 +111,7 @@ static inline unsigned long caller_frame_pointer(void) { struct stack_frame *frame; - get_bp(frame); + frame = __builtin_frame_address(0); #ifdef CONFIG_FRAME_POINTER frame = frame->next_frame; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f0ddf855957e..6d6f46837eea 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -170,15 +170,14 @@ show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, void show_stack(struct task_struct *task, unsigned long *sp) { unsigned long bp = 0; - unsigned long stack; /* * Stack frames below this one aren't interesting. Don't show them * if we're printing for %current. 
*/ if (!sp && (!task || task == current)) { - sp = &stack; - bp = stack_frame(current, NULL); + sp = get_stack_pointer(current, NULL); + bp = (unsigned long)get_frame_pointer(current, NULL); } show_stack_log_lvl(task, NULL, sp, bp, ""); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 09675712eba8..358fe1cd4e5b 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -46,19 +46,9 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, int graph = 0; u32 *prev_esp; - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - - stack = &dummy; - if (task != current) - stack = (unsigned long *)task->thread.sp; - } - - if (!bp) - bp = stack_frame(task, regs); + task = task ? : current; + stack = stack ? : get_stack_pointer(task, regs); + bp = bp ? : (unsigned long)get_frame_pointer(task, regs); for (;;) { void *end_stack; @@ -95,14 +85,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack; int i; - if (sp == NULL) { - if (regs) - sp = (unsigned long *)regs->sp; - else if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } + sp = sp ? : get_stack_pointer(task, regs); stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 066eb5c77fd6..7f3b8066f719 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -151,25 +151,14 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, { const unsigned cpu = get_cpu(); unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); - unsigned long dummy; unsigned used = 0; int graph = 0; int done = 0; - if (!task) - task = current; + task = task ? : current; + stack = stack ? : get_stack_pointer(task, regs); + bp = bp ? : (unsigned long)get_frame_pointer(task, regs); - if (!stack) { - if (regs) - stack = (unsigned long *)regs->sp; - else if (task != current) - stack = (unsigned long *)task->thread.sp; - else - stack = &dummy; - } - - if (!bp) - bp = stack_frame(task, regs); /* * Print function call entries in all stacks, starting at the * current stack address. If the stacks consist of nested @@ -256,18 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - /* - * Debugging aid: "show_stack(NULL, NULL);" prints the - * back trace for this cpu: - */ - if (sp == NULL) { - if (regs) - sp = (unsigned long *)regs->sp; - else if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } + sp = sp ? : get_stack_pointer(task, regs); stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { From 5a8ff54c260ecfed3de9b8d1272eb87826935df8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 24 Aug 2016 11:50:18 -0500 Subject: [PATCH 200/538] x86/dumpstack: Remove unnecessary stack pointer arguments When calling show_stack_log_lvl() or dump_trace() with a regs argument, providing a stack pointer or frame pointer is redundant. 
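A typical caller change then looks like this (illustrative):

    /* before: the caller extracts sp/bp from regs by hand */
    show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, "");

    /* after: sp and bp are derived from regs inside the dumper */
    show_stack_log_lvl(current, regs, NULL, 0, "");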
Signed-off-by: Josh Poimboeuf d Reviewed-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Brian Gerst Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1694e2e955e3b9a73a3c3d5ba2634344014dd550.1472057064.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/dumpstack_32.c | 2 +- arch/x86/kernel/dumpstack_64.c | 5 +---- arch/x86/oprofile/backtrace.c | 5 +---- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6d6f46837eea..c6c6c39c367f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -185,7 +185,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, ""); + show_stack_log_lvl(current, regs, NULL, 0, ""); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 358fe1cd4e5b..c533b8b5a373 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -122,7 +122,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; pr_emerg("Stack:\n"); - show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); + show_stack_log_lvl(NULL, regs, NULL, 0, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 7f3b8066f719..b243352c779e 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -283,9 +283,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, void show_regs(struct pt_regs *regs) { int i; - unsigned long sp; - sp = regs->sp; show_regs_print_info(KERN_DEFAULT); __show_regs(regs, 1); @@ -300,8 +298,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - 0, KERN_DEFAULT); + show_stack_log_lvl(NULL, regs, NULL, 0, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 2ef6c8b56311..d950f9ea9a8c 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -113,8 +113,6 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); if (!user_mode(regs)) { - unsigned long stack = kernel_stack_pointer(regs); - if (!depth) return; @@ -122,8 +120,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) if (!--depth) return; - dump_trace(NULL, regs, (unsigned long *)stack, 0, - &backtrace_ops, &depth); + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, &depth); return; } From 3ec979658e5cc0fab86a42af79a650299e4d7135 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 20 Aug 2016 01:40:13 +0000 Subject: [PATCH 201/538] x86/e820: Fix very large 'size' handling boundary condition The (start, size) tuple represents a range [start, start + size - 1], which means "start" and "start + size - 1" should be compared to see whether the range overflows. For example, a range with (start, size): (0xffffffff fffffff0, 0x00000000 00000010) represents [0xffffffff fffffff0, 0xffffffff ffffffff] ... would be judged overflow in the original code, while actually it is not. This patch fixes this and makes sure it still works when size is zero. 
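Working through the example above with the fixed check (illustrative arithmetic, not kernel code):

    u64 start = 0xfffffffffffffff0ULL;
    u64 size  = 0x0000000000000010ULL;
    u64 end   = start + size - 1;           /* 0xffffffffffffffff, no wrap */

    if (start > end && likely(size))        /* false: the range is accepted */
            return -1;

With the old "end = start + size", end wrapped around to 0 and the valid map was rejected. The "likely(size)" term covers the size == 0 case, where start + size - 1 underflows to start - 1 and would otherwise be misread as an overflow.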
Signed-off-by: Wei Yang Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yinghai@kernel.org Link: http://lkml.kernel.org/r/1471657213-31817-1-git-send-email-richard.weiyang@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 621b501f8935..871f1863457d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -388,11 +388,11 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) while (nr_map) { u64 start = biosmap->addr; u64 size = biosmap->size; - u64 end = start + size; + u64 end = start + size - 1; u32 type = biosmap->type; /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) + if (start > end && likely(size)) return -1; e820_add_region(start, size, type); From 8e522e1d321b12829960c9b26668c92f14c68d7f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 8 Sep 2016 13:32:31 +0300 Subject: [PATCH 202/538] x86/platform/intel-mid: Add Intel Penwell to ID table Commit: ca22312dc840 ("x86/platform/intel-mid: Extend PWRMU to support Penwell") ... enabled the PWRMU driver on platforms based on Intel Penwell, but unfortunately this is not enough. Add Intel Penwell ID to pci-mid.c driver as well. To avoid confusion in the future add a comment to both drivers. Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: ca22312dc840 ("x86/platform/intel-mid: Extend PWRMU to support Penwell") Link: http://lkml.kernel.org/r/20160908103232.137587-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/pwr.c | 1 + drivers/pci/pci-mid.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index 2dfe998a5afd..146ed54e92e5 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -427,6 +427,7 @@ static const struct mid_pwr_device_info mid_info = { .set_initial_state = mid_set_initial_state, }; +/* This table should be in sync with the one in drivers/pci/pci-mid.c */ static const struct pci_device_id mid_pwr_pci_ids[] = { { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PENWELL), (kernel_ulong_t)&mid_info }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&mid_info }, diff --git a/drivers/pci/pci-mid.c b/drivers/pci/pci-mid.c index b7ea64f63149..55f453de562e 100644 --- a/drivers/pci/pci-mid.c +++ b/drivers/pci/pci-mid.c @@ -60,7 +60,12 @@ static struct pci_platform_pm_ops mid_pci_platform_pm = { #define ICPU(model) { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, } +/* + * This table should be in sync with the one in + * arch/x86/platform/intel-mid/pwr.c. + */ static const struct x86_cpu_id lpss_cpu_ids[] = { + ICPU(INTEL_FAM6_ATOM_PENWELL), ICPU(INTEL_FAM6_ATOM_MERRIFIELD), {} }; From f43ea76cf310c3be95cb75ae1350cbe76a8f2380 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 8 Sep 2016 13:32:32 +0300 Subject: [PATCH 203/538] x86/platform/intel-mid: Keep SRAM powered on at boot On Penwell SRAM has to be powered on, otherwise it prevents booting. 
Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: ca22312dc840 ("x86/platform/intel-mid: Extend PWRMU to support Penwell") Link: http://lkml.kernel.org/r/20160908103232.137587-2-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/pwr.c | 45 ++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index 146ed54e92e5..5d3b45ad1c03 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -380,7 +380,7 @@ static int mid_pwr_probe(struct pci_dev *pdev, const struct pci_device_id *id) return 0; } -static int mid_set_initial_state(struct mid_pwr *pwr) +static int mid_set_initial_state(struct mid_pwr *pwr, const u32 *states) { unsigned int i, j; int ret; @@ -405,10 +405,10 @@ static int mid_set_initial_state(struct mid_pwr *pwr) * NOTE: The actual device mapping is provided by a platform at run * time using vendor capability of PCI configuration space. */ - mid_pwr_set_state(pwr, 0, 0xffffffff); - mid_pwr_set_state(pwr, 1, 0xffffffff); - mid_pwr_set_state(pwr, 2, 0xffffffff); - mid_pwr_set_state(pwr, 3, 0xffffffff); + mid_pwr_set_state(pwr, 0, states[0]); + mid_pwr_set_state(pwr, 1, states[1]); + mid_pwr_set_state(pwr, 2, states[2]); + mid_pwr_set_state(pwr, 3, states[3]); /* Send command to SCU */ ret = mid_pwr_wait_for_cmd(pwr, CMD_SET_CFG); @@ -423,14 +423,41 @@ static int mid_set_initial_state(struct mid_pwr *pwr) return 0; } -static const struct mid_pwr_device_info mid_info = { - .set_initial_state = mid_set_initial_state, +static int pnw_set_initial_state(struct mid_pwr *pwr) +{ + /* On Penwell SRAM must stay powered on */ + const u32 states[] = { + 0xf00fffff, /* PM_SSC(0) */ + 0xffffffff, /* PM_SSC(1) */ + 0xffffffff, /* PM_SSC(2) */ + 0xffffffff, /* PM_SSC(3) */ + }; + return mid_set_initial_state(pwr, states); +} + +static int tng_set_initial_state(struct mid_pwr *pwr) +{ + const u32 states[] = { + 0xffffffff, /* PM_SSC(0) */ + 0xffffffff, /* PM_SSC(1) */ + 0xffffffff, /* PM_SSC(2) */ + 0xffffffff, /* PM_SSC(3) */ + }; + return mid_set_initial_state(pwr, states); +} + +static const struct mid_pwr_device_info pnw_info = { + .set_initial_state = pnw_set_initial_state, +}; + +static const struct mid_pwr_device_info tng_info = { + .set_initial_state = tng_set_initial_state, }; /* This table should be in sync with the one in drivers/pci/pci-mid.c */ static const struct pci_device_id mid_pwr_pci_ids[] = { - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PENWELL), (kernel_ulong_t)&mid_info }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&mid_info }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PENWELL), (kernel_ulong_t)&pnw_info }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&tng_info }, {} }; From 2f30ea5090cbc57ea573cdc66421264b3de3fb0a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 8 Sep 2016 18:09:57 +0200 Subject: [PATCH 204/538] xfrm_user: propagate sec ctx allocation errors When we fail to attach the security context in xfrm_state_construct() we'll return 0 as error value which, in turn, will wrongly claim success to userland when, in fact, we won't be adding / updating the XFRM state. This is a regression introduced by commit fd21150a0fe1 ("[XFRM] netlink: Inline attach_encap_tmpl(), attach_sec_ctx(), and attach_one_addr()"). Fix it by propagating the error returned by security_xfrm_state_alloc() in this case. 
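Reduced to a sketch (names illustrative), the broken pattern was:

    int err;

    err = earlier_step(x);          /* succeeded, err == 0 */
    if (err)
            goto error;

    if (attr && alloc_ctx(x, attr)) /* the failure code is thrown away... */
            goto error;             /* ...and err is still 0 here */
    ...
error:
    /* cleanup */
    return err;                     /* caller sees success */

Storing the return value of security_xfrm_state_alloc() in err before jumping to the error label, as done below, makes the failure visible to userland.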
Fixes: fd21150a0fe1 ("[XFRM] netlink: Inline attach_encap_tmpl()...") Signed-off-by: Mathias Krause Cc: Thomas Graf Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index cb65d916a345..08892091cfe3 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -581,9 +581,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (err) goto error; - if (attrs[XFRMA_SEC_CTX] && - security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX]))) - goto error; + if (attrs[XFRMA_SEC_CTX]) { + err = security_xfrm_state_alloc(x, + nla_data(attrs[XFRMA_SEC_CTX])); + if (err) + goto error; + } if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) From 1fb81e09d487656aa23f2acb1232c7f56b4c2367 Mon Sep 17 00:00:00 2001 From: "thomas.zeitlhofer+lkml@ze-it.at" Date: Wed, 7 Sep 2016 20:40:38 +0200 Subject: [PATCH 205/538] vti: use right inner_mode for inbound inter address family policy checks In case of inter address family tunneling (IPv6 over vti4 or IPv4 over vti6), the inbound policy checks in vti_rcv_cb() and vti6_rcv_cb() are using the wrong address family. As a result, all inbound inter address family traffic is dropped. Use the xfrm_ip2inner_mode() helper, as done in xfrm_input() (i.e., also increment LINUX_MIB_XFRMINSTATEMODEERROR in case of error), to select the inner_mode that contains the right address family for the inbound policy checks. Signed-off-by: Thomas Zeitlhofer Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 15 ++++++++++++++- net/ipv6/ip6_vti.c | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index cc701fa70b12..5d7944f394d9 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -88,6 +88,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; + struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; u32 orig_mark = skb->mark; int ret; @@ -105,7 +106,19 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) } x = xfrm_input_state(skb); - family = x->inner_mode->afinfo->family; + + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + family = inner_mode->afinfo->family; skb->mark = be32_to_cpu(tunnel->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index d90a11f14040..52a2f735881f 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -340,6 +340,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; + struct xfrm_mode *inner_mode; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; u32 orig_mark = skb->mark; int ret; @@ -357,7 +358,19 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) } x = xfrm_input_state(skb); - family = x->inner_mode->afinfo->family; + + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } 
+ + family = inner_mode->afinfo->family; skb->mark = be32_to_cpu(t->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); From e8c6226d483cb28f55cab718065ea1b7226d40e8 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 29 Jul 2016 09:30:10 -0700 Subject: [PATCH 206/538] x86/pkeys: Add fault handling for PF_PK page fault bit PF_PK means that a memory access violated the protection key access restrictions. It is unconditionally an access_error() because the permissions set on the VMA don't matter (the PKRU value overrides it), and we never "resolve" PK faults (like how a COW can "resolve write fault). Signed-off-by: Dave Hansen Acked-by: Mel Gorman Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: arnd@arndb.de Cc: linux-api@vger.kernel.org Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/20160729163010.DD1FE1ED@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index dc8023060456..b88d8acb3ab5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1112,6 +1112,15 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) { /* This is only called for the current mm, so: */ bool foreign = false; + + /* + * Read or write was blocked by protection keys. This is + * always an unconditional error and can never result in + * a follow-up action to resolve the fault, like a COW. + */ + if (error_code & PF_PK) + return 1; + /* * Make sure to check the VMA so that we do not perform * faults just to hit a PF_PK as soon as we fill in a From 7d06d9c9bd813fc956b9c7bffc1b9724009983eb Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 29 Jul 2016 09:30:12 -0700 Subject: [PATCH 207/538] mm: Implement new pkey_mprotect() system call pkey_mprotect() is just like mprotect, except it also takes a protection key as an argument. On systems that do not support protection keys, it still works, but requires that key=0. Otherwise it does exactly what mprotect does. I expect it to get used like this, if you want to guarantee that any mapping you create can *never* be accessed without the right protection keys set up. int real_prot = PROT_READ|PROT_WRITE; pkey = pkey_alloc(0, PKEY_DENY_ACCESS); ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey); This way, there is *no* window where the mapping is accessible since it was always either PROT_NONE or had a protection key set that denied all access. We settled on 'unsigned long' for the type of the key here. We only need 4 bits on x86 today, but I figured that other architectures might need some more space. Semantically, we have a bit of a problem if we combine this syscall with our previously-introduced execute-only support: What do we do when we mix execute-only pkey use with pkey_mprotect() use? For instance: pkey_mprotect(ptr, PAGE_SIZE, PROT_WRITE, 6); // set pkey=6 mprotect(ptr, PAGE_SIZE, PROT_EXEC); // set pkey=X_ONLY_PKEY? mprotect(ptr, PAGE_SIZE, PROT_WRITE); // is pkey=6 again? To solve that, we make the plain-mprotect()-initiated execute-only support only apply to VMAs that have the default protection key (0) set on them. Proposed semantics: 1. protection key 0 is special and represents the default, "unassigned" protection key. It is always allocated. 2. mprotect() never affects a mapping's pkey_mprotect()-assigned protection key. 
A protection key of 0 (even if set explicitly) represents an unassigned protection key. 2a. mprotect(PROT_EXEC) on a mapping with an assigned protection key may or may not result in a mapping with execute-only properties. pkey_mprotect() plus pkey_set() on all threads should be used to _guarantee_ execute-only semantics if this is not a strong enough semantic. 3. mprotect(PROT_EXEC) may result in an "execute-only" mapping. The kernel will internally attempt to allocate and dedicate a protection key for the purpose of execute-only mappings. This may not be possible in cases where there are no free protection keys available. It can also happen, of course, in situations where there is no hardware support for protection keys. Signed-off-by: Dave Hansen Acked-by: Mel Gorman Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: arnd@arndb.de Cc: linux-api@vger.kernel.org Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/20160729163012.3DDD36C4@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mmu_context.h | 15 ++++++++++----- arch/x86/include/asm/pkeys.h | 11 +++++++++-- include/linux/pkeys.h | 12 ------------ mm/mprotect.c | 30 ++++++++++++++++++++++++++---- 4 files changed, 45 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index d8abfcf524d1..af0251fc85ed 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -195,16 +196,20 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, mpx_notify_unmap(mm, vma, start, end); } +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS static inline int vma_pkey(struct vm_area_struct *vma) { - u16 pkey = 0; -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3; - pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; -#endif - return pkey; + + return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; +} +#else +static inline int vma_pkey(struct vm_area_struct *vma) +{ + return 0; } +#endif static inline bool __pkru_allows_pkey(u16 pkey, bool write) { diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 7b84565c916c..33777c291a85 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -1,7 +1,12 @@ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H -#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) +#define PKEY_DEDICATED_EXECUTE_ONLY 15 +/* + * Consider the PKEY_DEDICATED_EXECUTE_ONLY key unavailable. + */ +#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? \ + PKEY_DEDICATED_EXECUTE_ONLY : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); @@ -10,7 +15,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * Try to dedicate one of the protection keys to be used as an * execute-only protection key. 
*/ -#define PKEY_DEDICATED_EXECUTE_ONLY 15 extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { @@ -31,4 +35,7 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, return __arch_override_mprotect_pkey(vma, prot, pkey); } +extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val); + #endif /*_ASM_X86_PKEYS_H */ diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 1d405a2b7272..0030b4024559 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -18,16 +18,4 @@ #define PKEY_DEDICATED_EXECUTE_ONLY 0 #endif /* ! CONFIG_ARCH_HAS_PKEYS */ -/* - * This is called from mprotect_pkey(). - * - * Returns true if the protection keys is valid. - */ -static inline bool validate_pkey(int pkey) -{ - if (pkey < 0) - return false; - return (pkey < arch_max_pkey()); -} - #endif /* _LINUX_PKEYS_H */ diff --git a/mm/mprotect.c b/mm/mprotect.c index a4830f0325fe..dd3f40a2935f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -352,8 +352,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, return error; } -SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, - unsigned long, prot) +/* + * pkey==-1 when doing a legacy mprotect() + */ +static int do_mprotect_pkey(unsigned long start, size_t len, + unsigned long prot, int pkey) { unsigned long nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; @@ -361,6 +364,12 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); + /* + * A temporary safety check since we are not validating + * the pkey before we introduce the allocation code. + */ + if (pkey != -1) + return -EINVAL; prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ @@ -409,7 +418,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; - int pkey = arch_override_mprotect_pkey(vma, prot, -1); + int new_vma_pkey; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ @@ -417,7 +426,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, if (rier && (vma->vm_flags & VM_MAYEXEC)) prot |= PROT_EXEC; - newflags = calc_vm_prot_bits(prot, pkey); + new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); + newflags = calc_vm_prot_bits(prot, new_vma_pkey); newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ @@ -454,3 +464,15 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, up_write(¤t->mm->mmap_sem); return error; } + +SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, + unsigned long, prot) +{ + return do_mprotect_pkey(start, len, prot, -1); +} + +SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len, + unsigned long, prot, int, pkey) +{ + return do_mprotect_pkey(start, len, prot, pkey); +} From a8502b67d739c1d7a4542c1da0a5d98a6a58c177 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 29 Jul 2016 09:30:13 -0700 Subject: [PATCH 208/538] x86/pkeys: Make mprotect_key() mask off additional vm_flags Today, mprotect() takes 4 bits of data: PROT_READ/WRITE/EXEC/NONE. Three of those bits: READ/WRITE/EXEC get translated directly in to vma->vm_flags by calc_vm_prot_bits(). 
If a bit is unset in mprotect()'s 'prot' argument then it must be cleared in vma->vm_flags during the mprotect() call. We do this clearing today by first calculating the VMA flags we want set, then clearing the ones we do not want to inherit from the original VMA: vm_flags = calc_vm_prot_bits(prot, key); ... newflags = vm_flags; newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); However, we *also* want to mask off the original VMA's vm_flags in which we store the protection key. To do that, this patch adds a new macro: ARCH_VM_PKEY_FLAGS which allows the architecture to specify additional bits that it would like cleared. We use that to ensure that the VM_PKEY_BIT* bits get cleared. Signed-off-by: Dave Hansen Acked-by: Mel Gorman Reviewed-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: arnd@arndb.de Cc: linux-api@vger.kernel.org Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/20160729163013.E48D6981@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pkeys.h | 2 ++ include/linux/pkeys.h | 1 + mm/mprotect.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 33777c291a85..666ffc862ef7 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -38,4 +38,6 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) + #endif /*_ASM_X86_PKEYS_H */ diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 0030b4024559..6899b0bc7ce0 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -16,6 +16,7 @@ #define execute_only_pkey(mm) (0) #define arch_override_mprotect_pkey(vma, prot, pkey) (0) #define PKEY_DEDICATED_EXECUTE_ONLY 0 +#define ARCH_VM_PKEY_FLAGS 0 #endif /* ! CONFIG_ARCH_HAS_PKEYS */ #endif /* _LINUX_PKEYS_H */ diff --git a/mm/mprotect.c b/mm/mprotect.c index dd3f40a2935f..abd9c8257b2e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -417,6 +417,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, prev = vma; for (nstart = start ; ; ) { + unsigned long mask_off_old_flags; unsigned long newflags; int new_vma_pkey; @@ -426,9 +427,17 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (rier && (vma->vm_flags & VM_MAYEXEC)) prot |= PROT_EXEC; + /* + * Each mprotect() call explicitly passes r/w/x permissions. + * If a permission is not passed to mprotect(), it must be + * cleared from the VMA. 
+ */ + mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC | + ARCH_VM_PKEY_FLAGS; + new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); newflags = calc_vm_prot_bits(prot, new_vma_pkey); - newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + newflags |= (vma->vm_flags & ~mask_off_old_flags); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { From e8c24d3a23a469f1f40d4de24d872ca7023ced0a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 29 Jul 2016 09:30:15 -0700 Subject: [PATCH 209/538] x86/pkeys: Allocation/free syscalls This patch adds two new system calls: int pkey_alloc(unsigned long flags, unsigned long init_access_rights) int pkey_free(int pkey); These implement an "allocator" for the protection keys themselves, which can be thought of as analogous to the allocator that the kernel has for file descriptors. The kernel tracks which numbers are in use, and only allows operations on keys that are valid. A key which was not obtained by pkey_alloc() may not, for instance, be passed to pkey_mprotect(). These system calls are also very important given the kernel's use of pkeys to implement execute-only support. These help ensure that userspace can never assume that it has control of a key unless it first asks the kernel. The kernel does not promise to preserve PKRU (right register) contents except for allocated pkeys. The 'init_access_rights' argument to pkey_alloc() specifies the rights that will be established for the returned pkey. For instance: pkey = pkey_alloc(flags, PKEY_DENY_WRITE); will allocate 'pkey', but also sets the bits in PKRU[1] such that writing to 'pkey' is already denied. The kernel does not prevent pkey_free() from successfully freeing in-use pkeys (those still assigned to a memory range by pkey_mprotect()). It would be expensive to implement the checks for this, so we instead say, "Just don't do it" since sane software will never do it anyway. Any piece of userspace calling pkey_alloc() needs to be prepared for it to fail. Why? pkey_alloc() returns the same error code (ENOSPC) when there are no pkeys and when pkeys are unsupported. They can be unsupported for a whole host of reasons, so apps must be prepared for this. Also, libraries or LD_PRELOADs might steal keys before an application gets access to them. This allocation mechanism could be implemented in userspace. Even if we did it in userspace, we would still need additional user/kernel interfaces to tell userspace which keys are being used by the kernel internally (such as for execute-only mappings). Having the kernel provide this facility completely removes the need for these additional interfaces, or having an implementation of this in userspace at all. Note that we have to make changes to all of the architectures that do not use mman-common.h because we use the new PKEY_DENY_ACCESS/WRITE macros in arch-independent code. 1. PKRU is the Protection Key Rights User register. It is a usermode-accessible register that controls whether writes and/or access to each individual pkey is allowed or denied. 
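To make the allocate/protect/free flow described above concrete, here is a minimal userspace sketch. It is illustrative only: it invokes the raw syscalls via syscall(2) (no libc wrappers existed for these calls at the time), assumes SYS_pkey_alloc, SYS_pkey_mprotect and SYS_pkey_free resolve to the numbers wired up for the running architecture in the following patches, and defines PKEY_DISABLE_WRITE locally in case the installed uapi headers predate this series.

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #ifndef PKEY_DISABLE_WRITE
  #define PKEY_DISABLE_WRITE 0x2	/* matches the uapi value added above */
  #endif

  int main(void)
  {
  	long page = sysconf(_SC_PAGESIZE);
  	void *ptr;
  	int pkey, ret;

  	/* Ask for a key whose initial rights already deny writes. */
  	pkey = syscall(SYS_pkey_alloc, 0, PKEY_DISABLE_WRITE);
  	if (pkey < 0) {
  		/* ENOSPC: no keys left *or* pkeys unsupported -- callers must cope. */
  		fprintf(stderr, "pkey_alloc: %s\n", strerror(errno));
  		return 1;
  	}

  	ptr = mmap(NULL, page, PROT_READ | PROT_WRITE,
  		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  	if (ptr == MAP_FAILED)
  		return 1;

  	/* Attach the key; the r/w/x permissions still come from 'prot'. */
  	ret = syscall(SYS_pkey_mprotect, ptr, page, PROT_READ | PROT_WRITE, pkey);
  	if (ret)
  		return 1;

  	/* ... run with writes denied, flipping rights via WRPKRU as needed ... */

  	munmap(ptr, page);
  	/* The kernel will not stop you freeing an in-use key; unmap first. */
  	syscall(SYS_pkey_free, pkey);
  	return 0;
  }

Note that a failing pkey_alloc() has to be treated simply as "no pkeys available", whether the cause is missing hardware support or other code in the process having allocated all the keys already.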
Signed-off-by: Dave Hansen Acked-by: Mel Gorman Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: arnd@arndb.de Cc: linux-api@vger.kernel.org Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/20160729163015.444FE75F@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- arch/alpha/include/uapi/asm/mman.h | 5 ++ arch/mips/include/uapi/asm/mman.h | 5 ++ arch/parisc/include/uapi/asm/mman.h | 5 ++ arch/x86/include/asm/mmu.h | 8 +++ arch/x86/include/asm/mmu_context.h | 10 +++- arch/x86/include/asm/pkeys.h | 73 +++++++++++++++++++++++--- arch/x86/kernel/fpu/xstate.c | 5 +- arch/x86/mm/pkeys.c | 38 +++++++++++--- arch/xtensa/include/uapi/asm/mman.h | 5 ++ include/linux/pkeys.h | 28 ++++++++-- include/uapi/asm-generic/mman-common.h | 5 ++ mm/mprotect.c | 61 ++++++++++++++++++--- 12 files changed, 221 insertions(+), 27 deletions(-) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index fec1947b8dbc..02760f6e6ca4 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -78,4 +78,9 @@ #define MAP_HUGE_SHIFT 26 #define MAP_HUGE_MASK 0x3f +#define PKEY_DISABLE_ACCESS 0x1 +#define PKEY_DISABLE_WRITE 0x2 +#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ + PKEY_DISABLE_WRITE) + #endif /* __ALPHA_MMAN_H__ */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index ccdcfcbb24aa..655e2fb5395b 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -105,4 +105,9 @@ #define MAP_HUGE_SHIFT 26 #define MAP_HUGE_MASK 0x3f +#define PKEY_DISABLE_ACCESS 0x1 +#define PKEY_DISABLE_WRITE 0x2 +#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ + PKEY_DISABLE_WRITE) + #endif /* _ASM_MMAN_H */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index f3db7d8eb0c2..5979745815a5 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -75,4 +75,9 @@ #define MAP_HUGE_SHIFT 26 #define MAP_HUGE_MASK 0x3f +#define PKEY_DISABLE_ACCESS 0x1 +#define PKEY_DISABLE_WRITE 0x2 +#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ + PKEY_DISABLE_WRITE) + #endif /* __PARISC_MMAN_H__ */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 1ea0baef1175..72198c64e646 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -23,6 +23,14 @@ typedef struct { const struct vdso_image *vdso_image; /* vdso image in use */ atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + /* + * One bit per protection key says whether userspace can + * use it or not. protected by mmap_sem. 
+ */ + u16 pkey_allocation_map; + s16 execute_only_pkey; +#endif } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index af0251fc85ed..8e0a9fe86de4 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -108,7 +108,16 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { + /* pkey 0 is the default and always allocated */ + mm->context.pkey_allocation_map = 0x1; + /* -1 means unallocated or invalid */ + mm->context.execute_only_pkey = -1; + } + #endif init_new_context_ldt(tsk, mm); + return 0; } static inline void destroy_context(struct mm_struct *mm) @@ -263,5 +272,4 @@ static inline bool arch_pte_access_permitted(pte_t pte, bool write) { return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write); } - #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 666ffc862ef7..b406889de0db 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -1,12 +1,7 @@ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H -#define PKEY_DEDICATED_EXECUTE_ONLY 15 -/* - * Consider the PKEY_DEDICATED_EXECUTE_ONLY key unavailable. - */ -#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? \ - PKEY_DEDICATED_EXECUTE_ONLY : 1) +#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); @@ -40,4 +35,70 @@ extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) +#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) +#define mm_set_pkey_allocated(mm, pkey) do { \ + mm_pkey_allocation_map(mm) |= (1U << pkey); \ +} while (0) +#define mm_set_pkey_free(mm, pkey) do { \ + mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ +} while (0) + +static inline +bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) +{ + return mm_pkey_allocation_map(mm) & (1U << pkey); +} + +/* + * Returns a positive, 4-bit key on success, or -1 on failure. + */ +static inline +int mm_pkey_alloc(struct mm_struct *mm) +{ + /* + * Note: this is the one and only place we make sure + * that the pkey is valid as far as the hardware is + * concerned. The rest of the kernel trusts that + * only good, valid pkeys come out of here. + */ + u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); + int ret; + + /* + * Are we out of pkeys? We must handle this specially + * because ffz() behavior is undefined if there are no + * zeros. + */ + if (mm_pkey_allocation_map(mm) == all_pkeys_mask) + return -1; + + ret = ffz(mm_pkey_allocation_map(mm)); + + mm_set_pkey_allocated(mm, ret); + + return ret; +} + +static inline +int mm_pkey_free(struct mm_struct *mm, int pkey) +{ + /* + * pkey 0 is special, always allocated and can never + * be freed. 
+ */ + if (!pkey) + return -EINVAL; + if (!mm_pkey_is_allocated(mm, pkey)) + return -EINVAL; + + mm_set_pkey_free(mm, pkey); + + return 0; +} + +extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val); +extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val); + #endif /*_ASM_X86_PKEYS_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 01567aa87503..124aa5c593f8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include @@ -866,9 +867,10 @@ const void *get_xsave_field_ptr(int xsave_state) return get_xsave_addr(&fpu->state.xsave, xsave_state); } +#ifdef CONFIG_ARCH_HAS_PKEYS + #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) - /* * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. @@ -914,6 +916,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, return 0; } +#endif /* ! CONFIG_ARCH_HAS_PKEYS */ /* * This is similar to user_regset_copyout(), but will not add offset to diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index e8c474451928..e6113bbb56e1 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -21,8 +21,19 @@ int __execute_only_pkey(struct mm_struct *mm) { + bool need_to_set_mm_pkey = false; + int execute_only_pkey = mm->context.execute_only_pkey; int ret; + /* Do we need to assign a pkey for mm's execute-only maps? */ + if (execute_only_pkey == -1) { + /* Go allocate one to use, which might fail */ + execute_only_pkey = mm_pkey_alloc(mm); + if (execute_only_pkey < 0) + return -1; + need_to_set_mm_pkey = true; + } + /* * We do not want to go through the relatively costly * dance to set PKRU if we do not need to. Check it @@ -32,22 +43,33 @@ int __execute_only_pkey(struct mm_struct *mm) * can make fpregs inactive. */ preempt_disable(); - if (fpregs_active() && - !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) { + if (!need_to_set_mm_pkey && + fpregs_active() && + !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); - return PKEY_DEDICATED_EXECUTE_ONLY; + return execute_only_pkey; } preempt_enable(); - ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY, + + /* + * Set up PKRU so that it denies access for everything + * other than execution. + */ + ret = arch_set_user_pkey_access(current, execute_only_pkey, PKEY_DISABLE_ACCESS); /* * If the PKRU-set operation failed somehow, just return * 0 and effectively disable execute-only support. 
*/ - if (ret) - return 0; + if (ret) { + mm_set_pkey_free(mm, execute_only_pkey); + return -1; + } - return PKEY_DEDICATED_EXECUTE_ONLY; + /* We got one, store it and use it from here on out */ + if (need_to_set_mm_pkey) + mm->context.execute_only_pkey = execute_only_pkey; + return execute_only_pkey; } static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) @@ -55,7 +77,7 @@ static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) /* Do this check first since the vm_flags should be hot */ if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) return false; - if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY) + if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey) return false; return true; diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 9e079d49e7f2..24365b30aae9 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -117,4 +117,9 @@ #define MAP_HUGE_SHIFT 26 #define MAP_HUGE_MASK 0x3f +#define PKEY_DISABLE_ACCESS 0x1 +#define PKEY_DISABLE_WRITE 0x2 +#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ + PKEY_DISABLE_WRITE) + #endif /* _XTENSA_MMAN_H */ diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 6899b0bc7ce0..8ff21125dc8a 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -4,11 +4,6 @@ #include #include -#define PKEY_DISABLE_ACCESS 0x1 -#define PKEY_DISABLE_WRITE 0x2 -#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ - PKEY_DISABLE_WRITE) - #ifdef CONFIG_ARCH_HAS_PKEYS #include #else /* ! CONFIG_ARCH_HAS_PKEYS */ @@ -17,6 +12,29 @@ #define arch_override_mprotect_pkey(vma, prot, pkey) (0) #define PKEY_DEDICATED_EXECUTE_ONLY 0 #define ARCH_VM_PKEY_FLAGS 0 + +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) +{ + return (pkey == 0); +} + +static inline int mm_pkey_alloc(struct mm_struct *mm) +{ + return -1; +} + +static inline int mm_pkey_free(struct mm_struct *mm, int pkey) +{ + WARN_ONCE(1, "free of protection key when disabled"); + return -EINVAL; +} + +static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val) +{ + return 0; +} + #endif /* ! CONFIG_ARCH_HAS_PKEYS */ #endif /* _LINUX_PKEYS_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 58274382a616..8c27db0c5c08 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -72,4 +72,9 @@ #define MAP_HUGE_SHIFT 26 #define MAP_HUGE_MASK 0x3f +#define PKEY_DISABLE_ACCESS 0x1 +#define PKEY_DISABLE_WRITE 0x2 +#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ + PKEY_DISABLE_WRITE) + #endif /* __ASM_GENERIC_MMAN_COMMON_H */ diff --git a/mm/mprotect.c b/mm/mprotect.c index abd9c8257b2e..7b35ee3894ee 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,11 +23,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include "internal.h" @@ -364,12 +366,6 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); - /* - * A temporary safety check since we are not validating - * the pkey before we introduce the allocation code. 
- */ - if (pkey != -1) - return -EINVAL; prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ @@ -391,6 +387,14 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (down_write_killable(¤t->mm->mmap_sem)) return -EINTR; + /* + * If userspace did not allocate the pkey, do not let + * them use it here. + */ + error = -EINVAL; + if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) + goto out; + vma = find_vma(current->mm, start); error = -ENOMEM; if (!vma) @@ -485,3 +489,48 @@ SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len, { return do_mprotect_pkey(start, len, prot, pkey); } + +SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val) +{ + int pkey; + int ret; + + /* No flags supported yet. */ + if (flags) + return -EINVAL; + /* check for unsupported init values */ + if (init_val & ~PKEY_ACCESS_MASK) + return -EINVAL; + + down_write(¤t->mm->mmap_sem); + pkey = mm_pkey_alloc(current->mm); + + ret = -ENOSPC; + if (pkey == -1) + goto out; + + ret = arch_set_user_pkey_access(current, pkey, init_val); + if (ret) { + mm_pkey_free(current->mm, pkey); + goto out; + } + ret = pkey; +out: + up_write(¤t->mm->mmap_sem); + return ret; +} + +SYSCALL_DEFINE1(pkey_free, int, pkey) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + ret = mm_pkey_free(current->mm, pkey); + up_write(¤t->mm->mmap_sem); + + /* + * We could provie warnings or errors if any VMA still + * has the pkey set here. + */ + return ret; +} From f9afc6197e9bba1e2e62e262704f661810cc8bba Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 29 Jul 2016 09:30:17 -0700 Subject: [PATCH 210/538] x86: Wire up protection keys system calls This is all that we need to get the new system calls themselves working on x86. 
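For completeness, the numbers added to the tables above can be used directly while libc headers are still catching up. The snippet below is only a sketch: the fallback defines mirror the syscall_32.tbl/syscall_64.tbl entries from this patch, and pkey_alloc_raw() is a hypothetical helper name, not an established API.

  #include <unistd.h>
  #include <sys/syscall.h>

  /* Fallbacks mirroring the table entries above; only needed on old headers. */
  #ifndef __NR_pkey_alloc
  # ifdef __i386__
  #  define __NR_pkey_mprotect	380
  #  define __NR_pkey_alloc	381
  #  define __NR_pkey_free	382
  # else
  #  define __NR_pkey_mprotect	329
  #  define __NR_pkey_alloc	330
  #  define __NR_pkey_free	331
  # endif
  #endif

  static inline long pkey_alloc_raw(unsigned long flags, unsigned long init_val)
  {
  	return syscall(__NR_pkey_alloc, flags, init_val);
  }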
Signed-off-by: Dave Hansen Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: mgorman@techsingularity.net Cc: arnd@arndb.de Cc: linux-api@vger.kernel.org Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/20160729163017.E3C06FD2@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/entry/syscalls/syscall_32.tbl | 5 +++++ arch/x86/entry/syscalls/syscall_64.tbl | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index f848572169ea..ff6ef7b30822 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -386,3 +386,8 @@ 377 i386 copy_file_range sys_copy_file_range 378 i386 preadv2 sys_preadv2 compat_sys_preadv2 379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2 +380 i386 pkey_mprotect sys_pkey_mprotect +381 i386 pkey_alloc sys_pkey_alloc +382 i386 pkey_free sys_pkey_free +#383 i386 pkey_get sys_pkey_get +#384 i386 pkey_set sys_pkey_set diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index e9ce9c7c39b4..2f024d02511d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -335,6 +335,11 @@ 326 common copy_file_range sys_copy_file_range 327 64 preadv2 sys_preadv2 328 64 pwritev2 sys_pwritev2 +329 common pkey_mprotect sys_pkey_mprotect +330 common pkey_alloc sys_pkey_alloc +331 common pkey_free sys_pkey_free +#332 common pkey_get sys_pkey_get +#333 common pkey_set sys_pkey_set # # x32-specific system call numbers start at 512 to avoid cache impact From a60f7b69d92c0142c80a30d669a76b617b7f6879 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 29 Jul 2016 09:30:18 -0700 Subject: [PATCH 211/538] generic syscalls: Wire up memory protection keys syscalls These new syscalls are implemented as generic code, so enable them for architectures like arm64 which use the generic syscall table. According to Arnd: Even if the support is x86 specific for the forseeable future, it may be good to reserve the number just in case. The other architecture specific syscall lists are usually left to the individual arch maintainers, most a lot of the newer architectures share this table. 
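Since these entries exist on architectures that may never gain pkey support, a portable program has to probe at run time. A hedged sketch follows: pkeys_usable() is a made-up helper, and SYS_pkey_alloc/SYS_pkey_free are assumed to be defined as above. Old kernels fail the unknown syscall with ENOSYS, while kernels or CPUs without usable pkeys fail pkey_alloc() with ENOSPC, as described two patches earlier.

  #include <sys/syscall.h>
  #include <unistd.h>

  /* Probe once, then fall back to plain mprotect() when pkeys are unavailable. */
  static int pkeys_usable(void)
  {
  	long pkey = syscall(SYS_pkey_alloc, 0, 0);

  	if (pkey < 0)
  		return 0;		/* ENOSYS, ENOSPC, ...: no usable pkeys */
  	syscall(SYS_pkey_free, pkey);	/* probe only, give the key back */
  	return 1;
  }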
Signed-off-by: Dave Hansen Acked-by: Arnd Bergmann Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: mgorman@techsingularity.net Cc: linux-api@vger.kernel.org Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/20160729163018.505A6875@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- include/linux/syscalls.h | 8 ++++++++ include/uapi/asm-generic/unistd.h | 12 +++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d02239022bd0..0d7abb8b7315 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -898,4 +898,12 @@ asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in, asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags); +asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, + unsigned long prot, int pkey); +asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val); +asmlinkage long sys_pkey_free(int pkey); +//asmlinkage long sys_pkey_get(int pkey, unsigned long flags); +//asmlinkage long sys_pkey_set(int pkey, unsigned long access_rights, +// unsigned long flags); + #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index a26415b5151c..dbfee7e86ba6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -724,9 +724,19 @@ __SYSCALL(__NR_copy_file_range, sys_copy_file_range) __SC_COMP(__NR_preadv2, sys_preadv2, compat_sys_preadv2) #define __NR_pwritev2 287 __SC_COMP(__NR_pwritev2, sys_pwritev2, compat_sys_pwritev2) +#define __NR_pkey_mprotect 288 +__SYSCALL(__NR_pkey_mprotect, sys_pkey_mprotect) +#define __NR_pkey_alloc 289 +__SYSCALL(__NR_pkey_alloc, sys_pkey_alloc) +#define __NR_pkey_free 290 +__SYSCALL(__NR_pkey_free, sys_pkey_free) +#define __NR_pkey_get 291 +//__SYSCALL(__NR_pkey_get, sys_pkey_get) +#define __NR_pkey_set 292 +//__SYSCALL(__NR_pkey_set, sys_pkey_set) #undef __NR_syscalls -#define __NR_syscalls 288 +#define __NR_syscalls 291 /* * All syscalls below here should go away really, From c74fe3940848c6afea83bfbda64a9baf9da547c8 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 29 Jul 2016 09:30:20 -0700 Subject: [PATCH 212/538] pkeys: Add details of system call use to Documentation/ This spells out all of the pkey-related system calls that we have and provides some example code fragments to demonstrate how we expect them to be used. Signed-off-by: Dave Hansen Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: mgorman@techsingularity.net Cc: arnd@arndb.de Cc: linux-api@vger.kernel.org Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/20160729163020.59350E33@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- Documentation/x86/protection-keys.txt | 62 +++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/Documentation/x86/protection-keys.txt b/Documentation/x86/protection-keys.txt index c281ded1ba16..6da7689601d1 100644 --- a/Documentation/x86/protection-keys.txt +++ b/Documentation/x86/protection-keys.txt @@ -18,6 +18,68 @@ even though there is theoretically space in the PAE PTEs. These permissions are enforced on data access only and have no effect on instruction fetches. 
+=========================== Syscalls =========================== + +There are 2 system calls which directly interact with pkeys: + + int pkey_alloc(unsigned long flags, unsigned long init_access_rights) + int pkey_free(int pkey); + int pkey_mprotect(unsigned long start, size_t len, + unsigned long prot, int pkey); + +Before a pkey can be used, it must first be allocated with +pkey_alloc(). An application calls the WRPKRU instruction +directly in order to change access permissions to memory covered +with a key. In this example WRPKRU is wrapped by a C function +called pkey_set(). + + int real_prot = PROT_READ|PROT_WRITE; + pkey = pkey_alloc(0, PKEY_DENY_WRITE); + ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey); + ... application runs here + +Now, if the application needs to update the data at 'ptr', it can +gain access, do the update, then remove its write access: + + pkey_set(pkey, 0); // clear PKEY_DENY_WRITE + *ptr = foo; // assign something + pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again + +Now when it frees the memory, it will also free the pkey since it +is no longer in use: + + munmap(ptr, PAGE_SIZE); + pkey_free(pkey); + +=========================== Behavior =========================== + +The kernel attempts to make protection keys consistent with the +behavior of a plain mprotect(). For instance if you do this: + + mprotect(ptr, size, PROT_NONE); + something(ptr); + +you can expect the same effects with protection keys when doing this: + + pkey = pkey_alloc(0, PKEY_DISABLE_WRITE | PKEY_DISABLE_READ); + pkey_mprotect(ptr, size, PROT_READ|PROT_WRITE, pkey); + something(ptr); + +That should be true whether something() is a direct access to 'ptr' +like: + + *ptr = foo; + +or when the kernel does the access on the application's behalf like +with a read(): + + read(fd, ptr, 1); + +The kernel will send a SIGSEGV in both cases, but si_code will be set +to SEGV_PKERR when violating protection keys versus SEGV_ACCERR when +the plain mprotect() permissions are violated. + =========================== Config Option =========================== This config option adds approximately 1.5kb of text. and 50 bytes of From acd547b29880800d29222c4632d2c145e401988c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 29 Jul 2016 09:30:21 -0700 Subject: [PATCH 213/538] x86/pkeys: Default to a restrictive init PKRU PKRU is the register that lets you disallow writes or all access to a given protection key. The XSAVE hardware defines an "init state" of 0 for PKRU: its most permissive state, allowing access/writes to everything. Since we start off all new processes with the init state, we start all processes off with the most permissive possible PKRU. This is unfortunate. If a thread is clone()'d [1] before a program has time to set PKRU to a restrictive value, that thread will be able to write to all data, no matter what pkey is set on it. This weakens any integrity guarantees that we want pkeys to provide. To fix this, we define a very restrictive PKRU to override the XSAVE-provided value when we create a new FPU context. We choose a value that only allows access to pkey 0, which is as restrictive as we can practically make it. This does not cause any practical problems with applications using protection keys because we require them to specify initial permissions for each key when it is allocated, which override the restrictive default. 
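To make the chosen default concrete: PKRU holds two bits per key (an access-disable bit and a write-disable bit), and the restrictive init value sets the access-disable bit for every key except key 0. A quick illustrative sketch of that arithmetic, mirroring the PKRU_AD_KEY() computation added below (the bit values here are restated only for the example):

  /*
   * Two bits per pkey in PKRU: bit 0 of the pair is access-disable (AD),
   * bit 1 is write-disable (WD).
   */
  #define PKRU_BITS_PER_PKEY	2
  #define PKRU_AD_BIT		0x1
  #define PKRU_AD_KEY(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))

  /*
   * Setting AD for keys 1..15 and leaving key 0 alone gives
   * 0b0101...0100 = 0x55555554, the default documented for init_pkru=.
   */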
In the end, this ensures that threads which do not know how to manage their own pkey rights can not do damage to data which is pkey-protected. I would have thought this was a pretty contrived scenario, except that I heard a bug report from an MPX user who was creating threads in some very early code before main(). It may be crazy, but folks evidently _do_ it. Signed-off-by: Dave Hansen Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: mgorman@techsingularity.net Cc: arnd@arndb.de Cc: linux-api@vger.kernel.org Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/20160729163021.F3C25D4A@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- Documentation/kernel-parameters.txt | 5 ++++ arch/x86/include/asm/pkeys.h | 1 + arch/x86/kernel/fpu/core.c | 4 +++ arch/x86/mm/pkeys.c | 38 +++++++++++++++++++++++++++++ include/linux/pkeys.h | 4 +++ 5 files changed, 52 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a4f4d693e2c1..3725976d0af5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1643,6 +1643,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. initrd= [BOOT] Specify the location of the initial ramdisk + init_pkru= [x86] Specify the default memory protection keys rights + register contents for all processes. 0x55555554 by + default (disallow access to all but pkey 0). Can + override in debugfs after boot. + inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver Format: diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index b406889de0db..34684adb6899 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -100,5 +100,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); +extern void copy_init_pkru_to_fpregs(void); #endif /*_ASM_X86_PKEYS_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3fc03a09a93b..47004010ad5d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -12,6 +12,7 @@ #include #include +#include #define CREATE_TRACE_POINTS #include @@ -505,6 +506,9 @@ static inline void copy_init_fpstate_to_fpregs(void) copy_kernel_to_fxregs(&init_fpstate.fxsave); else copy_kernel_to_fregs(&init_fpstate.fsave); + + if (boot_cpu_has(X86_FEATURE_OSPKE)) + copy_init_pkru_to_fpregs(); } /* diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index e6113bbb56e1..ddc54949078a 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -121,3 +121,41 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey */ return vma_pkey(vma); } + +#define PKRU_AD_KEY(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY)) + +/* + * Make the default PKRU value (at execve() time) as restrictive + * as possible. This ensures that any threads clone()'d early + * in the process's lifetime will not accidentally get access + * to data which is pkey-protected later on. 
+ */ +u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | + PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | + PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) | + PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) | + PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15); + +/* + * Called from the FPU code when creating a fresh set of FPU + * registers. This is called from a very specific context where + * we know the FPU regstiers are safe for use and we can use PKRU + * directly. The fact that PKRU is only available when we are + * using eagerfpu mode makes this possible. + */ +void copy_init_pkru_to_fpregs(void) +{ + u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value); + /* + * Any write to PKRU takes it out of the XSAVE 'init + * state' which increases context switch cost. Avoid + * writing 0 when PKRU was already 0. + */ + if (!init_pkru_value_snapshot && !read_pkru()) + return; + /* + * Override the PKRU state that came from 'init_fpstate' + * with the baseline from the process. + */ + write_pkru(init_pkru_value_snapshot); +} diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 8ff21125dc8a..e4c08c1ff0c5 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -35,6 +35,10 @@ static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, return 0; } +static inline void copy_init_pkru_to_fpregs(void) +{ +} + #endif /* ! CONFIG_ARCH_HAS_PKEYS */ #endif /* _LINUX_PKEYS_H */ From 76de993727d22eb29c716abacfae9d9444bb7897 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 29 Jul 2016 09:30:23 -0700 Subject: [PATCH 214/538] x86/pkeys: Allow configuration of init_pkru As discussed in the previous patch, there is a reliability benefit to allowing an init value for the Protection Keys Rights User register (PKRU) which differs from what the XSAVE hardware provides. But, having PKRU be 0 (its init value) provides some nonzero amount of optimization potential to the hardware. It can, for instance, skip writes to the XSAVE buffer when it knows that PKRU is in its init state. The cost of losing this optimization is approximately 100 cycles per context switch for a workload which lightly using XSAVE state (something not using AVX much). The overhead comes from a combinaation of actually manipulating PKRU and the overhead of pullin in an extra cacheline. This overhead is not huge, but it's also not something that I think we should unconditionally inflict on everyone. So, make it configurable both at boot-time and from debugfs. Changes to the debugfs value affect all processes created after the write to debugfs. Signed-off-by: Dave Hansen Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: mgorman@techsingularity.net Cc: arnd@arndb.de Cc: linux-api@vger.kernel.org Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/20160729163023.407672D2@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pkeys.c | 66 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index ddc54949078a..f88ce0e5efd9 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -11,6 +11,7 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. */ +#include /* debugfs_create_u32() */ #include /* mm_struct, vma, etc... 
*/ #include /* PKEY_* */ #include @@ -159,3 +160,68 @@ void copy_init_pkru_to_fpregs(void) */ write_pkru(init_pkru_value_snapshot); } + +static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + unsigned int len; + + len = sprintf(buf, "0x%x\n", init_pkru_value); + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t init_pkru_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + ssize_t len; + u32 new_init_pkru; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + + /* Make the buffer a valid string that we can not overrun */ + buf[len] = '\0'; + if (kstrtouint(buf, 0, &new_init_pkru)) + return -EINVAL; + + /* + * Don't allow insane settings that will blow the system + * up immediately if someone attempts to disable access + * or writes to pkey 0. + */ + if (new_init_pkru & (PKRU_AD_BIT|PKRU_WD_BIT)) + return -EINVAL; + + WRITE_ONCE(init_pkru_value, new_init_pkru); + return count; +} + +static const struct file_operations fops_init_pkru = { + .read = init_pkru_read_file, + .write = init_pkru_write_file, + .llseek = default_llseek, +}; + +static int __init create_init_pkru_value(void) +{ + debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_init_pkru); + return 0; +} +late_initcall(create_init_pkru_value); + +static __init int setup_init_pkru(char *opt) +{ + u32 new_init_pkru; + + if (kstrtouint(opt, 0, &new_init_pkru)) + return 1; + + WRITE_ONCE(init_pkru_value, new_init_pkru); + + return 1; +} +__setup("init_pkru=", setup_init_pkru); From 5f23f6d082a95237387f18d3fde8d472aae9659a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 29 Jul 2016 09:30:24 -0700 Subject: [PATCH 215/538] x86/pkeys: Add self-tests This code should be a good demonstration of how to use the new system calls as well as how to use protection keys in general. This code shows how to: 1. Manipulate the Protection Keys Rights User (PKRU) register 2. Set a protection key on memory 3. Fetch and/or modify PKRU from the signal XSAVE state 4. Read the kernel-provided protection key in the siginfo 5. Set up an execute-only mapping There are currently 13 tests: test_read_of_write_disabled_region test_read_of_access_disabled_region test_write_of_write_disabled_region test_write_of_access_disabled_region test_kernel_write_of_access_disabled_region test_kernel_write_of_write_disabled_region test_kernel_gup_of_access_disabled_region test_kernel_gup_write_to_write_disabled_region test_executing_on_unreadable_memory test_ptrace_of_child test_pkey_syscalls_on_non_allocated_pkey test_pkey_syscalls_bad_args test_pkey_alloc_exhaust Each of the tests is run with plain memory (via mmap(MAP_ANON)), transparent huge pages, and hugetlb. 
Signed-off-by: Dave Hansen Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: mgorman@techsingularity.net Cc: arnd@arndb.de Cc: linux-api@vger.kernel.org Cc: shuahkh@osg.samsung.com Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/20160729163024.FC5A0C2D@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- tools/testing/selftests/x86/Makefile | 3 +- tools/testing/selftests/x86/pkey-helpers.h | 219 +++ tools/testing/selftests/x86/protection_keys.c | 1410 +++++++++++++++++ 3 files changed, 1631 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/pkey-helpers.h create mode 100644 tools/testing/selftests/x86/protection_keys.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 4f747ee07f10..a89f80a5b711 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -5,7 +5,8 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \ - check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test + check_initial_reg_state sigreturn ldt_gdt iopl \ + protection_keys TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h new file mode 100644 index 000000000000..b20293956eec --- /dev/null +++ b/tools/testing/selftests/x86/pkey-helpers.h @@ -0,0 +1,219 @@ +#ifndef _PKEYS_HELPER_H +#define _PKEYS_HELPER_H +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NR_PKEYS 16 +#define PKRU_BITS_PER_PKEY 2 + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 0 +#endif +#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 +extern int dprint_in_signal; +extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; +static inline void sigsafe_printf(const char *format, ...) +{ + va_list ap; + + va_start(ap, format); + if (!dprint_in_signal) { + vprintf(format, ap); + } else { + int len = vsnprintf(dprint_in_signal_buffer, + DPRINT_IN_SIGNAL_BUF_SIZE, + format, ap); + /* + * len is amount that would have been printed, + * but actual write is truncated at BUF_SIZE. + */ + if (len > DPRINT_IN_SIGNAL_BUF_SIZE) + len = DPRINT_IN_SIGNAL_BUF_SIZE; + write(1, dprint_in_signal_buffer, len); + } + va_end(ap); +} +#define dprintf_level(level, args...) do { \ + if (level <= DEBUG_LEVEL) \ + sigsafe_printf(args); \ + fflush(NULL); \ +} while (0) +#define dprintf0(args...) dprintf_level(0, args) +#define dprintf1(args...) dprintf_level(1, args) +#define dprintf2(args...) dprintf_level(2, args) +#define dprintf3(args...) dprintf_level(3, args) +#define dprintf4(args...) 
dprintf_level(4, args) + +extern unsigned int shadow_pkru; +static inline unsigned int __rdpkru(void) +{ + unsigned int eax, edx; + unsigned int ecx = 0; + unsigned int pkru; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkru = eax; + return pkru; +} + +static inline unsigned int _rdpkru(int line) +{ + unsigned int pkru = __rdpkru(); + + dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n", + line, pkru, shadow_pkru); + assert(pkru == shadow_pkru); + + return pkru; +} + +#define rdpkru() _rdpkru(__LINE__) + +static inline void __wrpkru(unsigned int pkru) +{ + unsigned int eax = pkru; + unsigned int ecx = 0; + unsigned int edx = 0; + + dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkru == __rdpkru()); +} + +static inline void wrpkru(unsigned int pkru) +{ + dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); + /* will do the shadow check for us: */ + rdpkru(); + __wrpkru(pkru); + shadow_pkru = pkru; + dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru()); +} + +/* + * These are technically racy. since something could + * change PKRU between the read and the write. + */ +static inline void __pkey_access_allow(int pkey, int do_allow) +{ + unsigned int pkru = rdpkru(); + int bit = pkey * 2; + + if (do_allow) + pkru &= (1<mmap (see exit_mmap()), so make sure it is immune to pkeys + * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel + * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks + * + * Compile like this: + * gcc -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm + * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pkey-helpers.h" + +int iteration_nr = 1; +int test_nr; + +unsigned int shadow_pkru; + +#define HPAGE_SIZE (1UL<<21) +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) +#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) +#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) +#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) + +#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) + +int dprint_in_signal; +char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +extern void abort_hooks(void); +#define pkey_assert(condition) do { \ + if (!(condition)) { \ + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("errno at assert: %d", errno); \ + abort_hooks(); \ + assert(condition); \ + } \ +} while (0) +#define raw_assert(cond) assert(cond) + +void cat_into_file(char *str, char *file) +{ + int fd = open(file, O_RDWR); + int ret; + + dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); + /* + * these need to be raw because they are called under + * pkey_assert() + */ + raw_assert(fd >= 0); + ret = write(fd, str, strlen(str)); + if (ret != strlen(str)) { + perror("write to file failed"); + fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); + raw_assert(0); + } + close(fd); +} + +#if CONTROL_TRACING > 0 +static int warned_tracing; +int tracing_root_ok(void) +{ + if (geteuid() != 0) { + if (!warned_tracing) + fprintf(stderr, "WARNING: not run as root, " + "can not do tracing control\n"); + warned_tracing = 1; + return 0; + } + return 1; +} +#endif + +void tracing_on(void) +{ +#if CONTROL_TRACING > 0 +#define TRACEDIR "/sys/kernel/debug/tracing" + char pidstr[32]; + + if (!tracing_root_ok()) + return; + + sprintf(pidstr, "%d", getpid()); + cat_into_file("0", TRACEDIR "/tracing_on"); + cat_into_file("\n", TRACEDIR "/trace"); + if (1) { + cat_into_file("function_graph", TRACEDIR "/current_tracer"); + cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); + } else { + cat_into_file("nop", TRACEDIR "/current_tracer"); + } + cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); + cat_into_file("1", TRACEDIR "/tracing_on"); + dprintf1("enabled tracing\n"); +#endif +} + +void tracing_off(void) +{ +#if CONTROL_TRACING > 0 + if (!tracing_root_ok()) + return; + cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on"); +#endif +} + +void abort_hooks(void) +{ + fprintf(stderr, "running %s()...\n", __func__); + tracing_off(); +#ifdef SLEEP_ON_ABORT + sleep(SLEEP_ON_ABORT); +#endif +} + +static inline void __page_o_noops(void) +{ + /* 8-bytes of instruction * 512 bytes = 1 page */ + asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); +} + +/* + * This attempts to have roughly a page of instructions followed by a few + * instructions that do a write, and another page of instructions. That + * way, we are pretty sure that the write is in the second page of + * instructions and has at least a page of padding behind it. + * + * *That* lets us be sure to madvise() away the write instruction, which + * will then fault, which makes sure that the fault code handles + * execute-only memory properly. 
+ */ +__attribute__((__aligned__(PAGE_SIZE))) +void lots_o_noops_around_write(int *write_to_me) +{ + dprintf3("running %s()\n", __func__); + __page_o_noops(); + /* Assume this happens in the second page of instructions: */ + *write_to_me = __LINE__; + /* pad out by another page: */ + __page_o_noops(); + dprintf3("%s() done\n", __func__); +} + +/* Define some kernel-like types */ +#define u8 uint8_t +#define u16 uint16_t +#define u32 uint32_t +#define u64 uint64_t + +#ifdef __i386__ +#define SYS_mprotect_key 380 +#define SYS_pkey_alloc 381 +#define SYS_pkey_free 382 +#define REG_IP_IDX REG_EIP +#define si_pkey_offset 0x18 +#else +#define SYS_mprotect_key 329 +#define SYS_pkey_alloc 330 +#define SYS_pkey_free 331 +#define REG_IP_IDX REG_RIP +#define si_pkey_offset 0x20 +#endif + +void dump_mem(void *dumpme, int len_bytes) +{ + char *c = (void *)dumpme; + int i; + + for (i = 0; i < len_bytes; i += sizeof(u64)) { + u64 *ptr = (u64 *)(c + i); + dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr); + } +} + +#define __SI_FAULT (3 << 16) +#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */ +#define SEGV_PKUERR (__SI_FAULT|4) + +static char *si_code_str(int si_code) +{ + if (si_code & SEGV_MAPERR) + return "SEGV_MAPERR"; + if (si_code & SEGV_ACCERR) + return "SEGV_ACCERR"; + if (si_code & SEGV_BNDERR) + return "SEGV_BNDERR"; + if (si_code & SEGV_PKUERR) + return "SEGV_PKUERR"; + return "UNKNOWN"; +} + +int pkru_faults; +int last_si_pkey = -1; +void signal_handler(int signum, siginfo_t *si, void *vucontext) +{ + ucontext_t *uctxt = vucontext; + int trapno; + unsigned long ip; + char *fpregs; + u32 *pkru_ptr; + u64 si_pkey; + u32 *si_pkey_ptr; + int pkru_offset; + fpregset_t fpregset; + + dprint_in_signal = 1; + dprintf1(">>>>===============SIGSEGV============================\n"); + dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__, + __rdpkru(), shadow_pkru); + + trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; + ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + fpregset = uctxt->uc_mcontext.fpregs; + fpregs = (void *)fpregset; + + dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__, + trapno, ip, si_code_str(si->si_code), si->si_code); +#ifdef __i386__ + /* + * 32-bit has some extra padding so that userspace can tell whether + * the XSTATE header is present in addition to the "legacy" FPU + * state. We just assume that it is here. + */ + fpregs += 0x70; +#endif + pkru_offset = pkru_xstate_offset(); + pkru_ptr = (void *)(&fpregs[pkru_offset]); + + dprintf1("siginfo: %p\n", si); + dprintf1(" fpregs: %p\n", fpregs); + /* + * If we got a PKRU fault, we *HAVE* to have at least one bit set in + * here. 
+ */ + dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset()); + if (DEBUG_LEVEL > 4) + dump_mem(pkru_ptr - 128, 256); + pkey_assert(*pkru_ptr); + + si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset); + dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); + dump_mem(si_pkey_ptr - 8, 24); + si_pkey = *si_pkey_ptr; + pkey_assert(si_pkey < NR_PKEYS); + last_si_pkey = si_pkey; + + if ((si->si_code == SEGV_MAPERR) || + (si->si_code == SEGV_ACCERR) || + (si->si_code == SEGV_BNDERR)) { + printf("non-PK si_code, exiting...\n"); + exit(4); + } + + dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr); + /* need __rdpkru() version so we do not do shadow_pkru checking */ + dprintf1("signal pkru from pkru: %08x\n", __rdpkru()); + dprintf1("si_pkey from siginfo: %jx\n", si_pkey); + *(u64 *)pkru_ptr = 0x00000000; + dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n"); + pkru_faults++; + dprintf1("<<<<==================================================\n"); + return; + if (trapno == 14) { + fprintf(stderr, + "ERROR: In signal handler, page fault, trapno = %d, ip = %016lx\n", + trapno, ip); + fprintf(stderr, "si_addr %p\n", si->si_addr); + fprintf(stderr, "REG_ERR: %lx\n", + (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]); + exit(1); + } else { + fprintf(stderr, "unexpected trap %d! at 0x%lx\n", trapno, ip); + fprintf(stderr, "si_addr %p\n", si->si_addr); + fprintf(stderr, "REG_ERR: %lx\n", + (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]); + exit(2); + } + dprint_in_signal = 0; +} + +int wait_all_children(void) +{ + int status; + return waitpid(-1, &status, 0); +} + +void sig_chld(int x) +{ + dprint_in_signal = 1; + dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); + dprint_in_signal = 0; +} + +void setup_sigsegv_handler(void) +{ + int r, rs; + struct sigaction newact; + struct sigaction oldact; + + /* #PF is mapped to sigsegv */ + int signum = SIGSEGV; + + newact.sa_handler = 0; + newact.sa_sigaction = signal_handler; + + /*sigset_t - signals to block while in the handler */ + /* get the old signal mask. 
*/ + rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); + pkey_assert(rs == 0); + + /* call sa_sigaction, not sa_handler*/ + newact.sa_flags = SA_SIGINFO; + + newact.sa_restorer = 0; /* void(*)(), obsolete */ + r = sigaction(signum, &newact, &oldact); + r = sigaction(SIGALRM, &newact, &oldact); + pkey_assert(r == 0); +} + +void setup_handlers(void) +{ + signal(SIGCHLD, &sig_chld); + setup_sigsegv_handler(); +} + +pid_t fork_lazy_child(void) +{ + pid_t forkret; + + forkret = fork(); + pkey_assert(forkret >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); + + if (!forkret) { + /* in the child */ + while (1) { + dprintf1("child sleeping...\n"); + sleep(30); + } + } + return forkret; +} + +void davecmp(void *_a, void *_b, int len) +{ + int i; + unsigned long *a = _a; + unsigned long *b = _b; + + for (i = 0; i < len / sizeof(*a); i++) { + if (a[i] == b[i]) + continue; + + dprintf3("[%3d]: a: %016lx b: %016lx\n", i, a[i], b[i]); + } +} + +void dumpit(char *f) +{ + int fd = open(f, O_RDONLY); + char buf[100]; + int nr_read; + + dprintf2("maps fd: %d\n", fd); + do { + nr_read = read(fd, &buf[0], sizeof(buf)); + write(1, buf, nr_read); + } while (nr_read > 0); + close(fd); +} + +#define PKEY_DISABLE_ACCESS 0x1 +#define PKEY_DISABLE_WRITE 0x2 + +u32 pkey_get(int pkey, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u32 pkru = __rdpkru(); + u32 shifted_pkru; + u32 masked_pkru; + + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkru: %x\n", __func__, pkru); + + shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY)); + dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru); + masked_pkru = shifted_pkru & mask; + dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru); + /* + * shift down the relevant bits to the lowest two, then + * mask off all the other high bits. 
+ */ + return masked_pkru; +} + +int pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u32 old_pkru = __rdpkru(); + u32 new_pkru; + + /* make sure that 'rights' only contains the bits we expect: */ + assert(!(rights & ~mask)); + + /* copy old pkru */ + new_pkru = old_pkru; + /* mask out bits from pkey in old value: */ + new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY)); + /* OR in new bits for pkey: */ + new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY)); + + __wrpkru(new_pkru); + + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n", + __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru); + return 0; +} + +void pkey_disable_set(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights; + u32 orig_pkru; + + dprintf1("START->%s(%d, 0x%x)\n", __func__, + pkey, flags); + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + pkey_rights = pkey_get(pkey, syscall_flags); + + dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = pkey_set(pkey, pkey_rights, syscall_flags); + assert(!ret); + /*pkru and flags have the same format */ + shadow_pkru |= flags << (pkey * 2); + dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru); + + pkey_assert(ret >= 0); + + pkey_rights = pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); + if (flags) + pkey_assert(rdpkru() > orig_pkru); + dprintf1("END<---%s(%d, 0x%x)\n", __func__, + pkey, flags); +} + +void pkey_disable_clear(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights = pkey_get(pkey, syscall_flags); + u32 orig_pkru = rdpkru(); + + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = pkey_set(pkey, pkey_rights, 0); + /* pkru and flags have the same format */ + shadow_pkru &= ~(flags << (pkey * 2)); + pkey_assert(ret >= 0); + + pkey_rights = pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); + if (flags) + assert(rdpkru() > orig_pkru); +} + +void pkey_write_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); +} +void pkey_write_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); +} +void pkey_access_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); +} +void pkey_access_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); +} + +int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int sret; + + dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, + ptr, size, orig_prot, pkey); + + errno = 0; + sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); + if (errno) { + dprintf2("SYS_mprotect_key sret: %d\n", sret); + dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); + dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); + if (DEBUG_LEVEL >= 2) + perror("SYS_mprotect_pkey"); + } + return sret; +} + +int sys_pkey_alloc(unsigned long flags, unsigned long init_val) +{ + int ret = syscall(SYS_pkey_alloc, flags, init_val); + dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: 
%d errno: %d\n", + __func__, flags, init_val, ret, errno); + return ret; +} + +int alloc_pkey(void) +{ + int ret; + unsigned long init_val = 0x0; + + dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n", + __LINE__, __rdpkru(), shadow_pkru); + ret = sys_pkey_alloc(0, init_val); + /* + * pkey_alloc() sets PKRU, so we need to reflect it in + * shadow_pkru: + */ + dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", + __LINE__, ret, __rdpkru(), shadow_pkru); + if (ret) { + /* clear both the bits: */ + shadow_pkru &= ~(0x3 << (ret * 2)); + dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", + __LINE__, ret, __rdpkru(), shadow_pkru); + /* + * move the new state in from init_val + * (remember, we cheated and init_val == pkru format) + */ + shadow_pkru |= (init_val << (ret * 2)); + } + dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", + __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno); + /* for shadow checking: */ + rdpkru(); + dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", + __LINE__, ret, __rdpkru(), shadow_pkru); + return ret; +} + +int sys_pkey_free(unsigned long pkey) +{ + int ret = syscall(SYS_pkey_free, pkey); + dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); + return ret; +} + +/* + * I had a bug where pkey bits could be set by mprotect() but + * not cleared. This ensures we get lots of random bit sets + * and clears on the vma and pte pkey bits. + */ +int alloc_random_pkey(void) +{ + int max_nr_pkey_allocs; + int ret; + int i; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + int random_index; + memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + + /* allocate every possible key and make a note of which ones we got */ + max_nr_pkey_allocs = NR_PKEYS; + max_nr_pkey_allocs = 1; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + + pkey_assert(nr_alloced > 0); + /* select a random one out of the allocated ones */ + random_index = rand() % nr_alloced; + ret = alloced_pkeys[random_index]; + /* now zero it out so we don't free it next */ + alloced_pkeys[random_index] = 0; + + /* go through the allocated ones that we did not want and free them */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, + __LINE__, ret, __rdpkru(), shadow_pkru); + return ret; +} + +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int nr_iterations = random() % 100; + int ret; + + while (0) { + int rpkey = alloc_random_pkey(); + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); + dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", + ptr, size, orig_prot, pkey, ret); + if (nr_iterations-- < 0) + break; + + dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, + __LINE__, ret, __rdpkru(), shadow_pkru); + sys_pkey_free(rpkey); + dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, + __LINE__, ret, __rdpkru(), shadow_pkru); + } + pkey_assert(pkey < NR_PKEYS); + + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); + dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", + ptr, size, orig_prot, pkey, ret); + pkey_assert(!ret); + dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, + __LINE__, ret, 
__rdpkru(), shadow_pkru); + return ret; +} + +struct pkey_malloc_record { + void *ptr; + long size; +}; +struct pkey_malloc_record *pkey_malloc_records; +long nr_pkey_malloc_records; +void record_pkey_malloc(void *ptr, long size) +{ + long i; + struct pkey_malloc_record *rec = NULL; + + for (i = 0; i < nr_pkey_malloc_records; i++) { + rec = &pkey_malloc_records[i]; + /* find a free record */ + if (rec) + break; + } + if (!rec) { + /* every record is full */ + size_t old_nr_records = nr_pkey_malloc_records; + size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); + size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); + dprintf2("new_nr_records: %zd\n", new_nr_records); + dprintf2("new_size: %zd\n", new_size); + pkey_malloc_records = realloc(pkey_malloc_records, new_size); + pkey_assert(pkey_malloc_records != NULL); + rec = &pkey_malloc_records[nr_pkey_malloc_records]; + /* + * realloc() does not initialize memory, so zero it from + * the first new record all the way to the end. + */ + for (i = 0; i < new_nr_records - old_nr_records; i++) + memset(rec + i, 0, sizeof(*rec)); + } + dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", + (int)(rec - pkey_malloc_records), rec, ptr, size); + rec->ptr = ptr; + rec->size = size; + nr_pkey_malloc_records++; +} + +void free_pkey_malloc(void *ptr) +{ + long i; + int ret; + dprintf3("%s(%p)\n", __func__, ptr); + for (i = 0; i < nr_pkey_malloc_records; i++) { + struct pkey_malloc_record *rec = &pkey_malloc_records[i]; + dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", + ptr, i, rec, rec->ptr, rec->size); + if ((ptr < rec->ptr) || + (ptr >= rec->ptr + rec->size)) + continue; + + dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", + ptr, i, rec, rec->ptr, rec->size); + nr_pkey_malloc_records--; + ret = munmap(rec->ptr, rec->size); + dprintf3("munmap ret: %d\n", ret); + pkey_assert(!ret); + dprintf3("clearing rec->ptr, rec: %p\n", rec); + rec->ptr = NULL; + dprintf3("done clearing rec->ptr, rec: %p\n", rec); + return; + } + pkey_assert(false); +} + + +void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + rdpkru(); + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size); + rdpkru(); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) +{ + int ret; + void *ptr; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + /* + * Guarantee we can fit at least one huge page in the resulting + * allocation by allocating space for 2: + */ + size = ALIGN_UP(size, HPAGE_SIZE * 2); + ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + record_pkey_malloc(ptr, size); + mprotect_pkey(ptr, size, prot, pkey); + + dprintf1("unaligned ptr: %p\n", ptr); + ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); + dprintf1(" aligned ptr: %p\n", ptr); + ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); + dprintf1("MADV_HUGEPAGE ret: %d\n", ret); + ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); + dprintf1("MADV_WILLNEED ret: %d\n", ret); + memset(ptr, 0, HPAGE_SIZE); + + dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); + return ptr; +} + +int hugetlb_setup_ok; +#define 
GET_NR_HUGE_PAGES 10 +void setup_hugetlbfs(void) +{ + int err; + int fd; + int validated_nr_pages; + int i; + char buf[] = "123"; + + if (geteuid() != 0) { + fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); + return; + } + + cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); + + /* + * Now go make sure that we got the pages and that they + * are 2M pages. Someone might have made 1G the default. + */ + fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY); + if (fd < 0) { + perror("opening sysfs 2M hugetlb config"); + return; + } + + /* -1 to guarantee leaving the trailing \0 */ + err = read(fd, buf, sizeof(buf)-1); + close(fd); + if (err <= 0) { + perror("reading sysfs 2M hugetlb config"); + return; + } + + if (atoi(buf) != GET_NR_HUGE_PAGES) { + fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n", + buf, GET_NR_HUGE_PAGES); + return; + } + + hugetlb_setup_ok = 1; +} + +void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) +{ + void *ptr; + int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; + + if (!hugetlb_setup_ok) + return PTR_ERR_ENOTSUP; + + dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); + size = ALIGN_UP(size, HPAGE_SIZE * 2); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); + pkey_assert(ptr != (void *)-1); + mprotect_pkey(ptr, size, prot, pkey); + + record_pkey_malloc(ptr, size); + + dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); + return ptr; +} + +void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) +{ + void *ptr; + int fd; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + fd = open("/dax/foo", O_RDWR); + pkey_assert(fd >= 0); + + ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); + pkey_assert(ptr != (void *)-1); + + mprotect_pkey(ptr, size, prot, pkey); + + record_pkey_malloc(ptr, size); + + dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); + close(fd); + return ptr; +} + +void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { + + malloc_pkey_with_mprotect, + malloc_pkey_anon_huge, + malloc_pkey_hugetlb +/* can not do direct with the pkey_mprotect() API: + malloc_pkey_mmap_direct, + malloc_pkey_mmap_dax, +*/ +}; + +void *malloc_pkey(long size, int prot, u16 pkey) +{ + void *ret; + static int malloc_type; + int nr_malloc_types = ARRAY_SIZE(pkey_malloc); + + pkey_assert(pkey < NR_PKEYS); + + while (1) { + pkey_assert(malloc_type < nr_malloc_types); + + ret = pkey_malloc[malloc_type](size, prot, pkey); + pkey_assert(ret != (void *)-1); + + malloc_type++; + if (malloc_type >= nr_malloc_types) + malloc_type = (random()%nr_malloc_types); + + /* try again if the malloc_type we tried is unsupported */ + if (ret == PTR_ERR_ENOTSUP) + continue; + + break; + } + + dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, + size, prot, pkey, ret); + return ret; +} + +int last_pkru_faults; +void expected_pk_fault(int pkey) +{ + dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n", + __func__, last_pkru_faults, pkru_faults); + dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); + pkey_assert(last_pkru_faults + 1 == pkru_faults); + pkey_assert(last_si_pkey == pkey); + /* + * The signal handler shold have cleared out PKRU to let the + * test program continue. We now have to restore it. 
+ */ + if (__rdpkru() != 0) + pkey_assert(0); + + __wrpkru(shadow_pkru); + dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n", + __func__, shadow_pkru); + last_pkru_faults = pkru_faults; + last_si_pkey = -1; +} + +void do_not_expect_pk_fault(void) +{ + pkey_assert(last_pkru_faults == pkru_faults); +} + +int test_fds[10] = { -1 }; +int nr_test_fds; +void __save_test_fd(int fd) +{ + pkey_assert(fd >= 0); + pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); + test_fds[nr_test_fds] = fd; + nr_test_fds++; +} + +int get_test_read_fd(void) +{ + int test_fd = open("/etc/passwd", O_RDONLY); + __save_test_fd(test_fd); + return test_fd; +} + +void close_test_fds(void) +{ + int i; + + for (i = 0; i < nr_test_fds; i++) { + if (test_fds[i] < 0) + continue; + close(test_fds[i]); + test_fds[i] = -1; + } + nr_test_fds = 0; +} + +#define barrier() __asm__ __volatile__("": : :"memory") +__attribute__((noinline)) int read_ptr(int *ptr) +{ + /* + * Keep GCC from optimizing this away somehow + */ + barrier(); + return *ptr; +} + +void test_read_of_write_disabled_region(int *ptr, u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling write access to PKEY[1], doing read\n"); + pkey_write_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + dprintf1("\n"); +} +void test_read_of_access_disabled_region(int *ptr, u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); + rdpkru(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pk_fault(pkey); +} +void test_write_of_write_disabled_region(int *ptr, u16 pkey) +{ + dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pk_fault(pkey); +} +void test_write_of_access_disabled_region(int *ptr, u16 pkey) +{ + dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pk_fault(pkey); +} +void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) +{ + int ret; + int test_fd = get_test_read_fd(); + + dprintf1("disabling access to PKEY[%02d], " + "having kernel read() to buffer\n", pkey); + pkey_access_deny(pkey); + ret = read(test_fd, ptr, 1); + dprintf1("read ret: %d\n", ret); + pkey_assert(ret); +} +void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) +{ + int ret; + int test_fd = get_test_read_fd(); + + pkey_write_deny(pkey); + ret = read(test_fd, ptr, 100); + dprintf1("read ret: %d\n", ret); + if (ret < 0 && (DEBUG_LEVEL > 0)) + perror("verbose read result (OK for this to be bad)"); + pkey_assert(ret); +} + +void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) +{ + int pipe_ret, vmsplice_ret; + struct iovec iov; + int pipe_fds[2]; + + pipe_ret = pipe(pipe_fds); + + pkey_assert(pipe_ret == 0); + dprintf1("disabling access to PKEY[%02d], " + "having kernel vmsplice from buffer\n", pkey); + pkey_access_deny(pkey); + iov.iov_base = ptr; + iov.iov_len = PAGE_SIZE; + vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); + dprintf1("vmsplice() ret: %d\n", vmsplice_ret); + pkey_assert(vmsplice_ret == -1); + + close(pipe_fds[0]); + close(pipe_fds[1]); +} + +void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) +{ + int ignored = 0xdada; + int futex_ret; + int some_int = __LINE__; + + dprintf1("disabling write to PKEY[%02d], " + "doing futex gunk in buffer\n", pkey); + *ptr = some_int; + pkey_write_deny(pkey); + futex_ret = 
syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, + &ignored, ignored); + if (DEBUG_LEVEL > 0) + perror("futex"); + dprintf1("futex() ret: %d\n", futex_ret); +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) +{ + int err; + int i; + + /* Note: 0 is the default pkey, so don't mess with it */ + for (i = 1; i < NR_PKEYS; i++) { + if (pkey == i) + continue; + + dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); + err = sys_pkey_free(i); + pkey_assert(err); + + /* not enforced when pkey_get() is not a syscall + err = pkey_get(i, 0); + pkey_assert(err < 0); + */ + + err = sys_pkey_free(i); + pkey_assert(err); + + err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); + pkey_assert(err); + } +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) +{ + int err; + int bad_flag = (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + 1; + int bad_pkey = NR_PKEYS+99; + + /* not enforced when pkey_get() is not a syscall + err = pkey_get(bad_pkey, bad_flag); + pkey_assert(err < 0); + */ + + /* pass a known-invalid pkey in: */ + err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); + pkey_assert(err); +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_alloc_exhaust(int *ptr, u16 pkey) +{ + unsigned long flags; + unsigned long init_val; + int err; + int allocated_pkeys[NR_PKEYS] = {0}; + int nr_allocated_pkeys = 0; + int i; + + for (i = 0; i < NR_PKEYS*2; i++) { + int new_pkey; + dprintf1("%s() alloc loop: %d\n", __func__, i); + new_pkey = alloc_pkey(); + dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__, + __LINE__, err, __rdpkru(), shadow_pkru); + rdpkru(); /* for shadow checking */ + dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); + if ((new_pkey == -1) && (errno == ENOSPC)) { + dprintf2("%s() failed to allocate pkey after %d tries\n", + __func__, nr_allocated_pkeys); + break; + } + pkey_assert(nr_allocated_pkeys < NR_PKEYS); + allocated_pkeys[nr_allocated_pkeys++] = new_pkey; + } + + dprintf3("%s()::%d\n", __func__, __LINE__); + + /* + * ensure it did not reach the end of the loop without + * failure: + */ + pkey_assert(i < NR_PKEYS*2); + + /* + * There are 16 pkeys supported in hardware. One is taken + * up for the default (0) and another can be taken up by + * an execute-only mapping. Ensure that we can allocate + * at least 14 (16-2). + */ + pkey_assert(i >= NR_PKEYS-2); + + for (i = 0; i < nr_allocated_pkeys; i++) { + err = sys_pkey_free(allocated_pkeys[i]); + pkey_assert(!err); + rdpkru(); /* for shadow checking */ + } +} + +void test_ptrace_of_child(int *ptr, u16 pkey) +{ + __attribute__((__unused__)) int peek_result; + pid_t child_pid; + void *ignored = 0; + long ret; + int status; + /* + * This is the "control" for our little expermient. Make sure + * we can always access it when ptracing. + */ + int *plain_ptr_unaligned = malloc(HPAGE_SIZE); + int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); + + /* + * Fork a child which is an exact copy of this process, of course. + * That means we can do all of our tests via ptrace() and then plain + * memory access and ensure they work differently. 
+ */ + child_pid = fork_lazy_child(); + dprintf1("[%d] child pid: %d\n", getpid(), child_pid); + + ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); + if (ret) + perror("attach"); + dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); + pkey_assert(ret != -1); + ret = waitpid(child_pid, &status, WUNTRACED); + if ((ret != child_pid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitpid result %ld stat %x\n", + ret, status); + pkey_assert(0); + } + dprintf2("waitpid ret: %ld\n", ret); + dprintf2("waitpid status: %d\n", status); + + pkey_access_deny(pkey); + pkey_write_deny(pkey); + + /* Write access, untested for now: + ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); + pkey_assert(ret != -1); + dprintf1("poke at %p: %ld\n", peek_at, ret); + */ + + /* + * Try to access the pkey-protected "ptr" via ptrace: + */ + ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); + /* expect it to work, without an error: */ + pkey_assert(ret != -1); + /* Now access from the current task, and expect an exception: */ + peek_result = read_ptr(ptr); + expected_pk_fault(pkey); + + /* + * Try to access the NON-pkey-protected "plain_ptr" via ptrace: + */ + ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); + /* expect it to work, without an error: */ + pkey_assert(ret != -1); + /* Now access from the current task, and expect NO exception: */ + peek_result = read_ptr(plain_ptr); + do_not_expect_pk_fault(); + + ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); + pkey_assert(ret != -1); + + ret = kill(child_pid, SIGKILL); + pkey_assert(ret != -1); + + wait(&status); + + free(plain_ptr_unaligned); +} + +void test_executing_on_unreadable_memory(int *ptr, u16 pkey) +{ + void *p1; + int scratch; + int ptr_contents; + int ret; + + p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); + dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); + /* lots_o_noops_around_write should be page-aligned already */ + assert(p1 == &lots_o_noops_around_write); + + /* Point 'p1' at the *second* page of the function: */ + p1 += PAGE_SIZE; + + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + lots_o_noops_around_write(&scratch); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); + pkey_assert(!ret); + pkey_access_deny(pkey); + + dprintf2("pkru: %x\n", rdpkru()); + + /* + * Make sure this is an *instruction* fault + */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + lots_o_noops_around_write(&scratch); + do_not_expect_pk_fault(); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pk_fault(pkey); +} + +void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) +{ + int size = PAGE_SIZE; + int sret; + + if (cpu_has_pku()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return; + } + + sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); + pkey_assert(sret < 0); +} + +void (*pkey_tests[])(int *ptr, u16 pkey) = { + test_read_of_write_disabled_region, + test_read_of_access_disabled_region, + test_write_of_write_disabled_region, + test_write_of_access_disabled_region, + test_kernel_write_of_access_disabled_region, + test_kernel_write_of_write_disabled_region, + test_kernel_gup_of_access_disabled_region, + test_kernel_gup_write_to_write_disabled_region, + test_executing_on_unreadable_memory, + test_ptrace_of_child, + test_pkey_syscalls_on_non_allocated_pkey, + test_pkey_syscalls_bad_args, + 
test_pkey_alloc_exhaust, +}; + +void run_tests_once(void) +{ + int *ptr; + int prot = PROT_READ|PROT_WRITE; + + for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { + int pkey; + int orig_pkru_faults = pkru_faults; + + dprintf1("======================\n"); + dprintf1("test %d preparing...\n", test_nr); + + tracing_on(); + pkey = alloc_random_pkey(); + dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); + ptr = malloc_pkey(PAGE_SIZE, prot, pkey); + dprintf1("test %d starting...\n", test_nr); + pkey_tests[test_nr](ptr, pkey); + dprintf1("freeing test memory: %p\n", ptr); + free_pkey_malloc(ptr); + sys_pkey_free(pkey); + + dprintf1("pkru_faults: %d\n", pkru_faults); + dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults); + + tracing_off(); + close_test_fds(); + + printf("test %2d PASSED (itertation %d)\n", test_nr, iteration_nr); + dprintf1("======================\n\n"); + } + iteration_nr++; +} + +void pkey_setup_shadow(void) +{ + shadow_pkru = __rdpkru(); +} + +int main(void) +{ + int nr_iterations = 22; + + setup_handlers(); + + printf("has pku: %d\n", cpu_has_pku()); + + if (!cpu_has_pku()) { + int size = PAGE_SIZE; + int *ptr; + + printf("running PKEY tests for unsupported CPU/OS\n"); + + ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + assert(ptr != (void *)-1); + test_mprotect_pkey_on_unsupported_cpu(ptr, 1); + exit(0); + } + + pkey_setup_shadow(); + printf("startup pkru: %x\n", rdpkru()); + setup_hugetlbfs(); + + while (nr_iterations-- > 0) + run_tests_once(); + + printf("done (all tests OK)\n"); + return 0; +} From 70164742783c371516199271d923731afc40e25e Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 21 Jul 2016 23:13:51 +0930 Subject: [PATCH 216/538] clocksource/drivers/moxart: Refactor enable/disable This patch abstracts the enable and disable register writes into their own functions in preparation for future changes to use SoC specific values for the writes. 
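[ Illustration, not part of the patch: a minimal, self-contained userspace
  sketch of the refactoring pattern described above. The register name and
  bit values are placeholders, not the driver's real ones; the point is that
  callers express intent (enable/disable) while a single pair of helpers owns
  the magic control value, which a later change can make SoC-specific. ]

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t timer_cr;                     /* stand-in for the mapped TIMER_CR register */
    static const uint32_t t1_enable_val  = 0x9;   /* hypothetical enable bits */
    static const uint32_t t1_disable_val = 0x8;   /* hypothetical disable bits */

    /* Only these two helpers know which value to write. */
    static inline void timer_enable(void)  { timer_cr = t1_enable_val; }
    static inline void timer_disable(void) { timer_cr = t1_disable_val; }

    /* Callers no longer hard-code control-register values. */
    static int timer_set_oneshot(void)
    {
            timer_disable();
            /* ... program the reload register here ... */
            return 0;
    }

    int main(void)
    {
            timer_set_oneshot();
            timer_enable();
            printf("control register is now 0x%x\n", timer_cr);
            return 0;
    }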
Signed-off-by: Joel Stanley Signed-off-by: Daniel Lezcano --- drivers/clocksource/moxart_timer.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/clocksource/moxart_timer.c b/drivers/clocksource/moxart_timer.c index 841454417acd..a3aaa5658a49 100644 --- a/drivers/clocksource/moxart_timer.c +++ b/drivers/clocksource/moxart_timer.c @@ -58,15 +58,25 @@ static void __iomem *base; static unsigned int clock_count_per_tick; -static int moxart_shutdown(struct clock_event_device *evt) +static inline void moxart_disable(struct clock_event_device *evt) { writel(TIMER1_DISABLE, base + TIMER_CR); +} + +static inline void moxart_enable(struct clock_event_device *evt) +{ + writel(TIMER1_ENABLE, base + TIMER_CR); +} + +static int moxart_shutdown(struct clock_event_device *evt) +{ + moxart_disable(evt); return 0; } static int moxart_set_oneshot(struct clock_event_device *evt) { - writel(TIMER1_DISABLE, base + TIMER_CR); + moxart_disable(evt); writel(~0, base + TIMER1_BASE + REG_LOAD); return 0; } @@ -74,21 +84,21 @@ static int moxart_set_oneshot(struct clock_event_device *evt) static int moxart_set_periodic(struct clock_event_device *evt) { writel(clock_count_per_tick, base + TIMER1_BASE + REG_LOAD); - writel(TIMER1_ENABLE, base + TIMER_CR); + moxart_enable(evt); return 0; } static int moxart_clkevt_next_event(unsigned long cycles, - struct clock_event_device *unused) + struct clock_event_device *evt) { u32 u; - writel(TIMER1_DISABLE, base + TIMER_CR); + moxart_disable(evt); u = readl(base + TIMER1_BASE + REG_COUNT) - cycles; writel(u, base + TIMER1_BASE + REG_MATCH1); - writel(TIMER1_ENABLE, base + TIMER_CR); + moxart_enable(evt); return 0; } From 82fdd070873f7ac9b3e37b3d4523b4ae27d02e50 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 21 Jul 2016 23:13:52 +0930 Subject: [PATCH 217/538] clocksource/drivers/moxart: Use struct to hold state Add a struct moxart_timer to hold the driver state, including the irqaction and struct clock_event_device. Most importantly this holds values for enabling and disabling the timer, so future support can be added for devices that use different bits for enable/disable. In preparation for future hardware support we add a MOXART prefix to the existing values. 
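[ Illustration, not part of the patch: a minimal, self-contained userspace
  sketch of the container_of() idiom that the new state structure relies on.
  The type names, fields and values below are placeholders, not the driver's
  real ones. ]

    #include <stddef.h>
    #include <stdio.h>

    /* Userspace stand-in for the kernel's container_of() helper. */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct clock_event_device { const char *name; };   /* simplified stand-in */

    struct my_timer {
            unsigned int t1_enable_val;         /* per-device values instead of globals */
            unsigned int t1_disable_val;
            struct clock_event_device clkevt;   /* embedded, not a pointer */
    };

    /* Callbacks receive only &timer->clkevt and recover the full state from it. */
    static void my_timer_disable(struct clock_event_device *evt)
    {
            struct my_timer *timer = container_of(evt, struct my_timer, clkevt);

            printf("%s: writing 0x%x to the control register\n",
                   evt->name, timer->t1_disable_val);
    }

    int main(void)
    {
            struct my_timer t = {
                    .t1_enable_val  = 0x9,
                    .t1_disable_val = 0x8,
                    .clkevt         = { .name = "demo-timer" },
            };

            my_timer_disable(&t.clkevt);
            return 0;
    }

  Because the enable/disable values live next to the clock_event_device they
  control, one init path can fill them in per SoC variant without any global
  state.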
Signed-off-by: Joel Stanley Signed-off-by: Daniel Lezcano --- drivers/clocksource/moxart_timer.c | 147 +++++++++++++++++------------ 1 file changed, 86 insertions(+), 61 deletions(-) diff --git a/drivers/clocksource/moxart_timer.c b/drivers/clocksource/moxart_timer.c index a3aaa5658a49..cb0b34786a8e 100644 --- a/drivers/clocksource/moxart_timer.c +++ b/drivers/clocksource/moxart_timer.c @@ -21,6 +21,7 @@ #include #include #include +#include #define TIMER1_BASE 0x00 #define TIMER2_BASE 0x10 @@ -36,36 +37,51 @@ #define TIMER_INTR_MASK 0x38 /* - * TIMER_CR flags: + * Moxart TIMER_CR flags: * - * TIMEREG_CR_*_CLOCK 0: PCLK, 1: EXT1CLK - * TIMEREG_CR_*_INT overflow interrupt enable bit + * MOXART_CR_*_CLOCK 0: PCLK, 1: EXT1CLK + * MOXART_CR_*_INT overflow interrupt enable bit */ -#define TIMEREG_CR_1_ENABLE BIT(0) -#define TIMEREG_CR_1_CLOCK BIT(1) -#define TIMEREG_CR_1_INT BIT(2) -#define TIMEREG_CR_2_ENABLE BIT(3) -#define TIMEREG_CR_2_CLOCK BIT(4) -#define TIMEREG_CR_2_INT BIT(5) -#define TIMEREG_CR_3_ENABLE BIT(6) -#define TIMEREG_CR_3_CLOCK BIT(7) -#define TIMEREG_CR_3_INT BIT(8) -#define TIMEREG_CR_COUNT_UP BIT(9) - -#define TIMER1_ENABLE (TIMEREG_CR_2_ENABLE | TIMEREG_CR_1_ENABLE) -#define TIMER1_DISABLE (TIMEREG_CR_2_ENABLE) - -static void __iomem *base; -static unsigned int clock_count_per_tick; +#define MOXART_CR_1_ENABLE BIT(0) +#define MOXART_CR_1_CLOCK BIT(1) +#define MOXART_CR_1_INT BIT(2) +#define MOXART_CR_2_ENABLE BIT(3) +#define MOXART_CR_2_CLOCK BIT(4) +#define MOXART_CR_2_INT BIT(5) +#define MOXART_CR_3_ENABLE BIT(6) +#define MOXART_CR_3_CLOCK BIT(7) +#define MOXART_CR_3_INT BIT(8) +#define MOXART_CR_COUNT_UP BIT(9) + +#define MOXART_TIMER1_ENABLE (MOXART_CR_2_ENABLE | MOXART_CR_1_ENABLE) +#define MOXART_TIMER1_DISABLE (MOXART_CR_2_ENABLE) + +struct moxart_timer { + void __iomem *base; + unsigned int t1_disable_val; + unsigned int t1_enable_val; + unsigned int count_per_tick; + struct clock_event_device clkevt; + struct irqaction act; +}; + +static inline struct moxart_timer *to_moxart(struct clock_event_device *evt) +{ + return container_of(evt, struct moxart_timer, clkevt); +} static inline void moxart_disable(struct clock_event_device *evt) { - writel(TIMER1_DISABLE, base + TIMER_CR); + struct moxart_timer *timer = to_moxart(evt); + + writel(timer->t1_disable_val, timer->base + TIMER_CR); } static inline void moxart_enable(struct clock_event_device *evt) { - writel(TIMER1_ENABLE, base + TIMER_CR); + struct moxart_timer *timer = to_moxart(evt); + + writel(timer->t1_enable_val, timer->base + TIMER_CR); } static int moxart_shutdown(struct clock_event_device *evt) @@ -77,13 +93,17 @@ static int moxart_shutdown(struct clock_event_device *evt) static int moxart_set_oneshot(struct clock_event_device *evt) { moxart_disable(evt); - writel(~0, base + TIMER1_BASE + REG_LOAD); + writel(~0, to_moxart(evt)->base + TIMER1_BASE + REG_LOAD); return 0; } static int moxart_set_periodic(struct clock_event_device *evt) { - writel(clock_count_per_tick, base + TIMER1_BASE + REG_LOAD); + struct moxart_timer *timer = to_moxart(evt); + + moxart_disable(evt); + writel(timer->count_per_tick, timer->base + TIMER1_BASE + REG_LOAD); + writel(0, timer->base + TIMER1_BASE + REG_MATCH1); moxart_enable(evt); return 0; } @@ -91,30 +111,19 @@ static int moxart_set_periodic(struct clock_event_device *evt) static int moxart_clkevt_next_event(unsigned long cycles, struct clock_event_device *evt) { + struct moxart_timer *timer = to_moxart(evt); u32 u; moxart_disable(evt); - u = readl(base + TIMER1_BASE + 
REG_COUNT) - cycles; - writel(u, base + TIMER1_BASE + REG_MATCH1); + u = readl(timer->base + TIMER1_BASE + REG_COUNT) - cycles; + writel(u, timer->base + TIMER1_BASE + REG_MATCH1); moxart_enable(evt); return 0; } -static struct clock_event_device moxart_clockevent = { - .name = "moxart_timer", - .rating = 200, - .features = CLOCK_EVT_FEAT_PERIODIC | - CLOCK_EVT_FEAT_ONESHOT, - .set_state_shutdown = moxart_shutdown, - .set_state_periodic = moxart_set_periodic, - .set_state_oneshot = moxart_set_oneshot, - .tick_resume = moxart_set_oneshot, - .set_next_event = moxart_clkevt_next_event, -}; - static irqreturn_t moxart_timer_interrupt(int irq, void *dev_id) { struct clock_event_device *evt = dev_id; @@ -122,21 +131,19 @@ static irqreturn_t moxart_timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static struct irqaction moxart_timer_irq = { - .name = "moxart-timer", - .flags = IRQF_TIMER, - .handler = moxart_timer_interrupt, - .dev_id = &moxart_clockevent, -}; - static int __init moxart_timer_init(struct device_node *node) { int ret, irq; unsigned long pclk; struct clk *clk; + struct moxart_timer *timer; - base = of_iomap(node, 0); - if (!base) { + timer = kzalloc(sizeof(*timer), GFP_KERNEL); + if (!timer) + return -ENOMEM; + + timer->base = of_iomap(node, 0); + if (!timer->base) { pr_err("%s: of_iomap failed\n", node->full_name); return -ENXIO; } @@ -147,12 +154,6 @@ static int __init moxart_timer_init(struct device_node *node) return -EINVAL; } - ret = setup_irq(irq, &moxart_timer_irq); - if (ret) { - pr_err("%s: setup_irq failed\n", node->full_name); - return ret; - } - clk = of_clk_get(node, 0); if (IS_ERR(clk)) { pr_err("%s: of_clk_get failed\n", node->full_name); @@ -161,7 +162,31 @@ static int __init moxart_timer_init(struct device_node *node) pclk = clk_get_rate(clk); - ret = clocksource_mmio_init(base + TIMER2_BASE + REG_COUNT, + if (of_device_is_compatible(node, "moxa,moxart-timer")) { + timer->t1_enable_val = MOXART_TIMER1_ENABLE; + timer->t1_disable_val = MOXART_TIMER1_DISABLE; + } else + panic("%s: unknown platform\n", node->full_name); + + timer->count_per_tick = DIV_ROUND_CLOSEST(pclk, HZ); + + timer->clkevt.name = node->name; + timer->clkevt.rating = 200; + timer->clkevt.features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT; + timer->clkevt.set_state_shutdown = moxart_shutdown; + timer->clkevt.set_state_periodic = moxart_set_periodic; + timer->clkevt.set_state_oneshot = moxart_set_oneshot; + timer->clkevt.tick_resume = moxart_set_oneshot; + timer->clkevt.set_next_event = moxart_clkevt_next_event; + timer->clkevt.cpumask = cpumask_of(0); + timer->clkevt.irq = irq; + timer->act.name = node->name; + timer->act.flags = IRQF_TIMER; + timer->act.handler = moxart_timer_interrupt; + timer->act.dev_id = &timer->clkevt; + + ret = clocksource_mmio_init(timer->base + TIMER2_BASE + REG_COUNT, "moxart_timer", pclk, 200, 32, clocksource_mmio_readl_down); if (ret) { @@ -169,13 +194,14 @@ static int __init moxart_timer_init(struct device_node *node) return ret; } - clock_count_per_tick = DIV_ROUND_CLOSEST(pclk, HZ); - - writel(~0, base + TIMER2_BASE + REG_LOAD); - writel(TIMEREG_CR_2_ENABLE, base + TIMER_CR); + ret = setup_irq(irq, &timer->act); + if (ret) { + pr_err("%s: setup_irq failed\n", node->full_name); + return ret; + } - moxart_clockevent.cpumask = cpumask_of(0); - moxart_clockevent.irq = irq; + writel(~0, timer->base + TIMER2_BASE + REG_LOAD); + writel(timer->t1_disable_val, timer->base + TIMER_CR); /* * documentation is not publicly available: @@ -183,8 +209,7 @@ 
static int __init moxart_timer_init(struct device_node *node) * max_delta 0xfffffffe should be ok because count * register size is u32 */ - clockevents_config_and_register(&moxart_clockevent, pclk, - 0x4, 0xfffffffe); + clockevents_config_and_register(&timer->clkevt, pclk, 0x4, 0xfffffffe); return 0; } From ba36d53db536d31c49c139484e82581eeb377278 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 21 Jul 2016 23:13:53 +0930 Subject: [PATCH 218/538] clocksource/drivers/moxart: Add Aspeed support The Aspeed SoC has timer IP with a very similar register layout to the moxart timer. This patch adds support for the fourth and fifth gen aspeed SoCs, and has been tested on the ast2400 and ast2500. Signed-off-by: Joel Stanley Acked-by: Rob Herring Signed-off-by: Daniel Lezcano --- .../bindings/timer/moxa,moxart-timer.txt | 4 ++- drivers/clocksource/moxart_timer.c | 32 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/timer/moxa,moxart-timer.txt b/Documentation/devicetree/bindings/timer/moxa,moxart-timer.txt index da2d510cae47..e207c11630af 100644 --- a/Documentation/devicetree/bindings/timer/moxa,moxart-timer.txt +++ b/Documentation/devicetree/bindings/timer/moxa,moxart-timer.txt @@ -2,7 +2,9 @@ MOXA ART timer Required properties: -- compatible : Must be "moxa,moxart-timer" +- compatible : Must be one of: + - "moxa,moxart-timer" + - "aspeed,ast2400-timer" - reg : Should contain registers location and length - interrupts : Should contain the timer interrupt number - clocks : Should contain phandle for the clock that drives the counter diff --git a/drivers/clocksource/moxart_timer.c b/drivers/clocksource/moxart_timer.c index cb0b34786a8e..ad2bead9ce45 100644 --- a/drivers/clocksource/moxart_timer.c +++ b/drivers/clocksource/moxart_timer.c @@ -56,6 +56,23 @@ #define MOXART_TIMER1_ENABLE (MOXART_CR_2_ENABLE | MOXART_CR_1_ENABLE) #define MOXART_TIMER1_DISABLE (MOXART_CR_2_ENABLE) +/* + * The ASpeed variant of the IP block has a different layout + * for the control register + */ +#define ASPEED_CR_1_ENABLE BIT(0) +#define ASPEED_CR_1_CLOCK BIT(1) +#define ASPEED_CR_1_INT BIT(2) +#define ASPEED_CR_2_ENABLE BIT(4) +#define ASPEED_CR_2_CLOCK BIT(5) +#define ASPEED_CR_2_INT BIT(6) +#define ASPEED_CR_3_ENABLE BIT(8) +#define ASPEED_CR_3_CLOCK BIT(9) +#define ASPEED_CR_3_INT BIT(10) + +#define ASPEED_TIMER1_ENABLE (ASPEED_CR_2_ENABLE | ASPEED_CR_1_ENABLE) +#define ASPEED_TIMER1_DISABLE (ASPEED_CR_2_ENABLE) + struct moxart_timer { void __iomem *base; unsigned int t1_disable_val; @@ -165,6 +182,9 @@ static int __init moxart_timer_init(struct device_node *node) if (of_device_is_compatible(node, "moxa,moxart-timer")) { timer->t1_enable_val = MOXART_TIMER1_ENABLE; timer->t1_disable_val = MOXART_TIMER1_DISABLE; + } else if (of_device_is_compatible(node, "aspeed,ast2400-timer")) { + timer->t1_enable_val = ASPEED_TIMER1_ENABLE; + timer->t1_disable_val = ASPEED_TIMER1_DISABLE; } else panic("%s: unknown platform\n", node->full_name); @@ -200,6 +220,17 @@ static int __init moxart_timer_init(struct device_node *node) return ret; } + /* Clear match registers */ + writel(0, timer->base + TIMER1_BASE + REG_MATCH1); + writel(0, timer->base + TIMER1_BASE + REG_MATCH2); + writel(0, timer->base + TIMER2_BASE + REG_MATCH1); + writel(0, timer->base + TIMER2_BASE + REG_MATCH2); + + /* + * Start timer 2 rolling as our main wall clock source, keep timer 1 + * disabled + */ + writel(0, timer->base + TIMER_CR); writel(~0, timer->base + TIMER2_BASE + REG_LOAD); 
 	writel(timer->t1_disable_val, timer->base + TIMER_CR);
 
@@ -214,3 +245,4 @@ static int __init moxart_timer_init(struct device_node *node)
 	return 0;
 }
 CLOCKSOURCE_OF_DECLARE(moxart, "moxa,moxart-timer", moxart_timer_init);
+CLOCKSOURCE_OF_DECLARE(aspeed, "aspeed,ast2400-timer", moxart_timer_init);

From f99fd22e4d4bc84880a8a3117311bbf0e3a6a9dc Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Tue, 6 Sep 2016 13:22:10 -0400
Subject: [PATCH 219/538] x86/hpet: Reduce HPET counter read contention

On a large system with many CPUs, using HPET as the clock source can have a
significant impact on the overall system performance because of the following
reasons:
 1) There is a single HPET counter shared by all the CPUs.
 2) HPET counter reading is a very slow operation.

Using HPET as the default clock source may happen when, for example, the TSC
clock calibration exceeds the allowable tolerance. Sometimes the performance
slowdown can be so severe that the system may crash because of an NMI watchdog
soft lockup, for example.

During the TSC clock calibration process, the default clock source will be set
temporarily to HPET. For systems with many CPUs, it is possible that an NMI
watchdog soft lockup may occur occasionally during that short time period
where HPET clocking is active, as is shown in the kernel log below:

[ 71.646504] hpet0: 8 comparators, 64-bit 14.318180 MHz counter
[ 71.655313] Switching to clocksource hpet
[ 95.679135] BUG: soft lockup - CPU#144 stuck for 23s! [swapper/144:0]
[ 95.693363] BUG: soft lockup - CPU#145 stuck for 23s! [swapper/145:0]
[ 95.695580] BUG: soft lockup - CPU#582 stuck for 23s! [swapper/582:0]
[ 95.698128] BUG: soft lockup - CPU#357 stuck for 23s! [swapper/357:0]

This patch addresses the above issues by reducing HPET read contention: if
more than one CPU is trying to access the HPET at the same time, it is more
efficient for only one CPU in the group to read the HPET counter and share it
with the rest of the group, instead of each group member reading the HPET
counter individually.

This is done by using a combination quadword that contains a 32-bit stored
HPET value and a 32-bit spinlock. The CPU that gets the lock will be
responsible for reading the HPET counter and storing it in the quadword. The
others will monitor the change in HPET value and lock status and grab the
latest stored HPET value accordingly. This change is only enabled on 64-bit
SMP configurations.

On a 4-socket Haswell-EX box with 144 threads (HT on), running the AIM7
compute workload (1500 users) on a 4.8-rc1 kernel (HZ=1000) with and without
the patch has the following performance numbers (with HPET or TSC as clock
source):

TSC             = 1042431 jobs/min
HPET w/o patch  =  798068 jobs/min
HPET with patch = 1029445 jobs/min

The perf profile showed a reduction of the %CPU time consumed by read_hpet
from 11.19% without patch to 1.24% with patch.

[ tglx: It's really sad that we need to have such hacks just to deal with the
  fact that cpu vendors have not managed to fix the TSC wreckage within 15+
  years.
] Signed-off-by: Waiman Long Tested-by: Prarit Bhargava Cc: Scott J Norton Cc: Douglas Hatch Cc: Randy Wright Cc: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1473182530-29175-1-git-send-email-Waiman.Long@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 94 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c6dfd801df97..274fab99169d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -756,10 +756,104 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd) /* * Clock source related code */ +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) +/* + * Reading the HPET counter is a very slow operation. If a large number of + * CPUs are trying to access the HPET counter simultaneously, it can cause + * massive delay and slow down system performance dramatically. This may + * happen when HPET is the default clock source instead of TSC. For a + * really large system with hundreds of CPUs, the slowdown may be so + * severe that it may actually crash the system because of a NMI watchdog + * soft lockup, for example. + * + * If multiple CPUs are trying to access the HPET counter at the same time, + * we don't actually need to read the counter multiple times. Instead, the + * other CPUs can use the counter value read by the first CPU in the group. + * + * This special feature is only enabled on x86-64 systems. It is unlikely + * that 32-bit x86 systems will have enough CPUs to require this feature + * with its associated locking overhead. And we also need 64-bit atomic + * read. + * + * The lock and the hpet value are stored together and can be read in a + * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t + * is 32 bits in size. + */ +union hpet_lock { + struct { + arch_spinlock_t lock; + u32 value; + }; + u64 lockval; +}; + +static union hpet_lock hpet __cacheline_aligned = { + { .lock = __ARCH_SPIN_LOCK_UNLOCKED, }, +}; + +static cycle_t read_hpet(struct clocksource *cs) +{ + unsigned long flags; + union hpet_lock old, new; + + BUILD_BUG_ON(sizeof(union hpet_lock) != 8); + + /* + * Read HPET directly if in NMI. + */ + if (in_nmi()) + return (cycle_t)hpet_readl(HPET_COUNTER); + + /* + * Read the current state of the lock and HPET value atomically. + */ + old.lockval = READ_ONCE(hpet.lockval); + + if (arch_spin_is_locked(&old.lock)) + goto contended; + + local_irq_save(flags); + if (arch_spin_trylock(&hpet.lock)) { + new.value = hpet_readl(HPET_COUNTER); + /* + * Use WRITE_ONCE() to prevent store tearing. + */ + WRITE_ONCE(hpet.value, new.value); + arch_spin_unlock(&hpet.lock); + local_irq_restore(flags); + return (cycle_t)new.value; + } + local_irq_restore(flags); + +contended: + /* + * Contended case + * -------------- + * Wait until the HPET value change or the lock is free to indicate + * its value is up-to-date. + * + * It is possible that old.value has already contained the latest + * HPET value while the lock holder was in the process of releasing + * the lock. Checking for lock state change will enable us to return + * the value immediately instead of waiting for the next HPET reader + * to come along. + */ + do { + cpu_relax(); + new.lockval = READ_ONCE(hpet.lockval); + } while ((new.value == old.value) && arch_spin_is_locked(&new.lock)); + + return (cycle_t)new.value; +} +#else +/* + * For UP or 32-bit. 
+ */ static cycle_t read_hpet(struct clocksource *cs) { return (cycle_t)hpet_readl(HPET_COUNTER); } +#endif static struct clocksource clocksource_hpet = { .name = "hpet", From a179b69359feb26ddb148bb6a2c0c53a8d1dc5be Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 24 Aug 2016 05:36:53 -0300 Subject: [PATCH 220/538] [media] cec: don't Feature Abort broadcast msgs when unregistered If the adapter is configured as 'Unregistered', then cec_receive_notify incorrectly thinks that broadcast messages are directed messages. The destination for broadcast messages is 0xf, and the logical address assigned to Unregistered devices is also 0xf and the logic didn't handle that correctly. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/cec/cec-adap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/staging/media/cec/cec-adap.c b/drivers/staging/media/cec/cec-adap.c index e980ac9c9279..946986f3ac0d 100644 --- a/drivers/staging/media/cec/cec-adap.c +++ b/drivers/staging/media/cec/cec-adap.c @@ -1409,7 +1409,6 @@ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, u8 init_laddr = cec_msg_initiator(msg); u8 devtype = cec_log_addr2dev(adap, dest_laddr); int la_idx = cec_log_addr2idx(adap, dest_laddr); - bool is_directed = la_idx >= 0; bool from_unregistered = init_laddr == 0xf; struct cec_msg tx_cec_msg = { }; @@ -1571,7 +1570,7 @@ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, * Unprocessed messages are aborted if userspace isn't doing * any processing either. */ - if (is_directed && !is_reply && !adap->follower_cnt && + if (!is_broadcast && !is_reply && !adap->follower_cnt && !adap->cec_follower && msg->msg[1] != CEC_MSG_FEATURE_ABORT) return cec_feature_abort(adap, msg); break; From 60815d4a78204915f5cdf79a536bc96d5d23ae5f Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 24 Aug 2016 07:17:22 -0300 Subject: [PATCH 221/538] [media] cec: fix ioctl return code when not registered Don't return the confusing -EIO error code when the device is not registered, instead return -ENODEV which is the proper thing to do in this situation. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/cec/cec-api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/media/cec/cec-api.c b/drivers/staging/media/cec/cec-api.c index 6f58ee85eea4..e274e2f22398 100644 --- a/drivers/staging/media/cec/cec-api.c +++ b/drivers/staging/media/cec/cec-api.c @@ -435,7 +435,7 @@ static long cec_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) void __user *parg = (void __user *)arg; if (!devnode->registered) - return -EIO; + return -ENODEV; switch (cmd) { case CEC_ADAP_G_CAPS: From 4971531af319f8bdd9a81a87eecfb6b19f2f8c8e Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 21 Jun 2016 23:11:38 +0100 Subject: [PATCH 222/538] x86/efi: Test for EFI_MEMMAP functionality when iterating EFI memmap Both efi_find_mirror() and efi_fake_memmap() really want to know whether the EFI memory map is available, not just whether the machine was booted using EFI. efi_fake_memmap() even has a check for EFI_MEMMAP at the start of the function. Since we've already got other code that has this dependency, merge everything under one if() conditional, and remove the now superfluous check from efi_fake_memmap(). 
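[ Illustration, not part of the patch: a schematic, self-contained sketch of
  the restructuring described above. The flag and helper names are
  placeholders rather than the real setup.c symbols; the point is simply that
  everything which walks the EFI memory map sits under the one condition it
  actually depends on. ]

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for efi_enabled(EFI_MEMMAP) and the helpers. */
    static bool have_efi_memmap = true;

    static void fake_memmap(void)           { puts("fake_memmap");           }
    static void find_mirror(void)           { puts("find_mirror");           }
    static void reserve_boot_services(void) { puts("reserve_boot_services"); }

    int main(void)
    {
            /*
             * All three helpers need the memory map, so gate them with a
             * single check; fake_memmap() no longer needs an internal
             * re-check of the same condition.
             */
            if (have_efi_memmap) {
                    fake_memmap();
                    find_mirror();
                    reserve_boot_services();
            }
            return 0;
    }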
Tested-by: Dave Young [kexec/kdump] Tested-by: Ard Biesheuvel [arm] Acked-by: Ard Biesheuvel Cc: Taku Izumi Cc: Tony Luck Cc: Xishi Qiu Cc: Kamezawa Hiroyuki Signed-off-by: Matt Fleming --- arch/x86/kernel/setup.c | 17 ++++++++--------- drivers/firmware/efi/fake_mem.c | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa60f5f5a16..4fd69e532c15 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1096,19 +1096,18 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); - if (efi_enabled(EFI_BOOT)) { + reserve_bios_regions(); + + if (efi_enabled(EFI_MEMMAP)) { efi_fake_memmap(); efi_find_mirror(); - } - - reserve_bios_regions(); - /* - * The EFI specification says that boot service code won't be called - * after ExitBootServices(). This is, in fact, a lie. - */ - if (efi_enabled(EFI_MEMMAP)) + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ efi_reserve_boot_services(); + } /* preallocate 4k for mptable mpc */ early_reserve_e820_mpc_new(); diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index 48430aba13c1..c437388a7b85 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -64,7 +64,7 @@ void __init efi_fake_memmap(void) void *old, *new; int i; - if (!nr_fake_mem || !efi_enabled(EFI_MEMMAP)) + if (!nr_fake_mem) return; /* count up the number of EFI memory descriptor */ From ab72a27da4c6c19b0e3d6d7556fdd4afb581c8ac Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 20 Jun 2016 14:36:51 +0100 Subject: [PATCH 223/538] x86/efi: Consolidate region mapping logic EFI regions are currently mapped in two separate places. The bulk of the work is done in efi_map_regions() but when CONFIG_EFI_MIXED is enabled the additional regions that are required when operating in mixed mode are mapping in efi_setup_page_tables(). Pull everything into efi_map_regions() and refactor the test for which regions should be mapped into a should_map_region() function. Generously sprinkle comments to clarify the different cases. Acked-by: Borislav Petkov Tested-by: Dave Young [kexec/kdump] Tested-by: Ard Biesheuvel [arm] Acked-by: Ard Biesheuvel Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 50 +++++++++++++++++++++++++++++----- arch/x86/platform/efi/efi_64.c | 20 -------------- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1fbb408e2e72..625ec729b4e8 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -745,6 +745,46 @@ static void *efi_map_next_entry(void *entry) return entry; } +static bool should_map_region(efi_memory_desc_t *md) +{ + /* + * Runtime regions always require runtime mappings (obviously). + */ + if (md->attribute & EFI_MEMORY_RUNTIME) + return true; + + /* + * 32-bit EFI doesn't suffer from the bug that requires us to + * reserve boot services regions, and mixed mode support + * doesn't exist for 32-bit kernels. + */ + if (IS_ENABLED(CONFIG_X86_32)) + return false; + + /* + * Map all of RAM so that we can access arguments in the 1:1 + * mapping when making EFI runtime calls. 
+ */ + if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_is_native()) { + if (md->type == EFI_CONVENTIONAL_MEMORY || + md->type == EFI_LOADER_DATA || + md->type == EFI_LOADER_CODE) + return true; + } + + /* + * Map boot services regions as a workaround for buggy + * firmware that accesses them even when they shouldn't. + * + * See efi_{reserve,free}_boot_services(). + */ + if (md->type == EFI_BOOT_SERVICES_CODE || + md->type == EFI_BOOT_SERVICES_DATA) + return true; + + return false; +} + /* * Map the efi memory ranges of the runtime services and update new_mmap with * virtual addresses. @@ -761,13 +801,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift) p = NULL; while ((p = efi_map_next_entry(p))) { md = p; - if (!(md->attribute & EFI_MEMORY_RUNTIME)) { -#ifdef CONFIG_X86_64 - if (md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) -#endif - continue; - } + + if (!should_map_region(md)) + continue; efi_map_region(md); get_systab_virt_addr(md); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 677e29e29473..45434ea345e9 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -214,7 +214,6 @@ void efi_sync_low_kernel_mappings(void) int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { unsigned long pfn, text; - efi_memory_desc_t *md; struct page *page; unsigned npages; pgd_t *pgd; @@ -248,25 +247,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) if (!IS_ENABLED(CONFIG_EFI_MIXED)) return 0; - /* - * Map all of RAM so that we can access arguments in the 1:1 - * mapping when making EFI runtime calls. - */ - for_each_efi_memory_desc(md) { - if (md->type != EFI_CONVENTIONAL_MEMORY && - md->type != EFI_LOADER_DATA && - md->type != EFI_LOADER_CODE) - continue; - - pfn = md->phys_addr >> PAGE_SHIFT; - npages = md->num_pages; - - if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, npages, _PAGE_RW)) { - pr_err("Failed to map 1:1 memory\n"); - return 1; - } - } - page = alloc_page(GFP_KERNEL|__GFP_DMA32); if (!page) panic("Unable to allocate EFI runtime stack < 4GB\n"); From 9479c7cebfb568f8b8b424be7f1cac120e9eea95 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 26 Feb 2016 21:22:05 +0000 Subject: [PATCH 224/538] efi: Refactor efi_memmap_init_early() into arch-neutral code Every EFI architecture apart from ia64 needs to setup the EFI memory map at efi.memmap, and the code for doing that is essentially the same across all implementations. Therefore, it makes sense to factor this out into the common code under drivers/firmware/efi/. The only slight variation is the data structure out of which we pull the initial memory map information, such as physical address, memory descriptor size and version, etc. We can address this by passing a generic data structure (struct efi_memory_map_data) as the argument to efi_memmap_init_early() which contains the minimum info required for initialising the memory map. In the process, this patch also fixes a few undesirable implementation differences: - ARM and arm64 were failing to clear the EFI_MEMMAP bit when unmapping the early EFI memory map. EFI_MEMMAP indicates whether the EFI memory map is mapped (not the regions contained within) and can be traversed. It's more correct to set the bit as soon as we memremap() the passed in EFI memmap. - Rename efi_unmmap_memmap() to efi_memmap_unmap() to adhere to the regular naming scheme. 
This patch also uses a read-write mapping for the memory map instead of the read-only mapping currently used on ARM and arm64. x86 needs the ability to update the memory map in-place when assigning virtual addresses to regions (efi_map_region()) and tagging regions when reserving boot services (efi_reserve_boot_services()). There's no way for the generic fake_mem code to know which mapping to use without introducing some arch-specific constant/hook, so just use read-write since read-only is of dubious value for the EFI memory map. Tested-by: Dave Young [kexec/kdump] Tested-by: Ard Biesheuvel [arm] Acked-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Peter Jones Cc: Borislav Petkov Cc: Mark Rutland Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 1 - arch/x86/platform/efi/efi.c | 66 ++++++++---------------------- arch/x86/platform/efi/quirks.c | 4 +- drivers/firmware/efi/arm-init.c | 17 ++++---- drivers/firmware/efi/arm-runtime.c | 2 +- drivers/firmware/efi/efi.c | 46 +++++++++++++++++++++ drivers/firmware/efi/fake_mem.c | 15 ++++--- include/linux/efi.h | 16 ++++++++ 8 files changed, 99 insertions(+), 68 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index d0bb76d81402..4630e2bfa8fb 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -117,7 +117,6 @@ extern int __init efi_memblock_x86_reserve_range(void); extern pgd_t * __init efi_call_phys_prolog(void); extern void __init efi_call_phys_epilog(pgd_t *save_pgd); extern void __init efi_print_memmap(void); -extern void __init efi_unmap_memmap(void); extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); extern void __init efi_map_region_fixed(efi_memory_desc_t *md); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 625ec729b4e8..5ccde8b6cdd1 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -172,7 +172,9 @@ static void __init do_add_efi_memmap(void) int __init efi_memblock_x86_reserve_range(void) { struct efi_info *e = &boot_params.efi_info; + struct efi_memory_map_data data; phys_addr_t pmap; + int rv; if (efi_enabled(EFI_PARAVIRT)) return 0; @@ -187,11 +189,17 @@ int __init efi_memblock_x86_reserve_range(void) #else pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); #endif - efi.memmap.phys_map = pmap; - efi.memmap.nr_map = e->efi_memmap_size / - e->efi_memdesc_size; - efi.memmap.desc_size = e->efi_memdesc_size; - efi.memmap.desc_version = e->efi_memdesc_version; + data.phys_map = pmap; + data.size = e->efi_memmap_size; + data.desc_size = e->efi_memdesc_size; + data.desc_version = e->efi_memdesc_version; + + rv = efi_memmap_init_early(&data); + if (rv) + return rv; + + if (add_efi_memmap) + do_add_efi_memmap(); WARN(efi.memmap.desc_version != 1, "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", @@ -218,19 +226,6 @@ void __init efi_print_memmap(void) } } -void __init efi_unmap_memmap(void) -{ - unsigned long size; - - clear_bit(EFI_MEMMAP, &efi.flags); - - size = efi.memmap.nr_map * efi.memmap.desc_size; - if (efi.memmap.map) { - early_memunmap(efi.memmap.map, size); - efi.memmap.map = NULL; - } -} - static int __init efi_systab_init(void *phys) { if (efi_enabled(EFI_64BIT)) { @@ -414,33 +409,6 @@ static int __init efi_runtime_init(void) return 0; } -static int __init efi_memmap_init(void) -{ - unsigned long addr, size; - - if (efi_enabled(EFI_PARAVIRT)) - return 0; - - /* Map the EFI memory map */ - size = efi.memmap.nr_map * efi.memmap.desc_size; - 
addr = (unsigned long)efi.memmap.phys_map; - - efi.memmap.map = early_memremap(addr, size); - if (efi.memmap.map == NULL) { - pr_err("Could not map the memory map!\n"); - return -ENOMEM; - } - - efi.memmap.map_end = efi.memmap.map + size; - - if (add_efi_memmap) - do_add_efi_memmap(); - - set_bit(EFI_MEMMAP, &efi.flags); - - return 0; -} - void __init efi_init(void) { efi_char16_t *c16; @@ -498,11 +466,11 @@ void __init efi_init(void) if (!efi_runtime_supported()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else { - if (efi_runtime_disabled() || efi_runtime_init()) + if (efi_runtime_disabled() || efi_runtime_init()) { + efi_memmap_unmap(); return; + } } - if (efi_memmap_init()) - return; if (efi_enabled(EFI_DBG)) efi_print_memmap(); @@ -839,7 +807,7 @@ static void __init kexec_enter_virtual_mode(void) * non-native EFI */ if (!efi_is_native()) { - efi_unmap_memmap(); + efi_memmap_unmap(); clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 89d1146f5a6f..47b99108ff8e 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -287,7 +287,7 @@ void __init efi_free_boot_services(void) free_bootmem_late(start, size); } - efi_unmap_memmap(); + efi_memmap_unmap(); } /* @@ -365,7 +365,7 @@ void __init efi_apply_memmap_quirks(void) */ if (!efi_runtime_supported()) { pr_info("Setup done, disabling due to 32/64-bit mismatch\n"); - efi_unmap_memmap(); + efi_memmap_unmap(); } /* UV2+ BIOS has a fix for this issue. UV1 still needs the quirk. */ diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index c49d50e68aee..5a2df3fefccc 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -211,12 +211,11 @@ static __init void reserve_regions(void) memblock_mark_nomap(paddr, size); } - - set_bit(EFI_MEMMAP, &efi.flags); } void __init efi_init(void) { + struct efi_memory_map_data data; struct efi_fdt_params params; /* Grab UEFI information placed in FDT by stub */ @@ -225,9 +224,12 @@ void __init efi_init(void) efi_system_table = params.system_table; - efi.memmap.phys_map = params.mmap; - efi.memmap.map = early_memremap_ro(params.mmap, params.mmap_size); - if (efi.memmap.map == NULL) { + data.desc_version = params.desc_ver; + data.desc_size = params.desc_size; + data.size = params.mmap_size; + data.phys_map = params.mmap; + + if (efi_memmap_init_early(&data) < 0) { /* * If we are booting via UEFI, the UEFI memory map is the only * description of memory we have, so there is little point in @@ -235,9 +237,6 @@ void __init efi_init(void) */ panic("Unable to map EFI memory map.\n"); } - efi.memmap.map_end = efi.memmap.map + params.mmap_size; - efi.memmap.desc_size = params.desc_size; - efi.memmap.desc_version = params.desc_ver; WARN(efi.memmap.desc_version != 1, "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", @@ -248,7 +247,7 @@ void __init efi_init(void) reserve_regions(); efi_memattr_init(); - early_memunmap(efi.memmap.map, params.mmap_size); + efi_memmap_unmap(); memblock_reserve(params.mmap & PAGE_MASK, PAGE_ALIGN(params.mmap_size + diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index c394b81fe452..eedb30351a68 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -114,7 +114,7 @@ static int __init arm_enable_runtime_services(void) pr_info("Remapping and enabling EFI services.\n"); - mapsize = efi.memmap.map_end - efi.memmap.map; + mapsize = 
efi.memmap.desc_size * efi.memmap.nr_map; efi.memmap.map = memremap(efi.memmap.phys_map, mapsize, MEMREMAP_WB); if (!efi.memmap.map) { diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 5a2631af7410..c1879999abe7 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -544,6 +544,52 @@ int __init efi_config_init(efi_config_table_type_t *arch_tables) return ret; } +/** + * efi_memmap_init_early - Map the EFI memory map data structure + * @data: EFI memory map data + * + * Use early_memremap() to map the passed in EFI memory map and assign + * it to efi.memmap. + */ +int __init efi_memmap_init_early(struct efi_memory_map_data *data) +{ + struct efi_memory_map map; + + if (efi_enabled(EFI_PARAVIRT)) + return 0; + + map.phys_map = data->phys_map; + + map.map = early_memremap(data->phys_map, data->size); + if (!map.map) { + pr_err("Could not map the memory map!\n"); + return -ENOMEM; + } + + map.nr_map = data->size / data->desc_size; + map.map_end = map.map + data->size; + + map.desc_version = data->desc_version; + map.desc_size = data->desc_size; + + set_bit(EFI_MEMMAP, &efi.flags); + + efi.memmap = map; + + return 0; +} + +void __init efi_memmap_unmap(void) +{ + unsigned long size; + + size = efi.memmap.desc_size * efi.memmap.nr_map; + + early_memunmap(efi.memmap.map, size); + efi.memmap.map = NULL; + clear_bit(EFI_MEMMAP, &efi.flags); +} + #ifdef CONFIG_EFI_VARS_MODULE static int __init efi_load_efivars(void) { diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index c437388a7b85..939eec47139f 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -57,6 +57,7 @@ static int __init cmp_fake_mem(const void *x1, const void *x2) void __init efi_fake_memmap(void) { u64 start, end, m_start, m_end, m_attr; + struct efi_memory_map_data data; int new_nr_map = efi.memmap.nr_map; efi_memory_desc_t *md; phys_addr_t new_memmap_phy; @@ -180,12 +181,14 @@ void __init efi_fake_memmap(void) } /* swap into new EFI memmap */ - efi_unmap_memmap(); - efi.memmap.map = new_memmap; - efi.memmap.phys_map = new_memmap_phy; - efi.memmap.nr_map = new_nr_map; - efi.memmap.map_end = efi.memmap.map + efi.memmap.nr_map * efi.memmap.desc_size; - set_bit(EFI_MEMMAP, &efi.flags); + early_memunmap(new_memmap, efi.memmap.desc_size * new_nr_map); + efi_memmap_unmap(); + + data.phys_map = new_memmap_phy; + data.size = efi.memmap.desc_size * new_nr_map; + data.desc_version = efi.memmap.desc_version; + data.desc_size = efi.memmap.desc_size; + efi_memmap_init_early(&data); /* print new EFI memmap */ efi_print_memmap(); diff --git a/include/linux/efi.h b/include/linux/efi.h index 7f5a58225385..d862d4998580 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -669,6 +669,18 @@ typedef struct { unsigned long tables; } efi_system_table_t; +/* + * Architecture independent structure for describing a memory map for the + * benefit of efi_memmap_init_early(), saving us the need to pass four + * parameters. 
+ */ +struct efi_memory_map_data { + phys_addr_t phys_map; + unsigned long size; + unsigned long desc_version; + unsigned long desc_size; +}; + struct efi_memory_map { phys_addr_t phys_map; void *map; @@ -900,6 +912,10 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, } #endif extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); + +extern int __init efi_memmap_init_early(struct efi_memory_map_data *data); +extern void __init efi_memmap_unmap(void); + extern int efi_config_init(efi_config_table_type_t *arch_tables); #ifdef CONFIG_EFI_ESRT extern void __init efi_esrt_init(void); From dca0f971ea6fcf2f1bb78f7995adf80da9f4767f Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Sat, 27 Feb 2016 15:52:50 +0000 Subject: [PATCH 225/538] efi: Add efi_memmap_init_late() for permanent EFI memmap Drivers need a way to access the EFI memory map at runtime. ARM and arm64 currently provide this by remapping the EFI memory map into the vmalloc space before setting up the EFI virtual mappings. x86 does not provide this functionality which has resulted in the code in efi_mem_desc_lookup() where it will manually map individual EFI memmap entries if the memmap has already been torn down on x86, /* * If a driver calls this after efi_free_boot_services, * ->map will be NULL, and the target may also not be mapped. * So just always get our own virtual map on the CPU. * */ md = early_memremap(p, sizeof (*md)); There isn't a good reason for not providing a permanent EFI memory map for runtime queries, especially since the EFI regions are not mapped into the standard kernel page tables. Tested-by: Dave Young [kexec/kdump] Tested-by: Ard Biesheuvel [arm] Acked-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Peter Jones Cc: Borislav Petkov Cc: Mark Rutland Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 44 +++++++--- arch/x86/platform/efi/quirks.c | 2 - drivers/firmware/efi/arm-runtime.c | 4 +- drivers/firmware/efi/efi.c | 135 ++++++++++++++++++++--------- include/linux/efi.h | 2 + 5 files changed, 130 insertions(+), 57 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 5ccde8b6cdd1..33996987ac70 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -827,6 +827,19 @@ static void __init kexec_enter_virtual_mode(void) get_systab_virt_addr(md); } + /* + * Unregister the early EFI memmap from efi_init() and install + * the new EFI memory map. + */ + efi_memmap_unmap(); + + if (efi_memmap_init_late(efi.memmap.phys_map, + efi.memmap.desc_size * efi.memmap.nr_map)) { + pr_err("Failed to remap late EFI memory map\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } + save_runtime_map(); BUG_ON(!efi.systab); @@ -888,6 +901,7 @@ static void __init __efi_enter_virtual_mode(void) int count = 0, pg_shift = 0; void *new_memmap = NULL; efi_status_t status; + phys_addr_t pa; efi.systab = NULL; @@ -905,11 +919,26 @@ static void __init __efi_enter_virtual_mode(void) return; } + pa = __pa(new_memmap); + + /* + * Unregister the early EFI memmap from efi_init() and install + * the new EFI memory map that we are about to pass to the + * firmware via SetVirtualAddressMap(). 
+ */ + efi_memmap_unmap(); + + if (efi_memmap_init_late(pa, efi.memmap.desc_size * count)) { + pr_err("Failed to remap late EFI memory map\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } + save_runtime_map(); BUG_ON(!efi.systab); - if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) { + if (efi_setup_page_tables(pa, 1 << pg_shift)) { clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -921,14 +950,14 @@ static void __init __efi_enter_virtual_mode(void) efi.memmap.desc_size * count, efi.memmap.desc_size, efi.memmap.desc_version, - (efi_memory_desc_t *)__pa(new_memmap)); + (efi_memory_desc_t *)pa); } else { status = efi_thunk_set_virtual_address_map( efi_phys.set_virtual_address_map, efi.memmap.desc_size * count, efi.memmap.desc_size, efi.memmap.desc_version, - (efi_memory_desc_t *)__pa(new_memmap)); + (efi_memory_desc_t *)pa); } if (status != EFI_SUCCESS) { @@ -960,15 +989,6 @@ static void __init __efi_enter_virtual_mode(void) efi_runtime_update_mappings(); efi_dump_pagetable(); - /* - * We mapped the descriptor array into the EFI pagetable above - * but we're not unmapping it here because if we're running in - * EFI mixed mode we need all of memory to be accessible when - * we pass parameters to the EFI runtime services in the - * thunking code. - */ - free_pages((unsigned long)new_memmap, pg_shift); - /* clean DUMMY object */ efi_delete_dummy_variable(); } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 47b99108ff8e..9faf18874692 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -286,8 +286,6 @@ void __init efi_free_boot_services(void) free_bootmem_late(start, size); } - - efi_memmap_unmap(); } /* diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index eedb30351a68..ae001450545f 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -116,12 +116,10 @@ static int __init arm_enable_runtime_services(void) mapsize = efi.memmap.desc_size * efi.memmap.nr_map; - efi.memmap.map = memremap(efi.memmap.phys_map, mapsize, MEMREMAP_WB); - if (!efi.memmap.map) { + if (efi_memmap_init_late(efi.memmap.phys_map, mapsize)) { pr_err("Failed to remap EFI memory map\n"); return -ENOMEM; } - efi.memmap.map_end = efi.memmap.map + mapsize; if (!efi_virtmap_init()) { pr_err("UEFI virtual mapping missing or invalid -- runtime services will not be available\n"); diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index c1879999abe7..8a5e0db72b8f 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -347,56 +347,31 @@ subsys_initcall(efisubsys_init); /* * Find the efi memory descriptor for a given physical address. Given a - * physicall address, determine if it exists within an EFI Memory Map entry, + * physical address, determine if it exists within an EFI Memory Map entry, * and if so, populate the supplied memory descriptor with the appropriate * data. 
*/ int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) { - struct efi_memory_map *map = &efi.memmap; - phys_addr_t p, e; + efi_memory_desc_t *md; if (!efi_enabled(EFI_MEMMAP)) { pr_err_once("EFI_MEMMAP is not enabled.\n"); return -EINVAL; } - if (!map) { - pr_err_once("efi.memmap is not set.\n"); - return -EINVAL; - } if (!out_md) { pr_err_once("out_md is null.\n"); return -EINVAL; } - if (WARN_ON_ONCE(!map->phys_map)) - return -EINVAL; - if (WARN_ON_ONCE(map->nr_map == 0) || WARN_ON_ONCE(map->desc_size == 0)) - return -EINVAL; - e = map->phys_map + map->nr_map * map->desc_size; - for (p = map->phys_map; p < e; p += map->desc_size) { - efi_memory_desc_t *md; + for_each_efi_memory_desc(md) { u64 size; u64 end; - /* - * If a driver calls this after efi_free_boot_services, - * ->map will be NULL, and the target may also not be mapped. - * So just always get our own virtual map on the CPU. - * - */ - md = early_memremap(p, sizeof (*md)); - if (!md) { - pr_err_once("early_memremap(%pa, %zu) failed.\n", - &p, sizeof (*md)); - return -ENOMEM; - } - if (!(md->attribute & EFI_MEMORY_RUNTIME) && md->type != EFI_BOOT_SERVICES_DATA && md->type != EFI_RUNTIME_SERVICES_DATA) { - early_memunmap(md, sizeof (*md)); continue; } @@ -404,11 +379,8 @@ int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) end = md->phys_addr + size; if (phys_addr >= md->phys_addr && phys_addr < end) { memcpy(out_md, md, sizeof(*out_md)); - early_memunmap(md, sizeof (*md)); return 0; } - - early_memunmap(md, sizeof (*md)); } pr_err_once("requested map not found.\n"); return -ENOENT; @@ -545,32 +517,49 @@ int __init efi_config_init(efi_config_table_type_t *arch_tables) } /** - * efi_memmap_init_early - Map the EFI memory map data structure + * __efi_memmap_init - Common code for mapping the EFI memory map * @data: EFI memory map data + * @late: Use early or late mapping function? * - * Use early_memremap() to map the passed in EFI memory map and assign - * it to efi.memmap. + * This function takes care of figuring out which function to use to + * map the EFI memory map in efi.memmap based on how far into the boot + * we are. + * + * During bootup @late should be %false since we only have access to + * the early_memremap*() functions as the vmalloc space isn't setup. + * Once the kernel is fully booted we can fallback to the more robust + * memremap*() API. + * + * Returns zero on success, a negative error code on failure. 
*/ -int __init efi_memmap_init_early(struct efi_memory_map_data *data) +static int __init +__efi_memmap_init(struct efi_memory_map_data *data, bool late) { struct efi_memory_map map; + phys_addr_t phys_map; if (efi_enabled(EFI_PARAVIRT)) return 0; - map.phys_map = data->phys_map; + phys_map = data->phys_map; + + if (late) + map.map = memremap(phys_map, data->size, MEMREMAP_WB); + else + map.map = early_memremap(phys_map, data->size); - map.map = early_memremap(data->phys_map, data->size); if (!map.map) { pr_err("Could not map the memory map!\n"); return -ENOMEM; } + map.phys_map = data->phys_map; map.nr_map = data->size / data->desc_size; map.map_end = map.map + data->size; map.desc_version = data->desc_version; map.desc_size = data->desc_size; + map.late = late; set_bit(EFI_MEMMAP, &efi.flags); @@ -579,17 +568,83 @@ int __init efi_memmap_init_early(struct efi_memory_map_data *data) return 0; } +/** + * efi_memmap_init_early - Map the EFI memory map data structure + * @data: EFI memory map data + * + * Use early_memremap() to map the passed in EFI memory map and assign + * it to efi.memmap. + */ +int __init efi_memmap_init_early(struct efi_memory_map_data *data) +{ + /* Cannot go backwards */ + WARN_ON(efi.memmap.late); + + return __efi_memmap_init(data, false); +} + void __init efi_memmap_unmap(void) { - unsigned long size; + if (!efi.memmap.late) { + unsigned long size; - size = efi.memmap.desc_size * efi.memmap.nr_map; + size = efi.memmap.desc_size * efi.memmap.nr_map; + early_memunmap(efi.memmap.map, size); + } else { + memunmap(efi.memmap.map); + } - early_memunmap(efi.memmap.map, size); efi.memmap.map = NULL; clear_bit(EFI_MEMMAP, &efi.flags); } +/** + * efi_memmap_init_late - Map efi.memmap with memremap() + * @phys_addr: Physical address of the new EFI memory map + * @size: Size in bytes of the new EFI memory map + * + * Setup a mapping of the EFI memory map using ioremap_cache(). This + * function should only be called once the vmalloc space has been + * setup and is therefore not suitable for calling during early EFI + * initialise, e.g. in efi_init(). Additionally, it expects + * efi_memmap_init_early() to have already been called. + * + * The reason there are two EFI memmap initialisation + * (efi_memmap_init_early() and this late version) is because the + * early EFI memmap should be explicitly unmapped once EFI + * initialisation is complete as the fixmap space used to map the EFI + * memmap (via early_memremap()) is a scarce resource. + * + * This late mapping is intended to persist for the duration of + * runtime so that things like efi_mem_desc_lookup() and + * efi_mem_attributes() always work. + * + * Returns zero on success, a negative error code on failure. + */ +int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size) +{ + struct efi_memory_map_data data = { + .phys_map = addr, + .size = size, + }; + + /* Did we forget to unmap the early EFI memmap? */ + WARN_ON(efi.memmap.map); + + /* Were we already called? */ + WARN_ON(efi.memmap.late); + + /* + * It makes no sense to allow callers to register different + * values for the following fields. Copy them out of the + * existing early EFI memmap. 
+ */ + data.desc_version = efi.memmap.desc_version; + data.desc_size = efi.memmap.desc_size; + + return __efi_memmap_init(&data, true); +} + #ifdef CONFIG_EFI_VARS_MODULE static int __init efi_load_efivars(void) { diff --git a/include/linux/efi.h b/include/linux/efi.h index d862d4998580..f149676b2fcd 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -688,6 +688,7 @@ struct efi_memory_map { int nr_map; unsigned long desc_version; unsigned long desc_size; + bool late; }; struct efi_fdt_params { @@ -914,6 +915,7 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); extern int __init efi_memmap_init_early(struct efi_memory_map_data *data); +extern int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size); extern void __init efi_memmap_unmap(void); extern int efi_config_init(efi_config_table_type_t *arch_tables); From c8c1a4c5e4ead0d2dcf0f0bcb8bdbdcf877fb3bb Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 29 Feb 2016 16:58:18 +0000 Subject: [PATCH 226/538] efi/fake_mem: Refactor main two code chunks into functions There is a whole load of generic EFI memory map code inside of the fake_mem driver which is better suited to being grouped with the rest of the generic EFI code for manipulating EFI memory maps. In preparation for that, this patch refactors the core code, so that it's possible to move entire functions later. Tested-by: Dave Young [kexec/kdump] Tested-by: Ard Biesheuvel [arm] Acked-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Peter Jones Cc: Borislav Petkov Cc: Mark Rutland Cc: Taku Izumi Signed-off-by: Matt Fleming --- drivers/firmware/efi/fake_mem.c | 229 +++++++++++++++++++------------- 1 file changed, 134 insertions(+), 95 deletions(-) diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index 939eec47139f..446c669431c0 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -54,43 +54,151 @@ static int __init cmp_fake_mem(const void *x1, const void *x2) return 0; } +/** + * efi_fake_memmap_split_count - Count number of additional EFI memmap entries + * @md: EFI memory descriptor to split + * @range: Address range (start, end) to split around + * + * Returns the number of additional EFI memmap entries required to + * accomodate @range. + */ +static int efi_fake_memmap_split_count(efi_memory_desc_t *md, struct range *range) +{ + u64 m_start, m_end; + u64 start, end; + int count = 0; + + start = md->phys_addr; + end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + /* modifying range */ + m_start = range->start; + m_end = range->end; + + if (m_start <= start) { + /* split into 2 parts */ + if (start < m_end && m_end < end) + count++; + } + + if (start < m_start && m_start < end) { + /* split into 3 parts */ + if (m_end < end) + count += 2; + /* split into 2 parts */ + if (end <= m_end) + count++; + } + + return count; +} + +/** + * efi_fake_memmap_insert - Insert a fake memory region in an EFI memmap + * @old_memmap: The existing EFI memory map structure + * @buf: Address of buffer to store new map + * @mem: Fake memory map entry to insert + * + * It is suggested that you call efi_fake_memmap_split_count() first + * to see how large @buf needs to be. 
+ */ +static void efi_fake_memmap_insert(struct efi_memory_map *old_memmap, + void *buf, struct fake_mem *mem) +{ + u64 m_start, m_end, m_attr; + efi_memory_desc_t *md; + u64 start, end; + void *old, *new; + + /* modifying range */ + m_start = mem->range.start; + m_end = mem->range.end; + m_attr = mem->attribute; + + for (old = old_memmap->map, new = buf; + old < old_memmap->map_end; + old += old_memmap->desc_size, new += old_memmap->desc_size) { + + /* copy original EFI memory descriptor */ + memcpy(new, old, old_memmap->desc_size); + md = new; + start = md->phys_addr; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + if (m_start <= start && end <= m_end) + md->attribute |= m_attr; + + if (m_start <= start && + (start < m_end && m_end < end)) { + /* first part */ + md->attribute |= m_attr; + md->num_pages = (m_end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + /* latter part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->phys_addr = m_end + 1; + md->num_pages = (end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + } + + if ((start < m_start && m_start < end) && m_end < end) { + /* first part */ + md->num_pages = (m_start - md->phys_addr) >> + EFI_PAGE_SHIFT; + /* middle part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->attribute |= m_attr; + md->phys_addr = m_start; + md->num_pages = (m_end - m_start + 1) >> + EFI_PAGE_SHIFT; + /* last part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->phys_addr = m_end + 1; + md->num_pages = (end - m_end) >> + EFI_PAGE_SHIFT; + } + + if ((start < m_start && m_start < end) && + (end <= m_end)) { + /* first part */ + md->num_pages = (m_start - md->phys_addr) >> + EFI_PAGE_SHIFT; + /* latter part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->phys_addr = m_start; + md->num_pages = (end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + md->attribute |= m_attr; + } + } +} + void __init efi_fake_memmap(void) { - u64 start, end, m_start, m_end, m_attr; struct efi_memory_map_data data; int new_nr_map = efi.memmap.nr_map; efi_memory_desc_t *md; phys_addr_t new_memmap_phy; void *new_memmap; - void *old, *new; int i; if (!nr_fake_mem) return; /* count up the number of EFI memory descriptor */ - for_each_efi_memory_desc(md) { - start = md->phys_addr; - end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1; - - for (i = 0; i < nr_fake_mem; i++) { - /* modifying range */ - m_start = fake_mems[i].range.start; - m_end = fake_mems[i].range.end; - - if (m_start <= start) { - /* split into 2 parts */ - if (start < m_end && m_end < end) - new_nr_map++; - } - if (start < m_start && m_start < end) { - /* split into 3 parts */ - if (m_end < end) - new_nr_map += 2; - /* split into 2 parts */ - if (end <= m_end) - new_nr_map++; - } + for (i = 0; i < nr_fake_mem; i++) { + for_each_efi_memory_desc(md) { + struct range *r = &fake_mems[i].range; + + new_nr_map += efi_fake_memmap_split_count(md, r); } } @@ -108,77 +216,8 @@ void __init efi_fake_memmap(void) return; } - for (old = efi.memmap.map, new = new_memmap; - old < efi.memmap.map_end; - old += efi.memmap.desc_size, new += efi.memmap.desc_size) { - - /* copy original EFI memory descriptor */ - memcpy(new, old, efi.memmap.desc_size); - md = new; - start = md->phys_addr; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; - - for (i = 0; i < nr_fake_mem; i++) { - /* modifying range */ - m_start = fake_mems[i].range.start; - 
m_end = fake_mems[i].range.end; - m_attr = fake_mems[i].attribute; - - if (m_start <= start && end <= m_end) - md->attribute |= m_attr; - - if (m_start <= start && - (start < m_end && m_end < end)) { - /* first part */ - md->attribute |= m_attr; - md->num_pages = (m_end - md->phys_addr + 1) >> - EFI_PAGE_SHIFT; - /* latter part */ - new += efi.memmap.desc_size; - memcpy(new, old, efi.memmap.desc_size); - md = new; - md->phys_addr = m_end + 1; - md->num_pages = (end - md->phys_addr + 1) >> - EFI_PAGE_SHIFT; - } - - if ((start < m_start && m_start < end) && m_end < end) { - /* first part */ - md->num_pages = (m_start - md->phys_addr) >> - EFI_PAGE_SHIFT; - /* middle part */ - new += efi.memmap.desc_size; - memcpy(new, old, efi.memmap.desc_size); - md = new; - md->attribute |= m_attr; - md->phys_addr = m_start; - md->num_pages = (m_end - m_start + 1) >> - EFI_PAGE_SHIFT; - /* last part */ - new += efi.memmap.desc_size; - memcpy(new, old, efi.memmap.desc_size); - md = new; - md->phys_addr = m_end + 1; - md->num_pages = (end - m_end) >> - EFI_PAGE_SHIFT; - } - - if ((start < m_start && m_start < end) && - (end <= m_end)) { - /* first part */ - md->num_pages = (m_start - md->phys_addr) >> - EFI_PAGE_SHIFT; - /* latter part */ - new += efi.memmap.desc_size; - memcpy(new, old, efi.memmap.desc_size); - md = new; - md->phys_addr = m_start; - md->num_pages = (end - md->phys_addr + 1) >> - EFI_PAGE_SHIFT; - md->attribute |= m_attr; - } - } - } + for (i = 0; i < nr_fake_mem; i++) + efi_fake_memmap_insert(&efi.memmap, new_memmap, &fake_mems[i]); /* swap into new EFI memmap */ early_memunmap(new_memmap, efi.memmap.desc_size * new_nr_map); From 60863c0d1a96b740048cc7d94a2d00d6f89ba3d8 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 29 Feb 2016 20:30:39 +0000 Subject: [PATCH 227/538] efi: Split out EFI memory map functions into new file Also move the functions from the EFI fake mem driver since future patches will require access to the memmap insertion code even if CONFIG_EFI_FAKE_MEM isn't enabled. This will be useful when we need to build custom EFI memory maps to allow drivers to mark regions as reserved. 
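As a rough illustration only (not part of this patch), the relocated helpers are meant to be combined along these lines; the function name, the caller-allocated new_phys buffer and the error handling are all hypothetical placeholders:

	/*
	 * Sketch: tag [addr, addr + size) with EFI_MEMORY_RUNTIME by rebuilding
	 * the memory map into a physical buffer the caller allocated (new_phys).
	 */
	static int __init example_tag_region(phys_addr_t addr, u64 size,
					     phys_addr_t new_phys)
	{
		struct efi_mem_range mr;
		efi_memory_desc_t md;
		int num_entries;
		void *buf;

		if (efi_mem_desc_lookup(addr, &md))
			return -ENOENT;

		mr.range.start = addr;
		mr.range.end = addr + size;
		mr.attribute = md.attribute | EFI_MEMORY_RUNTIME;

		/* A descriptor overlapping the range may split into 2 or 3 pieces. */
		num_entries = efi.memmap.nr_map + efi_memmap_split_count(&md, &mr.range);

		buf = early_memremap(new_phys, num_entries * efi.memmap.desc_size);
		if (!buf)
			return -ENOMEM;

		efi_memmap_insert(&efi.memmap, buf, &mr);
		early_memunmap(buf, num_entries * efi.memmap.desc_size);

		return 0;
	}

The rebuilt map still has to be swapped into efi.memmap; efi_memmap_install(), added by the next patch in this series, takes care of that step.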
Tested-by: Dave Young [kexec/kdump] Tested-by: Ard Biesheuvel [arm] Acked-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Peter Jones Cc: Borislav Petkov Cc: Mark Rutland Cc: Taku Izumi Signed-off-by: Matt Fleming --- drivers/firmware/efi/Makefile | 2 +- drivers/firmware/efi/efi.c | 129 --------------- drivers/firmware/efi/fake_mem.c | 143 +---------------- drivers/firmware/efi/memmap.c | 267 ++++++++++++++++++++++++++++++++ include/linux/efi.h | 10 ++ 5 files changed, 284 insertions(+), 267 deletions(-) create mode 100644 drivers/firmware/efi/memmap.c diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index a219640f881f..b3f5e2adc49f 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -10,7 +10,7 @@ KASAN_SANITIZE_runtime-wrappers.o := n obj-$(CONFIG_EFI) += efi.o vars.o reboot.o memattr.o -obj-$(CONFIG_EFI) += capsule.o +obj-$(CONFIG_EFI) += capsule.o memmap.o obj-$(CONFIG_EFI_VARS) += efivars.o obj-$(CONFIG_EFI_ESRT) += esrt.o obj-$(CONFIG_EFI_VARS_PSTORE) += efi-pstore.o diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 8a5e0db72b8f..d4886fd50c16 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -516,135 +516,6 @@ int __init efi_config_init(efi_config_table_type_t *arch_tables) return ret; } -/** - * __efi_memmap_init - Common code for mapping the EFI memory map - * @data: EFI memory map data - * @late: Use early or late mapping function? - * - * This function takes care of figuring out which function to use to - * map the EFI memory map in efi.memmap based on how far into the boot - * we are. - * - * During bootup @late should be %false since we only have access to - * the early_memremap*() functions as the vmalloc space isn't setup. - * Once the kernel is fully booted we can fallback to the more robust - * memremap*() API. - * - * Returns zero on success, a negative error code on failure. - */ -static int __init -__efi_memmap_init(struct efi_memory_map_data *data, bool late) -{ - struct efi_memory_map map; - phys_addr_t phys_map; - - if (efi_enabled(EFI_PARAVIRT)) - return 0; - - phys_map = data->phys_map; - - if (late) - map.map = memremap(phys_map, data->size, MEMREMAP_WB); - else - map.map = early_memremap(phys_map, data->size); - - if (!map.map) { - pr_err("Could not map the memory map!\n"); - return -ENOMEM; - } - - map.phys_map = data->phys_map; - map.nr_map = data->size / data->desc_size; - map.map_end = map.map + data->size; - - map.desc_version = data->desc_version; - map.desc_size = data->desc_size; - map.late = late; - - set_bit(EFI_MEMMAP, &efi.flags); - - efi.memmap = map; - - return 0; -} - -/** - * efi_memmap_init_early - Map the EFI memory map data structure - * @data: EFI memory map data - * - * Use early_memremap() to map the passed in EFI memory map and assign - * it to efi.memmap. 
- */ -int __init efi_memmap_init_early(struct efi_memory_map_data *data) -{ - /* Cannot go backwards */ - WARN_ON(efi.memmap.late); - - return __efi_memmap_init(data, false); -} - -void __init efi_memmap_unmap(void) -{ - if (!efi.memmap.late) { - unsigned long size; - - size = efi.memmap.desc_size * efi.memmap.nr_map; - early_memunmap(efi.memmap.map, size); - } else { - memunmap(efi.memmap.map); - } - - efi.memmap.map = NULL; - clear_bit(EFI_MEMMAP, &efi.flags); -} - -/** - * efi_memmap_init_late - Map efi.memmap with memremap() - * @phys_addr: Physical address of the new EFI memory map - * @size: Size in bytes of the new EFI memory map - * - * Setup a mapping of the EFI memory map using ioremap_cache(). This - * function should only be called once the vmalloc space has been - * setup and is therefore not suitable for calling during early EFI - * initialise, e.g. in efi_init(). Additionally, it expects - * efi_memmap_init_early() to have already been called. - * - * The reason there are two EFI memmap initialisation - * (efi_memmap_init_early() and this late version) is because the - * early EFI memmap should be explicitly unmapped once EFI - * initialisation is complete as the fixmap space used to map the EFI - * memmap (via early_memremap()) is a scarce resource. - * - * This late mapping is intended to persist for the duration of - * runtime so that things like efi_mem_desc_lookup() and - * efi_mem_attributes() always work. - * - * Returns zero on success, a negative error code on failure. - */ -int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size) -{ - struct efi_memory_map_data data = { - .phys_map = addr, - .size = size, - }; - - /* Did we forget to unmap the early EFI memmap? */ - WARN_ON(efi.memmap.map); - - /* Were we already called? */ - WARN_ON(efi.memmap.late); - - /* - * It makes no sense to allow callers to register different - * values for the following fields. Copy them out of the - * existing early EFI memmap. - */ - data.desc_version = efi.memmap.desc_version; - data.desc_size = efi.memmap.desc_size; - - return __efi_memmap_init(&data, true); -} - #ifdef CONFIG_EFI_VARS_MODULE static int __init efi_load_efivars(void) { diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index 446c669431c0..0054730f9bae 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -35,17 +35,13 @@ #define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM -struct fake_mem { - struct range range; - u64 attribute; -}; -static struct fake_mem fake_mems[EFI_MAX_FAKEMEM]; +static struct efi_mem_range fake_mems[EFI_MAX_FAKEMEM]; static int nr_fake_mem; static int __init cmp_fake_mem(const void *x1, const void *x2) { - const struct fake_mem *m1 = x1; - const struct fake_mem *m2 = x2; + const struct efi_mem_range *m1 = x1; + const struct efi_mem_range *m2 = x2; if (m1->range.start < m2->range.start) return -1; @@ -54,133 +50,6 @@ static int __init cmp_fake_mem(const void *x1, const void *x2) return 0; } -/** - * efi_fake_memmap_split_count - Count number of additional EFI memmap entries - * @md: EFI memory descriptor to split - * @range: Address range (start, end) to split around - * - * Returns the number of additional EFI memmap entries required to - * accomodate @range. 
- */ -static int efi_fake_memmap_split_count(efi_memory_desc_t *md, struct range *range) -{ - u64 m_start, m_end; - u64 start, end; - int count = 0; - - start = md->phys_addr; - end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1; - - /* modifying range */ - m_start = range->start; - m_end = range->end; - - if (m_start <= start) { - /* split into 2 parts */ - if (start < m_end && m_end < end) - count++; - } - - if (start < m_start && m_start < end) { - /* split into 3 parts */ - if (m_end < end) - count += 2; - /* split into 2 parts */ - if (end <= m_end) - count++; - } - - return count; -} - -/** - * efi_fake_memmap_insert - Insert a fake memory region in an EFI memmap - * @old_memmap: The existing EFI memory map structure - * @buf: Address of buffer to store new map - * @mem: Fake memory map entry to insert - * - * It is suggested that you call efi_fake_memmap_split_count() first - * to see how large @buf needs to be. - */ -static void efi_fake_memmap_insert(struct efi_memory_map *old_memmap, - void *buf, struct fake_mem *mem) -{ - u64 m_start, m_end, m_attr; - efi_memory_desc_t *md; - u64 start, end; - void *old, *new; - - /* modifying range */ - m_start = mem->range.start; - m_end = mem->range.end; - m_attr = mem->attribute; - - for (old = old_memmap->map, new = buf; - old < old_memmap->map_end; - old += old_memmap->desc_size, new += old_memmap->desc_size) { - - /* copy original EFI memory descriptor */ - memcpy(new, old, old_memmap->desc_size); - md = new; - start = md->phys_addr; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; - - if (m_start <= start && end <= m_end) - md->attribute |= m_attr; - - if (m_start <= start && - (start < m_end && m_end < end)) { - /* first part */ - md->attribute |= m_attr; - md->num_pages = (m_end - md->phys_addr + 1) >> - EFI_PAGE_SHIFT; - /* latter part */ - new += old_memmap->desc_size; - memcpy(new, old, old_memmap->desc_size); - md = new; - md->phys_addr = m_end + 1; - md->num_pages = (end - md->phys_addr + 1) >> - EFI_PAGE_SHIFT; - } - - if ((start < m_start && m_start < end) && m_end < end) { - /* first part */ - md->num_pages = (m_start - md->phys_addr) >> - EFI_PAGE_SHIFT; - /* middle part */ - new += old_memmap->desc_size; - memcpy(new, old, old_memmap->desc_size); - md = new; - md->attribute |= m_attr; - md->phys_addr = m_start; - md->num_pages = (m_end - m_start + 1) >> - EFI_PAGE_SHIFT; - /* last part */ - new += old_memmap->desc_size; - memcpy(new, old, old_memmap->desc_size); - md = new; - md->phys_addr = m_end + 1; - md->num_pages = (end - m_end) >> - EFI_PAGE_SHIFT; - } - - if ((start < m_start && m_start < end) && - (end <= m_end)) { - /* first part */ - md->num_pages = (m_start - md->phys_addr) >> - EFI_PAGE_SHIFT; - /* latter part */ - new += old_memmap->desc_size; - memcpy(new, old, old_memmap->desc_size); - md = new; - md->phys_addr = m_start; - md->num_pages = (end - md->phys_addr + 1) >> - EFI_PAGE_SHIFT; - md->attribute |= m_attr; - } - } -} - void __init efi_fake_memmap(void) { struct efi_memory_map_data data; @@ -198,7 +67,7 @@ void __init efi_fake_memmap(void) for_each_efi_memory_desc(md) { struct range *r = &fake_mems[i].range; - new_nr_map += efi_fake_memmap_split_count(md, r); + new_nr_map += efi_memmap_split_count(md, r); } } @@ -217,7 +86,7 @@ void __init efi_fake_memmap(void) } for (i = 0; i < nr_fake_mem; i++) - efi_fake_memmap_insert(&efi.memmap, new_memmap, &fake_mems[i]); + efi_memmap_insert(&efi.memmap, new_memmap, &fake_mems[i]); /* swap into new EFI memmap */ early_memunmap(new_memmap, 
efi.memmap.desc_size * new_nr_map); @@ -265,7 +134,7 @@ static int __init setup_fake_mem(char *p) p++; } - sort(fake_mems, nr_fake_mem, sizeof(struct fake_mem), + sort(fake_mems, nr_fake_mem, sizeof(struct efi_mem_range), cmp_fake_mem, NULL); for (i = 0; i < nr_fake_mem; i++) diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c new file mode 100644 index 000000000000..2df7238eb44e --- /dev/null +++ b/drivers/firmware/efi/memmap.c @@ -0,0 +1,267 @@ +/* + * Common EFI memory map functions. + */ + +#define pr_fmt(fmt) "efi: " fmt + +#include +#include +#include +#include +#include + +/** + * __efi_memmap_init - Common code for mapping the EFI memory map + * @data: EFI memory map data + * @late: Use early or late mapping function? + * + * This function takes care of figuring out which function to use to + * map the EFI memory map in efi.memmap based on how far into the boot + * we are. + * + * During bootup @late should be %false since we only have access to + * the early_memremap*() functions as the vmalloc space isn't setup. + * Once the kernel is fully booted we can fallback to the more robust + * memremap*() API. + * + * Returns zero on success, a negative error code on failure. + */ +static int __init +__efi_memmap_init(struct efi_memory_map_data *data, bool late) +{ + struct efi_memory_map map; + phys_addr_t phys_map; + + if (efi_enabled(EFI_PARAVIRT)) + return 0; + + phys_map = data->phys_map; + + if (late) + map.map = memremap(phys_map, data->size, MEMREMAP_WB); + else + map.map = early_memremap(phys_map, data->size); + + if (!map.map) { + pr_err("Could not map the memory map!\n"); + return -ENOMEM; + } + + map.phys_map = data->phys_map; + map.nr_map = data->size / data->desc_size; + map.map_end = map.map + data->size; + + map.desc_version = data->desc_version; + map.desc_size = data->desc_size; + map.late = late; + + set_bit(EFI_MEMMAP, &efi.flags); + + efi.memmap = map; + + return 0; +} + +/** + * efi_memmap_init_early - Map the EFI memory map data structure + * @data: EFI memory map data + * + * Use early_memremap() to map the passed in EFI memory map and assign + * it to efi.memmap. + */ +int __init efi_memmap_init_early(struct efi_memory_map_data *data) +{ + /* Cannot go backwards */ + WARN_ON(efi.memmap.late); + + return __efi_memmap_init(data, false); +} + +void __init efi_memmap_unmap(void) +{ + if (!efi.memmap.late) { + unsigned long size; + + size = efi.memmap.desc_size * efi.memmap.nr_map; + early_memunmap(efi.memmap.map, size); + } else { + memunmap(efi.memmap.map); + } + + efi.memmap.map = NULL; + clear_bit(EFI_MEMMAP, &efi.flags); +} + +/** + * efi_memmap_init_late - Map efi.memmap with memremap() + * @phys_addr: Physical address of the new EFI memory map + * @size: Size in bytes of the new EFI memory map + * + * Setup a mapping of the EFI memory map using ioremap_cache(). This + * function should only be called once the vmalloc space has been + * setup and is therefore not suitable for calling during early EFI + * initialise, e.g. in efi_init(). Additionally, it expects + * efi_memmap_init_early() to have already been called. + * + * The reason there are two EFI memmap initialisation + * (efi_memmap_init_early() and this late version) is because the + * early EFI memmap should be explicitly unmapped once EFI + * initialisation is complete as the fixmap space used to map the EFI + * memmap (via early_memremap()) is a scarce resource. 
+ * + * This late mapping is intended to persist for the duration of + * runtime so that things like efi_mem_desc_lookup() and + * efi_mem_attributes() always work. + * + * Returns zero on success, a negative error code on failure. + */ +int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size) +{ + struct efi_memory_map_data data = { + .phys_map = addr, + .size = size, + }; + + /* Did we forget to unmap the early EFI memmap? */ + WARN_ON(efi.memmap.map); + + /* Were we already called? */ + WARN_ON(efi.memmap.late); + + /* + * It makes no sense to allow callers to register different + * values for the following fields. Copy them out of the + * existing early EFI memmap. + */ + data.desc_version = efi.memmap.desc_version; + data.desc_size = efi.memmap.desc_size; + + return __efi_memmap_init(&data, true); +} + +/** + * efi_memmap_split_count - Count number of additional EFI memmap entries + * @md: EFI memory descriptor to split + * @range: Address range (start, end) to split around + * + * Returns the number of additional EFI memmap entries required to + * accomodate @range. + */ +int __init efi_memmap_split_count(efi_memory_desc_t *md, struct range *range) +{ + u64 m_start, m_end; + u64 start, end; + int count = 0; + + start = md->phys_addr; + end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + /* modifying range */ + m_start = range->start; + m_end = range->end; + + if (m_start <= start) { + /* split into 2 parts */ + if (start < m_end && m_end < end) + count++; + } + + if (start < m_start && m_start < end) { + /* split into 3 parts */ + if (m_end < end) + count += 2; + /* split into 2 parts */ + if (end <= m_end) + count++; + } + + return count; +} + +/** + * efi_memmap_insert - Insert a memory region in an EFI memmap + * @old_memmap: The existing EFI memory map structure + * @buf: Address of buffer to store new map + * @mem: Memory map entry to insert + * + * It is suggested that you call efi_memmap_split_count() first + * to see how large @buf needs to be. 
+ */ +void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf, + struct efi_mem_range *mem) +{ + u64 m_start, m_end, m_attr; + efi_memory_desc_t *md; + u64 start, end; + void *old, *new; + + /* modifying range */ + m_start = mem->range.start; + m_end = mem->range.end; + m_attr = mem->attribute; + + for (old = old_memmap->map, new = buf; + old < old_memmap->map_end; + old += old_memmap->desc_size, new += old_memmap->desc_size) { + + /* copy original EFI memory descriptor */ + memcpy(new, old, old_memmap->desc_size); + md = new; + start = md->phys_addr; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + if (m_start <= start && end <= m_end) + md->attribute |= m_attr; + + if (m_start <= start && + (start < m_end && m_end < end)) { + /* first part */ + md->attribute |= m_attr; + md->num_pages = (m_end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + /* latter part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->phys_addr = m_end + 1; + md->num_pages = (end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + } + + if ((start < m_start && m_start < end) && m_end < end) { + /* first part */ + md->num_pages = (m_start - md->phys_addr) >> + EFI_PAGE_SHIFT; + /* middle part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->attribute |= m_attr; + md->phys_addr = m_start; + md->num_pages = (m_end - m_start + 1) >> + EFI_PAGE_SHIFT; + /* last part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->phys_addr = m_end + 1; + md->num_pages = (end - m_end) >> + EFI_PAGE_SHIFT; + } + + if ((start < m_start && m_start < end) && + (end <= m_end)) { + /* first part */ + md->num_pages = (m_start - md->phys_addr) >> + EFI_PAGE_SHIFT; + /* latter part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->phys_addr = m_start; + md->num_pages = (end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + md->attribute |= m_attr; + } + } +} diff --git a/include/linux/efi.h b/include/linux/efi.h index f149676b2fcd..84c8638c7a8b 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -691,6 +692,11 @@ struct efi_memory_map { bool late; }; +struct efi_mem_range { + struct range range; + u64 attribute; +}; + struct efi_fdt_params { u64 system_table; u64 mmap; @@ -917,6 +923,10 @@ extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); extern int __init efi_memmap_init_early(struct efi_memory_map_data *data); extern int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size); extern void __init efi_memmap_unmap(void); +extern int __init efi_memmap_split_count(efi_memory_desc_t *md, + struct range *range); +extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, + void *buf, struct efi_mem_range *mem); extern int efi_config_init(efi_config_table_type_t *arch_tables); #ifdef CONFIG_EFI_ESRT From c45f4da33a297f85435f8dccb26a24852ea01bb9 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 22 Jun 2016 16:54:00 +0100 Subject: [PATCH 228/538] efi: Add efi_memmap_install() for installing new EFI memory maps While efi_memmap_init_{early,late}() exist for architecture code to install memory maps from firmware data and for the virtual memory regions respectively, drivers don't care which stage of the boot we're at and just want to swap the existing memmap for a modified one. 
efi_memmap_install() abstracts the details of how the new memory map should be mapped and the existing one unmapped. Tested-by: Dave Young [kexec/kdump] Tested-by: Ard Biesheuvel [arm] Acked-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Peter Jones Cc: Borislav Petkov Cc: Mark Rutland Cc: Taku Izumi Signed-off-by: Matt Fleming --- drivers/firmware/efi/fake_mem.c | 8 +------- drivers/firmware/efi/memmap.c | 25 +++++++++++++++++++++++++ include/linux/efi.h | 1 + 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index 0054730f9bae..520a40e5e0e4 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -52,7 +52,6 @@ static int __init cmp_fake_mem(const void *x1, const void *x2) void __init efi_fake_memmap(void) { - struct efi_memory_map_data data; int new_nr_map = efi.memmap.nr_map; efi_memory_desc_t *md; phys_addr_t new_memmap_phy; @@ -90,13 +89,8 @@ void __init efi_fake_memmap(void) /* swap into new EFI memmap */ early_memunmap(new_memmap, efi.memmap.desc_size * new_nr_map); - efi_memmap_unmap(); - data.phys_map = new_memmap_phy; - data.size = efi.memmap.desc_size * new_nr_map; - data.desc_version = efi.memmap.desc_version; - data.desc_size = efi.memmap.desc_size; - efi_memmap_init_early(&data); + efi_memmap_install(new_memmap_phy, new_nr_map); /* print new EFI memmap */ efi_print_memmap(); diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c index 2df7238eb44e..cd96086fd851 100644 --- a/drivers/firmware/efi/memmap.c +++ b/drivers/firmware/efi/memmap.c @@ -139,6 +139,31 @@ int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size) return __efi_memmap_init(&data, true); } +/** + * efi_memmap_install - Install a new EFI memory map in efi.memmap + * @addr: Physical address of the memory map + * @nr_map: Number of entries in the memory map + * + * Unlike efi_memmap_init_*(), this function does not allow the caller + * to switch from early to late mappings. It simply uses the existing + * mapping function and installs the new memmap. + * + * Returns zero on success, a negative error code on failure. 
+ */ +int __init efi_memmap_install(phys_addr_t addr, unsigned int nr_map) +{ + struct efi_memory_map_data data; + + efi_memmap_unmap(); + + data.phys_map = addr; + data.size = efi.memmap.desc_size * nr_map; + data.desc_version = efi.memmap.desc_version; + data.desc_size = efi.memmap.desc_size; + + return __efi_memmap_init(&data, efi.memmap.late); +} + /** * efi_memmap_split_count - Count number of additional EFI memmap entries * @md: EFI memory descriptor to split diff --git a/include/linux/efi.h b/include/linux/efi.h index 84c8638c7a8b..987c18f6fcae 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -923,6 +923,7 @@ extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); extern int __init efi_memmap_init_early(struct efi_memory_map_data *data); extern int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size); extern void __init efi_memmap_unmap(void); +extern int __init efi_memmap_install(phys_addr_t addr, unsigned int nr_map); extern int __init efi_memmap_split_count(efi_memory_desc_t *md, struct range *range); extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, From 816e76129ed5fadd28e526c43397c79775194b5c Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 29 Feb 2016 21:22:52 +0000 Subject: [PATCH 229/538] efi: Allow drivers to reserve boot services forever Today, it is not possible for drivers to reserve EFI boot services for access after efi_free_boot_services() has been called on x86. For ARM/arm64 it can be done simply by calling memblock_reserve(). Having this ability for all three architectures is desirable for a couple of reasons, 1) It saves drivers copying data out of those regions 2) kexec reboot can now make use of things like ESRT Instead of using the standard memblock_reserve() which is insufficient to reserve the region on x86 (see efi_reserve_boot_services()), a new API is introduced in this patch; efi_mem_reserve(). efi.memmap now always represents which EFI memory regions are available. On x86 the EFI boot services regions that have not been reserved via efi_mem_reserve() will be removed from efi.memmap during efi_free_boot_services(). This has implications for kexec, since it is not possible for a newly kexec'd kernel to access the same boot services regions that the initial boot kernel had access to unless they are reserved by every kexec kernel in the chain. Tested-by: Dave Young [kexec/kdump] Tested-by: Ard Biesheuvel [arm] Acked-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Peter Jones Cc: Borislav Petkov Cc: Mark Rutland Signed-off-by: Matt Fleming --- arch/x86/platform/efi/quirks.c | 121 ++++++++++++++++++++++++++++++--- drivers/firmware/efi/efi.c | 30 ++++++++ include/linux/efi.h | 1 + 3 files changed, 141 insertions(+), 11 deletions(-) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 9faf18874692..f14b7a9da24b 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -163,6 +163,71 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size, } EXPORT_SYMBOL_GPL(efi_query_variable_store); +/* + * The UEFI specification makes it clear that the operating system is + * free to do whatever it wants with boot services code after + * ExitBootServices() has been called. Ignoring this recommendation a + * significant bunch of EFI implementations continue calling into boot + * services code (SetVirtualAddressMap). 
In order to work around such + * buggy implementations we reserve boot services region during EFI + * init and make sure it stays executable. Then, after + * SetVirtualAddressMap(), it is discarded. + * + * However, some boot services regions contain data that is required + * by drivers, so we need to track which memory ranges can never be + * freed. This is done by tagging those regions with the + * EFI_MEMORY_RUNTIME attribute. + * + * Any driver that wants to mark a region as reserved must use + * efi_mem_reserve() which will insert a new EFI memory descriptor + * into efi.memmap (splitting existing regions if necessary) and tag + * it with EFI_MEMORY_RUNTIME. + */ +void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) +{ + phys_addr_t new_phys, new_size; + struct efi_mem_range mr; + efi_memory_desc_t md; + int num_entries; + void *new; + + if (efi_mem_desc_lookup(addr, &md)) { + pr_err("Failed to lookup EFI memory descriptor for %pa\n", &addr); + return; + } + + if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) { + pr_err("Region spans EFI memory descriptors, %pa\n", &addr); + return; + } + + mr.range.start = addr; + mr.range.end = addr + size; + mr.attribute = md.attribute | EFI_MEMORY_RUNTIME; + + num_entries = efi_memmap_split_count(&md, &mr.range); + num_entries += efi.memmap.nr_map; + + new_size = efi.memmap.desc_size * num_entries; + + new_phys = memblock_alloc(new_size, 0); + if (!new_phys) { + pr_err("Could not allocate boot services memmap\n"); + return; + } + + new = early_memremap(new_phys, new_size); + if (!new) { + pr_err("Failed to map new boot services memmap\n"); + return; + } + + efi_memmap_insert(&efi.memmap, new, &mr); + early_memunmap(new, new_size); + + efi_memmap_install(new_phys, num_entries); +} + /* * Helper function for efi_reserve_boot_services() to figure out if we * can free regions in efi_free_boot_services(). @@ -184,15 +249,6 @@ static bool can_free_region(u64 start, u64 size) return true; } -/* - * The UEFI specification makes it clear that the operating system is free to do - * whatever it wants with boot services code after ExitBootServices() has been - * called. Ignoring this recommendation a significant bunch of EFI implementations - * continue calling into boot services code (SetVirtualAddressMap). In order to - * work around such buggy implementations we reserve boot services region during - * EFI init and make sure it stays executable. Then, after SetVirtualAddressMap(), it -* is discarded. 
-*/ void __init efi_reserve_boot_services(void) { efi_memory_desc_t *md; @@ -249,7 +305,10 @@ void __init efi_reserve_boot_services(void) void __init efi_free_boot_services(void) { + phys_addr_t new_phys, new_size; efi_memory_desc_t *md; + int num_entries = 0; + void *new, *new_md; for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; @@ -257,12 +316,16 @@ void __init efi_free_boot_services(void) size_t rm_size; if (md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) + md->type != EFI_BOOT_SERVICES_DATA) { + num_entries++; continue; + } /* Do not free, someone else owns it: */ - if (md->attribute & EFI_MEMORY_RUNTIME) + if (md->attribute & EFI_MEMORY_RUNTIME) { + num_entries++; continue; + } /* * Nasty quirk: if all sub-1MB memory is used for boot @@ -286,6 +349,42 @@ void __init efi_free_boot_services(void) free_bootmem_late(start, size); } + + new_size = efi.memmap.desc_size * num_entries; + new_phys = memblock_alloc(new_size, 0); + if (!new_phys) { + pr_err("Failed to allocate new EFI memmap\n"); + return; + } + + new = memremap(new_phys, new_size, MEMREMAP_WB); + if (!new) { + pr_err("Failed to map new EFI memmap\n"); + return; + } + + /* + * Build a new EFI memmap that excludes any boot services + * regions that are not tagged EFI_MEMORY_RUNTIME, since those + * regions have now been freed. + */ + new_md = new; + for_each_efi_memory_desc(md) { + if (!(md->attribute & EFI_MEMORY_RUNTIME) && + (md->type == EFI_BOOT_SERVICES_CODE || + md->type == EFI_BOOT_SERVICES_DATA)) + continue; + + memcpy(new_md, md, efi.memmap.desc_size); + new_md += efi.memmap.desc_size; + } + + memunmap(new); + + if (efi_memmap_install(new_phys, num_entries)) { + pr_err("Could not install new EFI memmap\n"); + return; + } } /* diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index d4886fd50c16..dfe07316cae5 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -396,6 +397,35 @@ u64 __init efi_mem_desc_end(efi_memory_desc_t *md) return end; } +void __init __weak efi_arch_mem_reserve(phys_addr_t addr, u64 size) {} + +/** + * efi_mem_reserve - Reserve an EFI memory region + * @addr: Physical address to reserve + * @size: Size of reservation + * + * Mark a region as reserved from general kernel allocation and + * prevent it being released by efi_free_boot_services(). + * + * This function should be called drivers once they've parsed EFI + * configuration tables to figure out where their data lives, e.g. + * efi_esrt_init(). + */ +void __init efi_mem_reserve(phys_addr_t addr, u64 size) +{ + if (!memblock_is_region_reserved(addr, size)) + memblock_reserve(addr, size); + + /* + * Some architectures (x86) reserve all boot services ranges + * until efi_free_boot_services() because of buggy firmware + * implementations. This means the above memblock_reserve() is + * superfluous on x86 and instead what it needs to do is + * ensure the @start, @size is not freed. 
+ */ + efi_arch_mem_reserve(addr, size); +} + static __initdata efi_config_table_type_t common_tables[] = { {ACPI_20_TABLE_GUID, "ACPI 2.0", &efi.acpi20}, {ACPI_TABLE_GUID, "ACPI", &efi.acpi}, diff --git a/include/linux/efi.h b/include/linux/efi.h index 987c18f6fcae..3fe4f3c47834 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -944,6 +944,7 @@ extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); extern int __init efi_uart_console_only (void); extern u64 efi_mem_desc_end(efi_memory_desc_t *md); extern int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md); +extern void efi_mem_reserve(phys_addr_t addr, u64 size); extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); extern void efi_reserve_boot_services(void); From 31ce8cc68180803aa481c0c1daac29d8eaceca9d Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 1 Mar 2016 23:02:56 +0000 Subject: [PATCH 230/538] efi/runtime-map: Use efi.memmap directly instead of a copy Now that efi.memmap is available all of the time there's no need to allocate and build a separate copy of the EFI memory map. Furthermore, efi.memmap contains boot services regions but only those regions that have been reserved via efi_mem_reserve(). Using efi.memmap allows us to pass boot services across kexec reboot so that the ESRT and BGRT drivers will now work. Tested-by: Dave Young [kexec/kdump] Tested-by: Ard Biesheuvel [arm] Acked-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Peter Jones Cc: Borislav Petkov Cc: Mark Rutland Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 40 ------------------------------ drivers/firmware/efi/runtime-map.c | 35 ++++++++++---------------- include/linux/efi.h | 4 --- 3 files changed, 13 insertions(+), 66 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 33996987ac70..342cebd1e17c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -592,42 +592,6 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) } } -static void __init save_runtime_map(void) -{ -#ifdef CONFIG_KEXEC_CORE - unsigned long desc_size; - efi_memory_desc_t *md; - void *tmp, *q = NULL; - int count = 0; - - if (efi_enabled(EFI_OLD_MEMMAP)) - return; - - desc_size = efi.memmap.desc_size; - - for_each_efi_memory_desc(md) { - if (!(md->attribute & EFI_MEMORY_RUNTIME) || - (md->type == EFI_BOOT_SERVICES_CODE) || - (md->type == EFI_BOOT_SERVICES_DATA)) - continue; - tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL); - if (!tmp) - goto out; - q = tmp; - - memcpy(q + count * desc_size, md, desc_size); - count++; - } - - efi_runtime_map_setup(q, count, desc_size); - return; - -out: - kfree(q); - pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n"); -#endif -} - static void *realloc_pages(void *old_memmap, int old_shift) { void *ret; @@ -840,8 +804,6 @@ static void __init kexec_enter_virtual_mode(void) return; } - save_runtime_map(); - BUG_ON(!efi.systab); num_pages = ALIGN(efi.memmap.nr_map * efi.memmap.desc_size, PAGE_SIZE); @@ -934,8 +896,6 @@ static void __init __efi_enter_virtual_mode(void) return; } - save_runtime_map(); - BUG_ON(!efi.systab); if (efi_setup_page_tables(pa, 1 << pg_shift)) { diff --git a/drivers/firmware/efi/runtime-map.c b/drivers/firmware/efi/runtime-map.c index 5c55227a34c8..8e64b77aeac9 100644 --- a/drivers/firmware/efi/runtime-map.c +++ b/drivers/firmware/efi/runtime-map.c @@ -14,10 +14,6 @@ #include -static void 
*efi_runtime_map; -static int nr_efi_runtime_map; -static u32 efi_memdesc_size; - struct efi_runtime_map_entry { efi_memory_desc_t md; struct kobject kobj; /* kobject for each entry */ @@ -106,7 +102,8 @@ static struct kobj_type __refdata map_ktype = { static struct kset *map_kset; static struct efi_runtime_map_entry * -add_sysfs_runtime_map_entry(struct kobject *kobj, int nr) +add_sysfs_runtime_map_entry(struct kobject *kobj, int nr, + efi_memory_desc_t *md) { int ret; struct efi_runtime_map_entry *entry; @@ -124,8 +121,7 @@ add_sysfs_runtime_map_entry(struct kobject *kobj, int nr) return ERR_PTR(-ENOMEM); } - memcpy(&entry->md, efi_runtime_map + nr * efi_memdesc_size, - sizeof(efi_memory_desc_t)); + memcpy(&entry->md, md, sizeof(efi_memory_desc_t)); kobject_init(&entry->kobj, &map_ktype); entry->kobj.kset = map_kset; @@ -142,12 +138,12 @@ add_sysfs_runtime_map_entry(struct kobject *kobj, int nr) int efi_get_runtime_map_size(void) { - return nr_efi_runtime_map * efi_memdesc_size; + return efi.memmap.nr_map * efi.memmap.desc_size; } int efi_get_runtime_map_desc_size(void) { - return efi_memdesc_size; + return efi.memmap.desc_size; } int efi_runtime_map_copy(void *buf, size_t bufsz) @@ -157,38 +153,33 @@ int efi_runtime_map_copy(void *buf, size_t bufsz) if (sz > bufsz) sz = bufsz; - memcpy(buf, efi_runtime_map, sz); + memcpy(buf, efi.memmap.map, sz); return 0; } -void efi_runtime_map_setup(void *map, int nr_entries, u32 desc_size) -{ - efi_runtime_map = map; - nr_efi_runtime_map = nr_entries; - efi_memdesc_size = desc_size; -} - int __init efi_runtime_map_init(struct kobject *efi_kobj) { int i, j, ret = 0; struct efi_runtime_map_entry *entry; + efi_memory_desc_t *md; - if (!efi_runtime_map) + if (!efi_enabled(EFI_MEMMAP)) return 0; - map_entries = kzalloc(nr_efi_runtime_map * sizeof(entry), GFP_KERNEL); + map_entries = kzalloc(efi.memmap.nr_map * sizeof(entry), GFP_KERNEL); if (!map_entries) { ret = -ENOMEM; goto out; } - for (i = 0; i < nr_efi_runtime_map; i++) { - entry = add_sysfs_runtime_map_entry(efi_kobj, i); + i = 0; + for_each_efi_memory_desc(md) { + entry = add_sysfs_runtime_map_entry(efi_kobj, i, md); if (IS_ERR(entry)) { ret = PTR_ERR(entry); goto out_add_entry; } - *(map_entries + i) = entry; + *(map_entries + i++) = entry; } return 0; diff --git a/include/linux/efi.h b/include/linux/efi.h index 3fe4f3c47834..d8b555db81c7 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1357,7 +1357,6 @@ extern int efi_capsule_update(efi_capsule_header_t *capsule, #ifdef CONFIG_EFI_RUNTIME_MAP int efi_runtime_map_init(struct kobject *); -void efi_runtime_map_setup(void *, int, u32); int efi_get_runtime_map_size(void); int efi_get_runtime_map_desc_size(void); int efi_runtime_map_copy(void *buf, size_t bufsz); @@ -1367,9 +1366,6 @@ static inline int efi_runtime_map_init(struct kobject *kobj) return 0; } -static inline void -efi_runtime_map_setup(void *map, int nr_entries, u32 desc_size) {} - static inline int efi_get_runtime_map_size(void) { return 0; From 8e80632fb23f021ce5a6957f2edcdae4645a7030 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 1 Mar 2016 23:08:03 +0000 Subject: [PATCH 231/538] efi/esrt: Use efi_mem_reserve() and avoid a kmalloc() We can use the new efi_mem_reserve() API to mark the ESRT table as reserved forever and save ourselves the trouble of copying the data out into a kmalloc buffer. The added advantage is that now the ESRT driver will work across kexec reboot. 
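The driver-side pattern this enables is roughly the following sketch (not the actual hunks; table_phys and table_size stand for whatever the driver parsed out of the EFI configuration tables earlier):

	void *table;

	/* Boot time: pin the firmware table so efi_free_boot_services() keeps it. */
	efi_mem_reserve(table_phys, table_size);

	/* Later, e.g. from an initcall: map the table in place, no copy required. */
	table = memremap(table_phys, table_size, MEMREMAP_WB);
	if (!table)
		return -ENOMEM;

This patch itself still maps the ESRT with ioremap(); a follow-up patch in this series switches that to memremap(), which is the more appropriate API for ordinary RAM.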
Tested-by: Dave Young [kexec/kdump] Tested-by: Ard Biesheuvel [arm] Acked-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Peter Jones Cc: Borislav Petkov Cc: Mark Rutland Signed-off-by: Matt Fleming --- drivers/firmware/efi/esrt.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c index 75feb3f5829b..b93cd11f9bcc 100644 --- a/drivers/firmware/efi/esrt.c +++ b/drivers/firmware/efi/esrt.c @@ -235,7 +235,7 @@ static struct attribute_group esrt_attr_group = { }; /* - * remap the table, copy it to kmalloced pages, and unmap it. + * remap the table, validate it, mark it reserved and unmap it. */ void __init efi_esrt_init(void) { @@ -335,7 +335,7 @@ void __init efi_esrt_init(void) end = esrt_data + size; pr_info("Reserving ESRT space from %pa to %pa.\n", &esrt_data, &end); - memblock_reserve(esrt_data, esrt_data_size); + efi_mem_reserve(esrt_data, esrt_data_size); pr_debug("esrt-init: loaded.\n"); err_memunmap: @@ -382,28 +382,18 @@ static void cleanup_entry_list(void) static int __init esrt_sysfs_init(void) { int error; - struct efi_system_resource_table __iomem *ioesrt; pr_debug("esrt-sysfs: loading.\n"); if (!esrt_data || !esrt_data_size) return -ENOSYS; - ioesrt = ioremap(esrt_data, esrt_data_size); - if (!ioesrt) { + esrt = ioremap(esrt_data, esrt_data_size); + if (!esrt) { pr_err("ioremap(%pa, %zu) failed.\n", &esrt_data, esrt_data_size); return -ENOMEM; } - esrt = kmalloc(esrt_data_size, GFP_KERNEL); - if (!esrt) { - pr_err("kmalloc failed. (wanted %zu bytes)\n", esrt_data_size); - iounmap(ioesrt); - return -ENOMEM; - } - - memcpy_fromio(esrt, ioesrt, esrt_data_size); - esrt_kobj = kobject_create_and_add("esrt", efi_kobj); if (!esrt_kobj) { pr_err("Firmware table registration failed.\n"); @@ -429,8 +419,6 @@ static int __init esrt_sysfs_init(void) if (error) goto err_cleanup_list; - memblock_remove(esrt_data, esrt_data_size); - pr_debug("esrt-sysfs: loaded.\n"); return 0; From 4bc9f92e64c81192dcca1c495354bcc7c3b43e7d Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 23 Jun 2016 11:36:32 +0100 Subject: [PATCH 232/538] x86/efi-bgrt: Use efi_mem_reserve() to avoid copying image data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit efi_mem_reserve() allows us to permanently mark EFI boot services regions as reserved, which means we no longer need to copy the image data out and into a separate buffer. Leaving the data in the original boot services region has the added benefit that BGRT images can now be passed across kexec reboot. 
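Both the ESRT change above and this BGRT change follow the same pattern: rather than copying firmware data out of an EFI boot services region into a kmalloc() buffer, the region is reserved for the lifetime of the kernel with efi_mem_reserve() and the data is then mapped in place with memremap(). The following is only a minimal sketch of that pattern; the function name is illustrative and the table parsing and error handling of the real drivers are omitted:

/*
 * Sketch: keep a firmware table in its original boot services region
 * instead of copying it into an allocated kernel buffer.
 */
static void __init example_fw_table_init(phys_addr_t table_pa, size_t size)
{
	void *virt;

	/* Reserve the region forever, so it also survives kexec reboot. */
	efi_mem_reserve(table_pa, size);

	/* Map the original data in place; no copy into kmalloc() memory. */
	virt = memremap(table_pa, size, MEMREMAP_WB);
	if (!virt)
		return;

	/* 'virt' now points at the firmware data for the kernel's lifetime. */
}
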
Reviewed-by: Josh Triplett Tested-by: Dave Young [kexec/kdump] Tested-by: Ard Biesheuvel [arm] Acked-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Peter Jones Cc: Borislav Petkov Cc: Mark Rutland Cc: Josh Boyer Cc: Andy Lutomirski Cc: Môshe van der Sterre Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi-bgrt.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index 6a2f5691b1ab..6aad870e8962 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -82,21 +82,12 @@ void __init efi_bgrt_init(void) } bgrt_image_size = bmp_header.size; - bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN); + bgrt_image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); if (!bgrt_image) { - pr_notice("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n", - bgrt_image_size); - return; - } - - image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); - if (!image) { pr_notice("Ignoring BGRT: failed to map image memory\n"); - kfree(bgrt_image); bgrt_image = NULL; return; } - memcpy(bgrt_image, image, bgrt_image_size); - memunmap(image); + efi_mem_reserve(bgrt_tab->image_address, bgrt_image_size); } From f58a37b2e01f91c23af457a7662f6b5a1e9f41e0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jul 2016 21:00:45 +0200 Subject: [PATCH 233/538] efi/esrt: Use memremap not ioremap to access ESRT table in memory On ARM and arm64, ioremap() and memremap() are not interchangeable like on x86, and the use of ioremap() on ordinary RAM is typically flagged as an error if the memory region being mapped is also covered by the linear mapping, since that would lead to aliases with conflicting cacheability attributes. Since what we are dealing with is not an I/O region with side effects, using ioremap() here is arguably incorrect anyway, so let's replace it with memremap() instead. Acked-by: Peter Jones Signed-off-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Mark Rutland Signed-off-by: Matt Fleming --- drivers/firmware/efi/esrt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c index b93cd11f9bcc..14914074f716 100644 --- a/drivers/firmware/efi/esrt.c +++ b/drivers/firmware/efi/esrt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -387,9 +388,9 @@ static int __init esrt_sysfs_init(void) if (!esrt_data || !esrt_data_size) return -ENOSYS; - esrt = ioremap(esrt_data, esrt_data_size); + esrt = memremap(esrt_data, esrt_data_size, MEMREMAP_WB); if (!esrt) { - pr_err("ioremap(%pa, %zu) failed.\n", &esrt_data, + pr_err("memremap(%pa, %zu) failed.\n", &esrt_data, esrt_data_size); return -ENOMEM; } From 2ead3084e3fc37d42f379cca8753b458d8f9ba25 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jul 2016 21:00:46 +0200 Subject: [PATCH 234/538] efi/arm*: esrt: Add missing call to efi_esrt_init() ESRT support is built by default for all architectures that define CONFIG_EFI. However, this support was not wired up yet for ARM/arm64, since efi_esrt_init() was never called. So add the missing call. 
Signed-off-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Mark Rutland Cc: Peter Jones Signed-off-by: Matt Fleming --- drivers/firmware/efi/arm-init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 5a2df3fefccc..e0a511d4074f 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -247,6 +247,7 @@ void __init efi_init(void) reserve_regions(); efi_memattr_init(); + efi_esrt_init(); efi_memmap_unmap(); memblock_reserve(params.mmap & PAGE_MASK, From 217b27d4671a0a3f34147f1b341683d36b7457db Mon Sep 17 00:00:00 2001 From: Sylvain Chouleur Date: Fri, 15 Jul 2016 21:36:29 +0200 Subject: [PATCH 235/538] efi: Use a file local lock for efivars This patch replaces the spinlock in the efivars struct with a single lock for the whole vars.c file. The goal of this lock is to protect concurrent calls to efi variable services, registering and unregistering. This allows us to register new efivars operations without having in-progress call. Signed-off-by: Sylvain Chouleur Signed-off-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Mark Rutland Cc: Sylvain Chouleur Signed-off-by: Matt Fleming --- drivers/firmware/efi/vars.c | 83 +++++++++++++++++++++---------------- include/linux/efi.h | 6 --- 2 files changed, 47 insertions(+), 42 deletions(-) diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index d3b751383286..d0d807e1287e 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -37,6 +37,14 @@ /* Private pointer to registered efivars */ static struct efivars *__efivars; +/* + * efivars_lock protects three things: + * 1) efivarfs_list and efivars_sysfs_list + * 2) ->ops calls + * 3) (un)registration of __efivars + */ +static DEFINE_SPINLOCK(efivars_lock); + static bool efivar_wq_enabled = true; DECLARE_WORK(efivar_work, NULL); EXPORT_SYMBOL_GPL(efivar_work); @@ -434,7 +442,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), return -ENOMEM; } - spin_lock_irq(&__efivars->lock); + spin_lock_irq(&efivars_lock); /* * Per EFI spec, the maximum storage allocated for both @@ -450,7 +458,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), switch (status) { case EFI_SUCCESS: if (duplicates) - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); variable_name_size = var_name_strnsize(variable_name, variable_name_size); @@ -477,7 +485,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), } if (duplicates) - spin_lock_irq(&__efivars->lock); + spin_lock_irq(&efivars_lock); break; case EFI_NOT_FOUND: @@ -491,7 +499,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), } while (status != EFI_NOT_FOUND); - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); kfree(variable_name); @@ -506,9 +514,9 @@ EXPORT_SYMBOL_GPL(efivar_init); */ void efivar_entry_add(struct efivar_entry *entry, struct list_head *head) { - spin_lock_irq(&__efivars->lock); + spin_lock_irq(&efivars_lock); list_add(&entry->list, head); - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); } EXPORT_SYMBOL_GPL(efivar_entry_add); @@ -518,9 +526,9 @@ EXPORT_SYMBOL_GPL(efivar_entry_add); */ void efivar_entry_remove(struct efivar_entry *entry) { - spin_lock_irq(&__efivars->lock); + spin_lock_irq(&efivars_lock); list_del(&entry->list); - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); } EXPORT_SYMBOL_GPL(efivar_entry_remove); @@ -537,10 
+545,10 @@ EXPORT_SYMBOL_GPL(efivar_entry_remove); */ static void efivar_entry_list_del_unlock(struct efivar_entry *entry) { - lockdep_assert_held(&__efivars->lock); + lockdep_assert_held(&efivars_lock); list_del(&entry->list); - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); } /** @@ -563,7 +571,7 @@ int __efivar_entry_delete(struct efivar_entry *entry) const struct efivar_operations *ops = __efivars->ops; efi_status_t status; - lockdep_assert_held(&__efivars->lock); + lockdep_assert_held(&efivars_lock); status = ops->set_variable(entry->var.VariableName, &entry->var.VendorGuid, @@ -589,12 +597,12 @@ int efivar_entry_delete(struct efivar_entry *entry) const struct efivar_operations *ops = __efivars->ops; efi_status_t status; - spin_lock_irq(&__efivars->lock); + spin_lock_irq(&efivars_lock); status = ops->set_variable(entry->var.VariableName, &entry->var.VendorGuid, 0, 0, NULL); if (!(status == EFI_SUCCESS || status == EFI_NOT_FOUND)) { - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); return efi_status_to_err(status); } @@ -632,10 +640,10 @@ int efivar_entry_set(struct efivar_entry *entry, u32 attributes, efi_char16_t *name = entry->var.VariableName; efi_guid_t vendor = entry->var.VendorGuid; - spin_lock_irq(&__efivars->lock); + spin_lock_irq(&efivars_lock); if (head && efivar_entry_find(name, vendor, head, false)) { - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); return -EEXIST; } @@ -644,7 +652,7 @@ int efivar_entry_set(struct efivar_entry *entry, u32 attributes, status = ops->set_variable(name, &vendor, attributes, size, data); - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); return efi_status_to_err(status); @@ -658,7 +666,7 @@ EXPORT_SYMBOL_GPL(efivar_entry_set); * from crash/panic handlers. * * Crucially, this function will not block if it cannot acquire - * __efivars->lock. Instead, it returns -EBUSY. + * efivars_lock. Instead, it returns -EBUSY. 
*/ static int efivar_entry_set_nonblocking(efi_char16_t *name, efi_guid_t vendor, @@ -668,20 +676,20 @@ efivar_entry_set_nonblocking(efi_char16_t *name, efi_guid_t vendor, unsigned long flags; efi_status_t status; - if (!spin_trylock_irqsave(&__efivars->lock, flags)) + if (!spin_trylock_irqsave(&efivars_lock, flags)) return -EBUSY; status = check_var_size_nonblocking(attributes, size + ucs2_strsize(name, 1024)); if (status != EFI_SUCCESS) { - spin_unlock_irqrestore(&__efivars->lock, flags); + spin_unlock_irqrestore(&efivars_lock, flags); return -ENOSPC; } status = ops->set_variable_nonblocking(name, &vendor, attributes, size, data); - spin_unlock_irqrestore(&__efivars->lock, flags); + spin_unlock_irqrestore(&efivars_lock, flags); return efi_status_to_err(status); } @@ -727,21 +735,21 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, size, data); if (!block) { - if (!spin_trylock_irqsave(&__efivars->lock, flags)) + if (!spin_trylock_irqsave(&efivars_lock, flags)) return -EBUSY; } else { - spin_lock_irqsave(&__efivars->lock, flags); + spin_lock_irqsave(&efivars_lock, flags); } status = check_var_size(attributes, size + ucs2_strsize(name, 1024)); if (status != EFI_SUCCESS) { - spin_unlock_irqrestore(&__efivars->lock, flags); + spin_unlock_irqrestore(&efivars_lock, flags); return -ENOSPC; } status = ops->set_variable(name, &vendor, attributes, size, data); - spin_unlock_irqrestore(&__efivars->lock, flags); + spin_unlock_irqrestore(&efivars_lock, flags); return efi_status_to_err(status); } @@ -771,7 +779,7 @@ struct efivar_entry *efivar_entry_find(efi_char16_t *name, efi_guid_t guid, int strsize1, strsize2; bool found = false; - lockdep_assert_held(&__efivars->lock); + lockdep_assert_held(&efivars_lock); list_for_each_entry_safe(entry, n, head, list) { strsize1 = ucs2_strsize(name, 1024); @@ -814,10 +822,10 @@ int efivar_entry_size(struct efivar_entry *entry, unsigned long *size) *size = 0; - spin_lock_irq(&__efivars->lock); + spin_lock_irq(&efivars_lock); status = ops->get_variable(entry->var.VariableName, &entry->var.VendorGuid, NULL, size, NULL); - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); if (status != EFI_BUFFER_TOO_SMALL) return efi_status_to_err(status); @@ -843,7 +851,7 @@ int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes, const struct efivar_operations *ops = __efivars->ops; efi_status_t status; - lockdep_assert_held(&__efivars->lock); + lockdep_assert_held(&efivars_lock); status = ops->get_variable(entry->var.VariableName, &entry->var.VendorGuid, @@ -866,11 +874,11 @@ int efivar_entry_get(struct efivar_entry *entry, u32 *attributes, const struct efivar_operations *ops = __efivars->ops; efi_status_t status; - spin_lock_irq(&__efivars->lock); + spin_lock_irq(&efivars_lock); status = ops->get_variable(entry->var.VariableName, &entry->var.VendorGuid, attributes, size, data); - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); return efi_status_to_err(status); } @@ -917,7 +925,7 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, * set_variable call, and removal of the variable from the efivars * list (in the case of an authenticated delete). 
*/ - spin_lock_irq(&__efivars->lock); + spin_lock_irq(&efivars_lock); /* * Ensure that the available space hasn't shrunk below the safe level @@ -957,7 +965,7 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, if (status == EFI_NOT_FOUND) efivar_entry_list_del_unlock(entry); else - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); if (status && status != EFI_BUFFER_TOO_SMALL) return efi_status_to_err(status); @@ -965,7 +973,7 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, return 0; out: - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); return err; } @@ -980,7 +988,7 @@ EXPORT_SYMBOL_GPL(efivar_entry_set_get_size); */ void efivar_entry_iter_begin(void) { - spin_lock_irq(&__efivars->lock); + spin_lock_irq(&efivars_lock); } EXPORT_SYMBOL_GPL(efivar_entry_iter_begin); @@ -991,7 +999,7 @@ EXPORT_SYMBOL_GPL(efivar_entry_iter_begin); */ void efivar_entry_iter_end(void) { - spin_unlock_irq(&__efivars->lock); + spin_unlock_irq(&efivars_lock); } EXPORT_SYMBOL_GPL(efivar_entry_iter_end); @@ -1112,11 +1120,12 @@ int efivars_register(struct efivars *efivars, const struct efivar_operations *ops, struct kobject *kobject) { - spin_lock_init(&efivars->lock); + spin_lock_irq(&efivars_lock); efivars->ops = ops; efivars->kobject = kobject; __efivars = efivars; + spin_unlock_irq(&efivars_lock); return 0; } @@ -1133,6 +1142,7 @@ int efivars_unregister(struct efivars *efivars) { int rv; + spin_lock_irq(&efivars_lock); if (!__efivars) { printk(KERN_ERR "efivars not registered\n"); rv = -EINVAL; @@ -1148,6 +1158,7 @@ int efivars_unregister(struct efivars *efivars) rv = 0; out: + spin_unlock_irq(&efivars_lock); return rv; } EXPORT_SYMBOL_GPL(efivars_unregister); diff --git a/include/linux/efi.h b/include/linux/efi.h index d8b555db81c7..deecb2902715 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1157,12 +1157,6 @@ struct efivar_operations { }; struct efivars { - /* - * ->lock protects two things: - * 1) efivarfs_list and efivars_sysfs_list - * 2) ->ops calls - */ - spinlock_t lock; struct kset *kset; struct kobject *kobject; const struct efivar_operations *ops; From 21b3ddd39feecd2f4d6c52bcd30f0a4fa14f125a Mon Sep 17 00:00:00 2001 From: Sylvain Chouleur Date: Fri, 15 Jul 2016 21:36:30 +0200 Subject: [PATCH 236/538] efi: Don't use spinlocks for efi vars All efivars operations are protected by a spinlock which prevents interruptions and preemption. This is too restricted, we just need a lock preventing concurrency. The idea is to use a semaphore of count 1 and to have two ways of locking, depending on the context: - In interrupt context, we call down_trylock(), if it fails we return an error - In normal context, we call down_interruptible() We don't use a mutex here because the mutex_trylock() function must not be called from interrupt context, whereas the down_trylock() can. 
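The same pattern appears throughout the conversion below: process-context callers take the lock with down_interruptible() and return -EINTR if interrupted by a signal, while callers that may run in atomic context use down_trylock() and return -EBUSY if the lock is contended. A rough, self-contained sketch of the idea (the names example_lock and example_op_* are illustrative, not the actual efivars code):

static DEFINE_SEMAPHORE(example_lock);	/* binary semaphore, count 1 */

/* Process context: may sleep while waiting, but can be interrupted. */
static int example_op(void)
{
	if (down_interruptible(&example_lock))
		return -EINTR;
	/* ... call the firmware service ... */
	up(&example_lock);
	return 0;
}

/* Atomic/interrupt context: must not sleep, so only try the lock once. */
static int example_op_nonblocking(void)
{
	if (down_trylock(&example_lock))
		return -EBUSY;
	/* ... call the nonblocking firmware service ... */
	up(&example_lock);
	return 0;
}
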
Signed-off-by: Sylvain Chouleur Signed-off-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Mark Rutland Cc: Sylvain Chouleur Signed-off-by: Matt Fleming --- drivers/firmware/efi/efi-pstore.c | 36 ++++++-- drivers/firmware/efi/efivars.c | 22 ++++- drivers/firmware/efi/vars.c | 137 +++++++++++++++++------------- fs/efivarfs/inode.c | 5 +- fs/efivarfs/super.c | 9 +- include/linux/efi.h | 6 +- 6 files changed, 139 insertions(+), 76 deletions(-) diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index 30a24d09ea6c..1c33d7469e4a 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -125,16 +125,19 @@ static void efi_pstore_scan_sysfs_enter(struct efivar_entry *pos, * @entry: deleting entry * @turn_off_scanning: Check if a scanning flag should be turned off */ -static inline void __efi_pstore_scan_sysfs_exit(struct efivar_entry *entry, +static inline int __efi_pstore_scan_sysfs_exit(struct efivar_entry *entry, bool turn_off_scanning) { if (entry->deleting) { list_del(&entry->list); efivar_entry_iter_end(); efivar_unregister(entry); - efivar_entry_iter_begin(); + if (efivar_entry_iter_begin()) + return -EINTR; } else if (turn_off_scanning) entry->scanning = false; + + return 0; } /** @@ -144,13 +147,18 @@ static inline void __efi_pstore_scan_sysfs_exit(struct efivar_entry *entry, * @head: list head * @stop: a flag checking if scanning will stop */ -static void efi_pstore_scan_sysfs_exit(struct efivar_entry *pos, +static int efi_pstore_scan_sysfs_exit(struct efivar_entry *pos, struct efivar_entry *next, struct list_head *head, bool stop) { - __efi_pstore_scan_sysfs_exit(pos, true); + int ret = __efi_pstore_scan_sysfs_exit(pos, true); + + if (ret) + return ret; + if (stop) - __efi_pstore_scan_sysfs_exit(next, &next->list != head); + ret = __efi_pstore_scan_sysfs_exit(next, &next->list != head); + return ret; } /** @@ -172,13 +180,17 @@ static int efi_pstore_sysfs_entry_iter(void *data, struct efivar_entry **pos) struct efivar_entry *entry, *n; struct list_head *head = &efivar_sysfs_list; int size = 0; + int ret; if (!*pos) { list_for_each_entry_safe(entry, n, head, list) { efi_pstore_scan_sysfs_enter(entry, n, head); size = efi_pstore_read_func(entry, data); - efi_pstore_scan_sysfs_exit(entry, n, head, size < 0); + ret = efi_pstore_scan_sysfs_exit(entry, n, head, + size < 0); + if (ret) + return ret; if (size) break; } @@ -190,7 +202,9 @@ static int efi_pstore_sysfs_entry_iter(void *data, struct efivar_entry **pos) efi_pstore_scan_sysfs_enter((*pos), n, head); size = efi_pstore_read_func((*pos), data); - efi_pstore_scan_sysfs_exit((*pos), n, head, size < 0); + ret = efi_pstore_scan_sysfs_exit((*pos), n, head, size < 0); + if (ret) + return ret; if (size) break; } @@ -232,7 +246,10 @@ static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type, if (!*data.buf) return -ENOMEM; - efivar_entry_iter_begin(); + if (efivar_entry_iter_begin()) { + kfree(*data.buf); + return -EINTR; + } size = efi_pstore_sysfs_entry_iter(&data, (struct efivar_entry **)&psi->data); efivar_entry_iter_end(); @@ -347,7 +364,8 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count, edata.time = time; edata.name = efi_name; - efivar_entry_iter_begin(); + if (efivar_entry_iter_begin()) + return -EINTR; found = __efivar_entry_iter(efi_pstore_erase_func, &efivar_sysfs_list, &edata, &entry); if (found && !entry->scanning) { diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c index 116b244dee68..3e626fd9bd4e 100644 --- 
a/drivers/firmware/efi/efivars.c +++ b/drivers/firmware/efi/efivars.c @@ -510,7 +510,8 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj, vendor = del_var->VendorGuid; } - efivar_entry_iter_begin(); + if (efivar_entry_iter_begin()) + return -EINTR; entry = efivar_entry_find(name, vendor, &efivar_sysfs_list, true); if (!entry) err = -EINVAL; @@ -575,7 +576,10 @@ efivar_create_sysfs_entry(struct efivar_entry *new_var) return ret; kobject_uevent(&new_var->kobj, KOBJ_ADD); - efivar_entry_add(new_var, &efivar_sysfs_list); + if (efivar_entry_add(new_var, &efivar_sysfs_list)) { + efivar_unregister(new_var); + return -EINTR; + } return 0; } @@ -690,7 +694,10 @@ static int efivars_sysfs_callback(efi_char16_t *name, efi_guid_t vendor, static int efivar_sysfs_destroy(struct efivar_entry *entry, void *data) { - efivar_entry_remove(entry); + int err = efivar_entry_remove(entry); + + if (err) + return err; efivar_unregister(entry); return 0; } @@ -698,7 +705,14 @@ static int efivar_sysfs_destroy(struct efivar_entry *entry, void *data) static void efivars_sysfs_exit(void) { /* Remove all entries and destroy */ - __efivar_entry_iter(efivar_sysfs_destroy, &efivar_sysfs_list, NULL, NULL); + int err; + + err = __efivar_entry_iter(efivar_sysfs_destroy, &efivar_sysfs_list, + NULL, NULL); + if (err) { + pr_err("efivars: Failed to destroy sysfs entries\n"); + return; + } if (efivars_new_var) sysfs_remove_bin_file(&efivars_kset->kobj, efivars_new_var); diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index d0d807e1287e..9336ffdf6e2c 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -43,7 +43,7 @@ static struct efivars *__efivars; * 2) ->ops calls * 3) (un)registration of __efivars */ -static DEFINE_SPINLOCK(efivars_lock); +static DEFINE_SEMAPHORE(efivars_lock); static bool efivar_wq_enabled = true; DECLARE_WORK(efivar_work, NULL); @@ -442,7 +442,10 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), return -ENOMEM; } - spin_lock_irq(&efivars_lock); + if (down_interruptible(&efivars_lock)) { + err = -EINTR; + goto free; + } /* * Per EFI spec, the maximum storage allocated for both @@ -458,7 +461,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), switch (status) { case EFI_SUCCESS: if (duplicates) - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); variable_name_size = var_name_strnsize(variable_name, variable_name_size); @@ -484,8 +487,12 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), status = EFI_NOT_FOUND; } - if (duplicates) - spin_lock_irq(&efivars_lock); + if (duplicates) { + if (down_interruptible(&efivars_lock)) { + err = -EINTR; + goto free; + } + } break; case EFI_NOT_FOUND: @@ -499,8 +506,8 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), } while (status != EFI_NOT_FOUND); - spin_unlock_irq(&efivars_lock); - + up(&efivars_lock); +free: kfree(variable_name); return err; @@ -511,24 +518,34 @@ EXPORT_SYMBOL_GPL(efivar_init); * efivar_entry_add - add entry to variable list * @entry: entry to add to list * @head: list head + * + * Returns 0 on success, or a kernel error code on failure. 
*/ -void efivar_entry_add(struct efivar_entry *entry, struct list_head *head) +int efivar_entry_add(struct efivar_entry *entry, struct list_head *head) { - spin_lock_irq(&efivars_lock); + if (down_interruptible(&efivars_lock)) + return -EINTR; list_add(&entry->list, head); - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); + + return 0; } EXPORT_SYMBOL_GPL(efivar_entry_add); /** * efivar_entry_remove - remove entry from variable list * @entry: entry to remove from list + * + * Returns 0 on success, or a kernel error code on failure. */ -void efivar_entry_remove(struct efivar_entry *entry) +int efivar_entry_remove(struct efivar_entry *entry) { - spin_lock_irq(&efivars_lock); + if (down_interruptible(&efivars_lock)) + return -EINTR; list_del(&entry->list); - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); + + return 0; } EXPORT_SYMBOL_GPL(efivar_entry_remove); @@ -545,10 +562,8 @@ EXPORT_SYMBOL_GPL(efivar_entry_remove); */ static void efivar_entry_list_del_unlock(struct efivar_entry *entry) { - lockdep_assert_held(&efivars_lock); - list_del(&entry->list); - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); } /** @@ -571,8 +586,6 @@ int __efivar_entry_delete(struct efivar_entry *entry) const struct efivar_operations *ops = __efivars->ops; efi_status_t status; - lockdep_assert_held(&efivars_lock); - status = ops->set_variable(entry->var.VariableName, &entry->var.VendorGuid, 0, 0, NULL); @@ -589,20 +602,22 @@ EXPORT_SYMBOL_GPL(__efivar_entry_delete); * variable list. It is the caller's responsibility to free @entry * once we return. * - * Returns 0 on success, or a converted EFI status code if - * set_variable() fails. + * Returns 0 on success, -EINTR if we can't grab the semaphore, + * converted EFI status code if set_variable() fails. */ int efivar_entry_delete(struct efivar_entry *entry) { const struct efivar_operations *ops = __efivars->ops; efi_status_t status; - spin_lock_irq(&efivars_lock); + if (down_interruptible(&efivars_lock)) + return -EINTR; + status = ops->set_variable(entry->var.VariableName, &entry->var.VendorGuid, 0, 0, NULL); if (!(status == EFI_SUCCESS || status == EFI_NOT_FOUND)) { - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); return efi_status_to_err(status); } @@ -628,9 +643,9 @@ EXPORT_SYMBOL_GPL(efivar_entry_delete); * If @head is not NULL a lookup is performed to determine whether * the entry is already on the list. * - * Returns 0 on success, -EEXIST if a lookup is performed and the entry - * already exists on the list, or a converted EFI status code if - * set_variable() fails. + * Returns 0 on success, -EINTR if we can't grab the semaphore, + * -EEXIST if a lookup is performed and the entry already exists on + * the list, or a converted EFI status code if set_variable() fails. 
*/ int efivar_entry_set(struct efivar_entry *entry, u32 attributes, unsigned long size, void *data, struct list_head *head) @@ -640,10 +655,10 @@ int efivar_entry_set(struct efivar_entry *entry, u32 attributes, efi_char16_t *name = entry->var.VariableName; efi_guid_t vendor = entry->var.VendorGuid; - spin_lock_irq(&efivars_lock); - + if (down_interruptible(&efivars_lock)) + return -EINTR; if (head && efivar_entry_find(name, vendor, head, false)) { - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); return -EEXIST; } @@ -652,7 +667,7 @@ int efivar_entry_set(struct efivar_entry *entry, u32 attributes, status = ops->set_variable(name, &vendor, attributes, size, data); - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); return efi_status_to_err(status); @@ -673,23 +688,22 @@ efivar_entry_set_nonblocking(efi_char16_t *name, efi_guid_t vendor, u32 attributes, unsigned long size, void *data) { const struct efivar_operations *ops = __efivars->ops; - unsigned long flags; efi_status_t status; - if (!spin_trylock_irqsave(&efivars_lock, flags)) + if (down_trylock(&efivars_lock)) return -EBUSY; status = check_var_size_nonblocking(attributes, size + ucs2_strsize(name, 1024)); if (status != EFI_SUCCESS) { - spin_unlock_irqrestore(&efivars_lock, flags); + up(&efivars_lock); return -ENOSPC; } status = ops->set_variable_nonblocking(name, &vendor, attributes, size, data); - spin_unlock_irqrestore(&efivars_lock, flags); + up(&efivars_lock); return efi_status_to_err(status); } @@ -714,7 +728,6 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, bool block, unsigned long size, void *data) { const struct efivar_operations *ops = __efivars->ops; - unsigned long flags; efi_status_t status; if (!ops->query_variable_store) @@ -735,21 +748,22 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, size, data); if (!block) { - if (!spin_trylock_irqsave(&efivars_lock, flags)) + if (down_trylock(&efivars_lock)) return -EBUSY; } else { - spin_lock_irqsave(&efivars_lock, flags); + if (down_interruptible(&efivars_lock)) + return -EINTR; } status = check_var_size(attributes, size + ucs2_strsize(name, 1024)); if (status != EFI_SUCCESS) { - spin_unlock_irqrestore(&efivars_lock, flags); + up(&efivars_lock); return -ENOSPC; } status = ops->set_variable(name, &vendor, attributes, size, data); - spin_unlock_irqrestore(&efivars_lock, flags); + up(&efivars_lock); return efi_status_to_err(status); } @@ -779,8 +793,6 @@ struct efivar_entry *efivar_entry_find(efi_char16_t *name, efi_guid_t guid, int strsize1, strsize2; bool found = false; - lockdep_assert_held(&efivars_lock); - list_for_each_entry_safe(entry, n, head, list) { strsize1 = ucs2_strsize(name, 1024); strsize2 = ucs2_strsize(entry->var.VariableName, 1024); @@ -822,10 +834,11 @@ int efivar_entry_size(struct efivar_entry *entry, unsigned long *size) *size = 0; - spin_lock_irq(&efivars_lock); + if (down_interruptible(&efivars_lock)) + return -EINTR; status = ops->get_variable(entry->var.VariableName, &entry->var.VendorGuid, NULL, size, NULL); - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); if (status != EFI_BUFFER_TOO_SMALL) return efi_status_to_err(status); @@ -851,8 +864,6 @@ int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes, const struct efivar_operations *ops = __efivars->ops; efi_status_t status; - lockdep_assert_held(&efivars_lock); - status = ops->get_variable(entry->var.VariableName, &entry->var.VendorGuid, attributes, size, data); @@ -874,11 +885,12 @@ int efivar_entry_get(struct 
efivar_entry *entry, u32 *attributes, const struct efivar_operations *ops = __efivars->ops; efi_status_t status; - spin_lock_irq(&efivars_lock); + if (down_interruptible(&efivars_lock)) + return -EINTR; status = ops->get_variable(entry->var.VariableName, &entry->var.VendorGuid, attributes, size, data); - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); return efi_status_to_err(status); } @@ -925,7 +937,8 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, * set_variable call, and removal of the variable from the efivars * list (in the case of an authenticated delete). */ - spin_lock_irq(&efivars_lock); + if (down_interruptible(&efivars_lock)) + return -EINTR; /* * Ensure that the available space hasn't shrunk below the safe level @@ -965,7 +978,7 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, if (status == EFI_NOT_FOUND) efivar_entry_list_del_unlock(entry); else - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); if (status && status != EFI_BUFFER_TOO_SMALL) return efi_status_to_err(status); @@ -973,7 +986,7 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, return 0; out: - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); return err; } @@ -986,9 +999,9 @@ EXPORT_SYMBOL_GPL(efivar_entry_set_get_size); * efivar_entry_iter_end() is called. This function is usually used in * conjunction with __efivar_entry_iter() or efivar_entry_iter(). */ -void efivar_entry_iter_begin(void) +int efivar_entry_iter_begin(void) { - spin_lock_irq(&efivars_lock); + return down_interruptible(&efivars_lock); } EXPORT_SYMBOL_GPL(efivar_entry_iter_begin); @@ -999,7 +1012,7 @@ EXPORT_SYMBOL_GPL(efivar_entry_iter_begin); */ void efivar_entry_iter_end(void) { - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); } EXPORT_SYMBOL_GPL(efivar_entry_iter_end); @@ -1075,7 +1088,9 @@ int efivar_entry_iter(int (*func)(struct efivar_entry *, void *), { int err = 0; - efivar_entry_iter_begin(); + err = efivar_entry_iter_begin(); + if (err) + return err; err = __efivar_entry_iter(func, head, data, NULL); efivar_entry_iter_end(); @@ -1120,12 +1135,17 @@ int efivars_register(struct efivars *efivars, const struct efivar_operations *ops, struct kobject *kobject) { - spin_lock_irq(&efivars_lock); + if (down_interruptible(&efivars_lock)) + return -EINTR; + efivars->ops = ops; efivars->kobject = kobject; __efivars = efivars; - spin_unlock_irq(&efivars_lock); + + pr_info("Registered efivars operations\n"); + + up(&efivars_lock); return 0; } @@ -1142,7 +1162,9 @@ int efivars_unregister(struct efivars *efivars) { int rv; - spin_lock_irq(&efivars_lock); + if (down_interruptible(&efivars_lock)) + return -EINTR; + if (!__efivars) { printk(KERN_ERR "efivars not registered\n"); rv = -EINVAL; @@ -1154,11 +1176,12 @@ int efivars_unregister(struct efivars *efivars) goto out; } + pr_info("Unregistered efivars operations\n"); __efivars = NULL; rv = 0; out: - spin_unlock_irq(&efivars_lock); + up(&efivars_lock); return rv; } EXPORT_SYMBOL_GPL(efivars_unregister); diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 1d73fc6dba13..cbb50cadcffc 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -105,7 +105,10 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, inode->i_private = var; - efivar_entry_add(var, &efivarfs_list); + err = efivar_entry_add(var, &efivarfs_list); + if (err) + goto out; + d_instantiate(dentry, inode); dget(dentry); out: diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 
688ccc16b702..01e3d6e53944 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -161,7 +161,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, kfree(name); efivar_entry_size(entry, &size); - efivar_entry_add(entry, &efivarfs_list); + err = efivar_entry_add(entry, &efivarfs_list); + if (err) + goto fail_inode; inode_lock(inode); inode->i_private = entry; @@ -182,7 +184,10 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, static int efivarfs_destroy(struct efivar_entry *entry, void *data) { - efivar_entry_remove(entry); + int err = efivar_entry_remove(entry); + + if (err) + return err; kfree(entry); return 0; } diff --git a/include/linux/efi.h b/include/linux/efi.h index deecb2902715..4d6da7b66c19 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1297,8 +1297,8 @@ struct kobject *efivars_kobject(void); int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), void *data, bool duplicates, struct list_head *head); -void efivar_entry_add(struct efivar_entry *entry, struct list_head *head); -void efivar_entry_remove(struct efivar_entry *entry); +int efivar_entry_add(struct efivar_entry *entry, struct list_head *head); +int efivar_entry_remove(struct efivar_entry *entry); int __efivar_entry_delete(struct efivar_entry *entry); int efivar_entry_delete(struct efivar_entry *entry); @@ -1315,7 +1315,7 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, bool block, unsigned long size, void *data); -void efivar_entry_iter_begin(void); +int efivar_entry_iter_begin(void); void efivar_entry_iter_end(void); int __efivar_entry_iter(int (*func)(struct efivar_entry *, void *), From dce48e351c0d42014e5fb16ac3eb099e11b7e716 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 15 Jul 2016 21:36:31 +0200 Subject: [PATCH 237/538] efi: Replace runtime services spinlock with semaphore The purpose of the efi_runtime_lock is to prevent concurrent calls into the firmware. There is no need to use spinlocks here, as long as we ensure that runtime service invocations from an atomic context (i.e., EFI pstore) cannot block. So use a semaphore instead, and use down_trylock() in the nonblocking case. We don't use a mutex here because the mutex_trylock() function must not be called from interrupt context, whereas the down_trylock() can. Signed-off-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Mark Rutland Cc: Sylvain Chouleur Signed-off-by: Matt Fleming --- drivers/firmware/efi/efi.c | 3 + drivers/firmware/efi/runtime-wrappers.c | 81 +++++++++++++++---------- include/linux/efi.h | 1 + 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index dfe07316cae5..97d98e82f0f4 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -810,6 +810,9 @@ int efi_status_to_err(efi_status_t status) case EFI_NOT_FOUND: err = -ENOENT; break; + case EFI_ABORTED: + err = -EINTR; + break; default: err = -EINVAL; } diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 41958774cde3..ae54870b2788 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -14,11 +14,13 @@ * This file is released under the GPLv2. 
*/ +#define pr_fmt(fmt) "efi: " fmt + #include #include #include #include -#include +#include #include #include @@ -81,20 +83,21 @@ void efi_call_virt_check_flags(unsigned long flags, const char *call) * +------------------------------------+-------------------------------+ * * Due to the fact that the EFI pstore may write to the variable store in - * interrupt context, we need to use a spinlock for at least the groups that + * interrupt context, we need to use a lock for at least the groups that * contain SetVariable() and QueryVariableInfo(). That leaves little else, as * none of the remaining functions are actually ever called at runtime. - * So let's just use a single spinlock to serialize all Runtime Services calls. + * So let's just use a single lock to serialize all Runtime Services calls. */ -static DEFINE_SPINLOCK(efi_runtime_lock); +static DEFINE_SEMAPHORE(efi_runtime_lock); static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) { efi_status_t status; - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) + return EFI_ABORTED; status = efi_call_virt(get_time, tm, tc); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -102,9 +105,10 @@ static efi_status_t virt_efi_set_time(efi_time_t *tm) { efi_status_t status; - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) + return EFI_ABORTED; status = efi_call_virt(set_time, tm); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -114,9 +118,10 @@ static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, { efi_status_t status; - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) + return EFI_ABORTED; status = efi_call_virt(get_wakeup_time, enabled, pending, tm); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -124,9 +129,10 @@ static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) { efi_status_t status; - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) + return EFI_ABORTED; status = efi_call_virt(set_wakeup_time, enabled, tm); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -138,10 +144,11 @@ static efi_status_t virt_efi_get_variable(efi_char16_t *name, { efi_status_t status; - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) + return EFI_ABORTED; status = efi_call_virt(get_variable, name, vendor, attr, data_size, data); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -151,9 +158,10 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, { efi_status_t status; - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) + return EFI_ABORTED; status = efi_call_virt(get_next_variable, name_size, name, vendor); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -165,10 +173,11 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, { efi_status_t status; - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) + return EFI_ABORTED; status = efi_call_virt(set_variable, name, vendor, attr, data_size, data); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -179,12 +188,12 @@ virt_efi_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor, { efi_status_t status; - if (!spin_trylock(&efi_runtime_lock)) + if (down_trylock(&efi_runtime_lock)) return EFI_NOT_READY; status = 
efi_call_virt(set_variable, name, vendor, attr, data_size, data); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -199,10 +208,11 @@ static efi_status_t virt_efi_query_variable_info(u32 attr, if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) + return EFI_ABORTED; status = efi_call_virt(query_variable_info, attr, storage_space, remaining_space, max_variable_size); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -217,12 +227,12 @@ virt_efi_query_variable_info_nonblocking(u32 attr, if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - if (!spin_trylock(&efi_runtime_lock)) + if (down_trylock(&efi_runtime_lock)) return EFI_NOT_READY; status = efi_call_virt(query_variable_info, attr, storage_space, remaining_space, max_variable_size); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -230,9 +240,10 @@ static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) { efi_status_t status; - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) + return EFI_ABORTED; status = efi_call_virt(get_next_high_mono_count, count); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -241,9 +252,13 @@ static void virt_efi_reset_system(int reset_type, unsigned long data_size, efi_char16_t *data) { - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) { + pr_warn("failed to invoke the reset_system() runtime service:\n" + "could not get exclusive access to the firmware\n"); + return; + } __efi_call_virt(reset_system, reset_type, status, data_size, data); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); } static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, @@ -255,9 +270,10 @@ static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) + return EFI_ABORTED; status = efi_call_virt(update_capsule, capsules, count, sg_list); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } @@ -271,10 +287,11 @@ static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - spin_lock(&efi_runtime_lock); + if (down_interruptible(&efi_runtime_lock)) + return EFI_ABORTED; status = efi_call_virt(query_capsule_caps, capsules, count, max_size, reset_type); - spin_unlock(&efi_runtime_lock); + up(&efi_runtime_lock); return status; } diff --git a/include/linux/efi.h b/include/linux/efi.h index 4d6da7b66c19..4c92c0630c45 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -38,6 +38,7 @@ #define EFI_WRITE_PROTECTED ( 8 | (1UL << (BITS_PER_LONG-1))) #define EFI_OUT_OF_RESOURCES ( 9 | (1UL << (BITS_PER_LONG-1))) #define EFI_NOT_FOUND (14 | (1UL << (BITS_PER_LONG-1))) +#define EFI_ABORTED (21 | (1UL << (BITS_PER_LONG-1))) #define EFI_SECURITY_VIOLATION (26 | (1UL << (BITS_PER_LONG-1))) typedef unsigned long efi_status_t; From ac0e94b63e65f9c6d2f3c49107118e2228236db4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 20 Jul 2016 11:11:06 +0100 Subject: [PATCH 238/538] x86/efi: Initialize status to ensure garbage is not returned on small size Although very unlikey, if size is too small or zero, then we 
end up with status not being set and returning garbage. Instead, initializing status to EFI_INVALID_PARAMETER to indicate that size is invalid in the calls to setup_uga32 and setup_uga64. Signed-off-by: Colin Ian King Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ff574dad95cc..ec6d2ef12baf 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -578,7 +578,7 @@ setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; unsigned long nr_ugas; u32 *handles = (u32 *)uga_handle;; - efi_status_t status; + efi_status_t status = EFI_INVALID_PARAMETER; int i; first_uga = NULL; @@ -623,7 +623,7 @@ setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height) efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; unsigned long nr_ugas; u64 *handles = (u64 *)uga_handle;; - efi_status_t status; + efi_status_t status = EFI_INVALID_PARAMETER; int i; first_uga = NULL; From d520dd1f348dcaafcb8ce804b2a5ebb1be004719 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 24 Jul 2016 10:16:56 +0200 Subject: [PATCH 239/538] firmware-gsmi: Delete an unnecessary check before the function call "dma_pool_destroy" The dma_pool_destroy() function tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Cc: Greg KH Cc: Julia Lawall Cc: Mike Waychison Cc: Michel Lespinasse Signed-off-by: Matt Fleming --- drivers/firmware/google/gsmi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c index f1ab05ea56bb..c46387160976 100644 --- a/drivers/firmware/google/gsmi.c +++ b/drivers/firmware/google/gsmi.c @@ -910,8 +910,7 @@ static __init int gsmi_init(void) gsmi_buf_free(gsmi_dev.param_buf); gsmi_buf_free(gsmi_dev.data_buf); gsmi_buf_free(gsmi_dev.name_buf); - if (gsmi_dev.dma_pool) - dma_pool_destroy(gsmi_dev.dma_pool); + dma_pool_destroy(gsmi_dev.dma_pool); platform_device_unregister(gsmi_dev.pdev); pr_info("gsmi: failed to load: %d\n", ret); return ret; From cf289cefbfde519bbc179a86cdc5e8cc91a0a08d Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 3 Aug 2016 10:16:02 +0200 Subject: [PATCH 240/538] lib/ucs2_string: Speed up ucs2_utf8size() No need to calculate the string length on every loop iteration. Signed-off-by: Lukas Wunner Cc: Peter Jones Signed-off-by: Matt Fleming --- lib/ucs2_string.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c index f0b323abb4c6..ae8d2491133c 100644 --- a/lib/ucs2_string.c +++ b/lib/ucs2_string.c @@ -56,7 +56,7 @@ ucs2_utf8size(const ucs2_char_t *src) unsigned long i; unsigned long j = 0; - for (i = 0; i < ucs2_strlen(src); i++) { + for (i = 0; src[i]; i++) { u16 c = src[i]; if (c >= 0x800) From 0513fe1d28e45deb39159dbeedf0660c3f0effd2 Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Fri, 5 Aug 2016 18:59:35 -0500 Subject: [PATCH 241/538] x86/efi: Map in physical addresses in efi_map_region_fixed This is a simple change to add in the physical mappings as well as the virtual mappings in efi_map_region_fixed. 
The motivation here is to get access to EFI runtime code that is only available via the 1:1 mappings on a kexec'd kernel. The added call is essentially the kexec analog of the first __map_region that Boris put in efi_map_region in commit d2f7cbe7b26a ("x86/efi: Runtime services virtual mapping"). Signed-off-by: Alex Thorlton Cc: Russ Anderson Cc: Dimitri Sivanich Cc: Mike Travis Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Dave Young Cc: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi_64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 45434ea345e9..e1ca71259468 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -339,6 +339,7 @@ void __init efi_map_region(efi_memory_desc_t *md) */ void __init efi_map_region_fixed(efi_memory_desc_t *md) { + __map_region(md, md->phys_addr); __map_region(md, md->virt_addr); } From 22c2b77f419bdc9317f00b395283abd33157368e Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 15 Aug 2016 15:29:20 +0100 Subject: [PATCH 242/538] fs/efivarfs: Fix double kfree() in error path Julia reported that we may double free 'name' in efivarfs_callback(), and that this bug was introduced by commit 0d22f33bc37c ("efi: Don't use spinlocks for efi vars"). Move one of the kfree()s until after the point at which we know we are definitely on the success path. Reported-by: Julia Lawall Acked-by: Julia Lawall Cc: Ard Biesheuvel Cc: Sylvain Chouleur Signed-off-by: Matt Fleming --- fs/efivarfs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 01e3d6e53944..d7a7c53803c1 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -157,14 +157,14 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, goto fail_inode; } - /* copied by the above to local storage in the dentry. */ - kfree(name); - efivar_entry_size(entry, &size); err = efivar_entry_add(entry, &efivarfs_list); if (err) goto fail_inode; + /* copied by the above to local storage in the dentry. */ + kfree(name); + inode_lock(inode); inode->i_private = entry; i_size_write(inode, size + sizeof(entry->var.Attributes)); From 15cf7cae087a2eaf5e1feeef2bbba1b5a94c7639 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 15 Aug 2016 13:52:34 +0200 Subject: [PATCH 243/538] x86/efi: Remove unused find_bits() function Left behind by commit fc37206427ce ("efi/libstub: Move Graphics Output Protocol handling to generic code"). 
Signed-off-by: Lukas Wunner Cc: Ard Biesheuvel Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ec6d2ef12baf..f7fc85bf8221 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -286,29 +286,6 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) } } -static void find_bits(unsigned long mask, u8 *pos, u8 *size) -{ - u8 first, len; - - first = 0; - len = 0; - - if (mask) { - while (!(mask & 0x1)) { - mask = mask >> 1; - first++; - } - - while (mask & 0x1) { - mask = mask >> 1; - len++; - } - } - - *pos = first; - *size = len; -} - static efi_status_t __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) { From 9d80448ac92b720512c415265597d349d8b5c3e8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Aug 2016 14:13:21 +0200 Subject: [PATCH 244/538] efi/arm64: Add debugfs node to dump UEFI runtime page tables Register the debugfs node 'efi_page_tables' to allow the UEFI runtime page tables to be inspected. Note that ARM does not have 'asm/ptdump.h' [yet] so for now, this is arm64 only. Signed-off-by: Ard Biesheuvel Acked-by: Mark Rutland Cc: Leif Lindholm Signed-off-by: Matt Fleming --- drivers/firmware/efi/arm-runtime.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index ae001450545f..7c75a8d9091a 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -39,6 +39,26 @@ static struct mm_struct efi_mm = { .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), }; +#ifdef CONFIG_ARM64_PTDUMP +#include + +static struct ptdump_info efi_ptdump_info = { + .mm = &efi_mm, + .markers = (struct addr_marker[]){ + { 0, "UEFI runtime start" }, + { TASK_SIZE_64, "UEFI runtime end" } + }, + .base_addr = 0, +}; + +static int __init ptdump_init(void) +{ + return ptdump_register(&efi_ptdump_info, "efi_page_tables"); +} +device_initcall(ptdump_init); + +#endif + static bool __init efi_virtmap_init(void) { efi_memory_desc_t *md; From 3dad6f7f6975387f53f1a772f29f54335563d93d Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Tue, 16 Aug 2016 17:32:31 -0700 Subject: [PATCH 245/538] x86/efi: Defer efi_esrt_init until after memblock_x86_fill Commit 7b02d53e7852 ("efi: Allow drivers to reserve boot services forever") introduced a new efi_mem_reserve to reserve the boot services memory regions forever. This reservation involves allocating a new EFI memory range descriptor. However, allocation can only succeed if there is memory available for the allocation. Otherwise, error such as the following may occur: esrt: Reserving ESRT space from 0x000000003dd6a000 to 0x000000003dd6a010. Kernel panic - not syncing: ERROR: Failed to allocate 0x9f0 bytes below \ 0x0. CPU: 0 PID: 0 Comm: swapper Not tainted 4.7.0-rc5+ #503 0000000000000000 ffffffff81e03ce0 ffffffff8131dae8 ffffffff81bb6c50 ffffffff81e03d70 ffffffff81e03d60 ffffffff8111f4df 0000000000000018 ffffffff81e03d70 ffffffff81e03d08 00000000000009f0 00000000000009f0 Call Trace: [] dump_stack+0x4d/0x65 [] panic+0xc5/0x206 [] memblock_alloc_base+0x29/0x2e [] memblock_alloc+0xb/0xd [] efi_arch_mem_reserve+0xbc/0x134 [] efi_mem_reserve+0x2c/0x31 [] ? 
efi_mem_reserve+0x2c/0x31 [] efi_esrt_init+0x19e/0x1b4 [] efi_init+0x398/0x44a [] setup_arch+0x415/0xc30 [] start_kernel+0x5b/0x3ef [] x86_64_start_reservations+0x2f/0x31 [] x86_64_start_kernel+0xea/0xed ---[ end Kernel panic - not syncing: ERROR: Failed to allocate 0x9f0 bytes below 0x0. An inspection of the memblock configuration reveals that there is no memory available for the allocation: MEMBLOCK configuration: memory size = 0x0 reserved size = 0x4f339c0 memory.cnt = 0x1 memory[0x0] [0x00000000000000-0xffffffffffffffff], 0x0 bytes on node 0\ flags: 0x0 reserved.cnt = 0x4 reserved[0x0] [0x0000000008c000-0x0000000008c9bf], 0x9c0 bytes flags: 0x0 reserved[0x1] [0x0000000009f000-0x000000000fffff], 0x61000 bytes\ flags: 0x0 reserved[0x2] [0x00000002800000-0x0000000394bfff], 0x114c000 bytes\ flags: 0x0 reserved[0x3] [0x000000304e4000-0x00000034269fff], 0x3d86000 bytes\ flags: 0x0 This situation can be avoided if we call efi_esrt_init after memblock has memory regions for the allocation. Also, the EFI ESRT driver makes use of early_memremap'pings. Therfore, we do not want to defer efi_esrt_init for too long. We must call such function while calls to early_memremap are still valid. A good place to meet the two aforementioned conditions is right after memblock_x86_fill, grouped with other EFI-related functions. Reported-by: Scott Lawson Signed-off-by: Ricardo Neri Cc: Ard Biesheuvel Cc: Peter Jones Signed-off-by: Matt Fleming --- arch/x86/kernel/setup.c | 1 + arch/x86/platform/efi/efi.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4fd69e532c15..528b8eb24a04 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1101,6 +1101,7 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_MEMMAP)) { efi_fake_memmap(); efi_find_mirror(); + efi_esrt_init(); /* * The EFI specification says that boot service code won't be diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 342cebd1e17c..0955c70897ae 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -474,8 +474,6 @@ void __init efi_init(void) if (efi_enabled(EFI_DBG)) efi_print_memmap(); - - efi_esrt_init(); } void __init efi_late_init(void) From ff6301dabc3ca20ab8f50f8d0252ac05da610d89 Mon Sep 17 00:00:00 2001 From: Ivan Hu Date: Thu, 25 Aug 2016 11:15:31 +0800 Subject: [PATCH 246/538] efi: Add efi_test driver for exporting UEFI runtime service interfaces This driver is used by the Firmware Test Suite (FWTS) for testing the UEFI runtime interfaces readiness of the firmware. This driver exports UEFI runtime service interfaces into userspace, which allows to use and test UEFI runtime services provided by the firmware. This driver uses the efi. function pointers directly instead of going through the efivar API to allow for direct testing of the UEFI runtime service interfaces provided by the firmware. 
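As a rough illustration of how userspace is expected to drive this interface, the sketch below issues a GetVariable call through /dev/efi_test. It assumes a userspace copy of the efi_test.h definitions (struct efi_getvariable and the EFI_RUNTIME_GET_VARIABLE ioctl), such as the one FWTS carries; the variable name and GUID are placeholders only and the status field is assumed from that header:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "efi_test.h"		/* assumed userspace copy of the driver header */

int main(void)
{
	efi_char16_t name[] = { 'B', 'o', 'o', 't', 'O', 'r', 'd', 'e', 'r', 0 };	/* UCS-2 */
	efi_guid_t guid = { 0 };	/* placeholder: fill in the variable's vendor GUID */
	uint32_t attributes;
	unsigned long data_size = 1024;
	uint8_t data[1024];
	efi_status_t status;
	struct efi_getvariable gv = {
		.variable_name	= name,
		.vendor_guid	= &guid,
		.attributes	= &attributes,
		.data_size	= &data_size,
		.data		= data,
		.status		= &status,	/* assumed field, see efi_test.h */
	};
	int fd = open("/dev/efi_test", O_RDWR);

	if (fd < 0)
		return 1;
	if (ioctl(fd, EFI_RUNTIME_GET_VARIABLE, &gv) == 0)
		printf("got %lu bytes, attributes 0x%x, status 0x%lx\n",
		       data_size, attributes, (unsigned long)status);
	close(fd);
	return 0;
}
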
Details for FWTS are available from, Signed-off-by: Ivan Hu Cc: joeyli Cc: Ricardo Neri Cc: Ard Biesheuvel Signed-off-by: Matt Fleming --- MAINTAINERS | 7 + drivers/firmware/efi/Kconfig | 17 + drivers/firmware/efi/Makefile | 1 + drivers/firmware/efi/test/Makefile | 1 + drivers/firmware/efi/test/efi_test.c | 749 +++++++++++++++++++++++++++ drivers/firmware/efi/test/efi_test.h | 110 ++++ 6 files changed, 885 insertions(+) create mode 100644 drivers/firmware/efi/test/Makefile create mode 100644 drivers/firmware/efi/test/efi_test.c create mode 100644 drivers/firmware/efi/test/efi_test.h diff --git a/MAINTAINERS b/MAINTAINERS index db814a89599c..007d05acbb5f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4583,6 +4583,13 @@ M: Peter Jones S: Maintained F: drivers/video/fbdev/efifb.c +EFI TEST DRIVER +L: linux-efi@vger.kernel.org +M: Ivan Hu +M: Matt Fleming +S: Maintained +F: drivers/firmware/efi/test/ + EFS FILESYSTEM W: http://aeschi.ch.eu.org/efs/ S: Orphan diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 6394152f648f..c981be17d3c0 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -112,6 +112,23 @@ config EFI_CAPSULE_LOADER Most users should say N. +config EFI_TEST + tristate "EFI Runtime Service Tests Support" + depends on EFI + default n + help + This driver uses the efi. function pointers directly instead + of going through the efivar API, because it is not trying to test the + kernel subsystem, just for testing the UEFI runtime service + interfaces which are provided by the firmware. This driver is used + by the Firmware Test Suite (FWTS) for testing the UEFI runtime + interfaces readiness of the firmware. + Details for FWTS are available from: + + + Say Y here to enable the runtime services support via /dev/efi_test. + If unsure, say N. + endmenu config UEFI_CPER diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index b3f5e2adc49f..c8a439f6d715 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o obj-$(CONFIG_EFI_BOOTLOADER_CONTROL) += efibc.o +obj-$(CONFIG_EFI_TEST) += test/ arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o obj-$(CONFIG_ARM) += $(arm-obj-y) diff --git a/drivers/firmware/efi/test/Makefile b/drivers/firmware/efi/test/Makefile new file mode 100644 index 000000000000..bcd4577d40e6 --- /dev/null +++ b/drivers/firmware/efi/test/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_EFI_TEST) += efi_test.o diff --git a/drivers/firmware/efi/test/efi_test.c b/drivers/firmware/efi/test/efi_test.c new file mode 100644 index 000000000000..f61bb52be318 --- /dev/null +++ b/drivers/firmware/efi/test/efi_test.c @@ -0,0 +1,749 @@ +/* + * EFI Test Driver for Runtime Services + * + * Copyright(C) 2012-2016 Canonical Ltd. + * + * This driver exports EFI runtime services interfaces into userspace, which + * allow to use and test UEFI runtime services provided by firmware. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "efi_test.h" + +MODULE_AUTHOR("Ivan Hu "); +MODULE_DESCRIPTION("EFI Test Driver"); +MODULE_LICENSE("GPL"); + +/* + * Count the bytes in 'str', including the terminating NULL. + * + * Note this function returns the number of *bytes*, not the number of + * ucs2 characters. 
+ */ +static inline size_t user_ucs2_strsize(efi_char16_t __user *str) +{ + efi_char16_t *s = str, c; + size_t len; + + if (!str) + return 0; + + /* Include terminating NULL */ + len = sizeof(efi_char16_t); + + if (get_user(c, s++)) { + /* Can't read userspace memory for size */ + return 0; + } + + while (c != 0) { + if (get_user(c, s++)) { + /* Can't read userspace memory for size */ + return 0; + } + len += sizeof(efi_char16_t); + } + return len; +} + +/* + * Allocate a buffer and copy a ucs2 string from user space into it. + */ +static inline int +copy_ucs2_from_user_len(efi_char16_t **dst, efi_char16_t __user *src, + size_t len) +{ + efi_char16_t *buf; + + if (!src) { + *dst = NULL; + return 0; + } + + if (!access_ok(VERIFY_READ, src, 1)) + return -EFAULT; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) { + *dst = NULL; + return -ENOMEM; + } + *dst = buf; + + if (copy_from_user(*dst, src, len)) { + kfree(buf); + return -EFAULT; + } + + return 0; +} + +/* + * Count the bytes in 'str', including the terminating NULL. + * + * Just a wrap for user_ucs2_strsize + */ +static inline int +get_ucs2_strsize_from_user(efi_char16_t __user *src, size_t *len) +{ + if (!access_ok(VERIFY_READ, src, 1)) + return -EFAULT; + + *len = user_ucs2_strsize(src); + if (*len == 0) + return -EFAULT; + + return 0; +} + +/* + * Calculate the required buffer allocation size and copy a ucs2 string + * from user space into it. + * + * This function differs from copy_ucs2_from_user_len() because it + * calculates the size of the buffer to allocate by taking the length of + * the string 'src'. + * + * If a non-zero value is returned, the caller MUST NOT access 'dst'. + * + * It is the caller's responsibility to free 'dst'. + */ +static inline int +copy_ucs2_from_user(efi_char16_t **dst, efi_char16_t __user *src) +{ + size_t len; + + if (!access_ok(VERIFY_READ, src, 1)) + return -EFAULT; + + len = user_ucs2_strsize(src); + if (len == 0) + return -EFAULT; + return copy_ucs2_from_user_len(dst, src, len); +} + +/* + * Copy a ucs2 string to a user buffer. + * + * This function is a simple wrapper around copy_to_user() that does + * nothing if 'src' is NULL, which is useful for reducing the amount of + * NULL checking the caller has to do. + * + * 'len' specifies the number of bytes to copy. + */ +static inline int +copy_ucs2_to_user_len(efi_char16_t __user *dst, efi_char16_t *src, size_t len) +{ + if (!src) + return 0; + + if (!access_ok(VERIFY_WRITE, dst, 1)) + return -EFAULT; + + return copy_to_user(dst, src, len); +} + +static long efi_runtime_get_variable(unsigned long arg) +{ + struct efi_getvariable __user *getvariable_user; + struct efi_getvariable getvariable; + unsigned long datasize, prev_datasize, *dz; + efi_guid_t vendor_guid, *vd = NULL; + efi_status_t status; + efi_char16_t *name = NULL; + u32 attr, *at; + void *data = NULL; + int rv = 0; + + getvariable_user = (struct efi_getvariable __user *)arg; + + if (copy_from_user(&getvariable, getvariable_user, + sizeof(getvariable))) + return -EFAULT; + if (getvariable.data_size && + get_user(datasize, getvariable.data_size)) + return -EFAULT; + if (getvariable.vendor_guid) { + if (copy_from_user(&vendor_guid, getvariable.vendor_guid, + sizeof(vendor_guid))) + return -EFAULT; + vd = &vendor_guid; + } + + if (getvariable.variable_name) { + rv = copy_ucs2_from_user(&name, getvariable.variable_name); + if (rv) + return rv; + } + + at = getvariable.attributes ? &attr : NULL; + dz = getvariable.data_size ? 
&datasize : NULL; + + if (getvariable.data_size && getvariable.data) { + data = kmalloc(datasize, GFP_KERNEL); + if (!data) { + kfree(name); + return -ENOMEM; + } + } + + prev_datasize = datasize; + status = efi.get_variable(name, vd, at, dz, data); + kfree(name); + + if (put_user(status, getvariable.status)) { + rv = -EFAULT; + goto out; + } + + if (status != EFI_SUCCESS) { + if (status == EFI_BUFFER_TOO_SMALL) { + if (dz && put_user(datasize, getvariable.data_size)) { + rv = -EFAULT; + goto out; + } + } + rv = -EINVAL; + goto out; + } + + if (prev_datasize < datasize) { + rv = -EINVAL; + goto out; + } + + if (data) { + if (copy_to_user(getvariable.data, data, datasize)) { + rv = -EFAULT; + goto out; + } + } + + if (at && put_user(attr, getvariable.attributes)) { + rv = -EFAULT; + goto out; + } + + if (dz && put_user(datasize, getvariable.data_size)) + rv = -EFAULT; + +out: + kfree(data); + return rv; + +} + +static long efi_runtime_set_variable(unsigned long arg) +{ + struct efi_setvariable __user *setvariable_user; + struct efi_setvariable setvariable; + efi_guid_t vendor_guid; + efi_status_t status; + efi_char16_t *name = NULL; + void *data; + int rv = 0; + + setvariable_user = (struct efi_setvariable __user *)arg; + + if (copy_from_user(&setvariable, setvariable_user, sizeof(setvariable))) + return -EFAULT; + if (copy_from_user(&vendor_guid, setvariable.vendor_guid, + sizeof(vendor_guid))) + return -EFAULT; + + if (setvariable.variable_name) { + rv = copy_ucs2_from_user(&name, setvariable.variable_name); + if (rv) + return rv; + } + + data = kmalloc(setvariable.data_size, GFP_KERNEL); + if (!data) { + kfree(name); + return -ENOMEM; + } + if (copy_from_user(data, setvariable.data, setvariable.data_size)) { + rv = -EFAULT; + goto out; + } + + status = efi.set_variable(name, &vendor_guid, + setvariable.attributes, + setvariable.data_size, data); + + if (put_user(status, setvariable.status)) { + rv = -EFAULT; + goto out; + } + + rv = status == EFI_SUCCESS ? 0 : -EINVAL; + +out: + kfree(data); + kfree(name); + + return rv; +} + +static long efi_runtime_get_time(unsigned long arg) +{ + struct efi_gettime __user *gettime_user; + struct efi_gettime gettime; + efi_status_t status; + efi_time_cap_t cap; + efi_time_t efi_time; + + gettime_user = (struct efi_gettime __user *)arg; + if (copy_from_user(&gettime, gettime_user, sizeof(gettime))) + return -EFAULT; + + status = efi.get_time(gettime.time ? &efi_time : NULL, + gettime.capabilities ? 
&cap : NULL); + + if (put_user(status, gettime.status)) + return -EFAULT; + + if (status != EFI_SUCCESS) + return -EINVAL; + + if (gettime.capabilities) { + efi_time_cap_t __user *cap_local; + + cap_local = (efi_time_cap_t *)gettime.capabilities; + if (put_user(cap.resolution, &(cap_local->resolution)) || + put_user(cap.accuracy, &(cap_local->accuracy)) || + put_user(cap.sets_to_zero, &(cap_local->sets_to_zero))) + return -EFAULT; + } + if (gettime.time) { + if (copy_to_user(gettime.time, &efi_time, sizeof(efi_time_t))) + return -EFAULT; + } + + return 0; +} + +static long efi_runtime_set_time(unsigned long arg) +{ + struct efi_settime __user *settime_user; + struct efi_settime settime; + efi_status_t status; + efi_time_t efi_time; + + settime_user = (struct efi_settime __user *)arg; + if (copy_from_user(&settime, settime_user, sizeof(settime))) + return -EFAULT; + if (copy_from_user(&efi_time, settime.time, + sizeof(efi_time_t))) + return -EFAULT; + status = efi.set_time(&efi_time); + + if (put_user(status, settime.status)) + return -EFAULT; + + return status == EFI_SUCCESS ? 0 : -EINVAL; +} + +static long efi_runtime_get_waketime(unsigned long arg) +{ + struct efi_getwakeuptime __user *getwakeuptime_user; + struct efi_getwakeuptime getwakeuptime; + efi_bool_t enabled, pending; + efi_status_t status; + efi_time_t efi_time; + + getwakeuptime_user = (struct efi_getwakeuptime __user *)arg; + if (copy_from_user(&getwakeuptime, getwakeuptime_user, + sizeof(getwakeuptime))) + return -EFAULT; + + status = efi.get_wakeup_time( + getwakeuptime.enabled ? (efi_bool_t *)&enabled : NULL, + getwakeuptime.pending ? (efi_bool_t *)&pending : NULL, + getwakeuptime.time ? &efi_time : NULL); + + if (put_user(status, getwakeuptime.status)) + return -EFAULT; + + if (status != EFI_SUCCESS) + return -EINVAL; + + if (getwakeuptime.enabled && put_user(enabled, + getwakeuptime.enabled)) + return -EFAULT; + + if (getwakeuptime.time) { + if (copy_to_user(getwakeuptime.time, &efi_time, + sizeof(efi_time_t))) + return -EFAULT; + } + + return 0; +} + +static long efi_runtime_set_waketime(unsigned long arg) +{ + struct efi_setwakeuptime __user *setwakeuptime_user; + struct efi_setwakeuptime setwakeuptime; + efi_bool_t enabled; + efi_status_t status; + efi_time_t efi_time; + + setwakeuptime_user = (struct efi_setwakeuptime __user *)arg; + + if (copy_from_user(&setwakeuptime, setwakeuptime_user, + sizeof(setwakeuptime))) + return -EFAULT; + + enabled = setwakeuptime.enabled; + if (setwakeuptime.time) { + if (copy_from_user(&efi_time, setwakeuptime.time, + sizeof(efi_time_t))) + return -EFAULT; + + status = efi.set_wakeup_time(enabled, &efi_time); + } else + status = efi.set_wakeup_time(enabled, NULL); + + if (put_user(status, setwakeuptime.status)) + return -EFAULT; + + return status == EFI_SUCCESS ? 
0 : -EINVAL; +} + +static long efi_runtime_get_nextvariablename(unsigned long arg) +{ + struct efi_getnextvariablename __user *getnextvariablename_user; + struct efi_getnextvariablename getnextvariablename; + unsigned long name_size, prev_name_size = 0, *ns = NULL; + efi_status_t status; + efi_guid_t *vd = NULL; + efi_guid_t vendor_guid; + efi_char16_t *name = NULL; + int rv; + + getnextvariablename_user = (struct efi_getnextvariablename __user *)arg; + + if (copy_from_user(&getnextvariablename, getnextvariablename_user, + sizeof(getnextvariablename))) + return -EFAULT; + + if (getnextvariablename.variable_name_size) { + if (get_user(name_size, getnextvariablename.variable_name_size)) + return -EFAULT; + ns = &name_size; + prev_name_size = name_size; + } + + if (getnextvariablename.vendor_guid) { + if (copy_from_user(&vendor_guid, + getnextvariablename.vendor_guid, + sizeof(vendor_guid))) + return -EFAULT; + vd = &vendor_guid; + } + + if (getnextvariablename.variable_name) { + size_t name_string_size = 0; + + rv = get_ucs2_strsize_from_user( + getnextvariablename.variable_name, + &name_string_size); + if (rv) + return rv; + /* + * The name_size may be smaller than the real buffer size where + * variable name located in some use cases. The most typical + * case is passing a 0 to get the required buffer size for the + * 1st time call. So we need to copy the content from user + * space for at least the string size of variable name, or else + * the name passed to UEFI may not be terminated as we expected. + */ + rv = copy_ucs2_from_user_len(&name, + getnextvariablename.variable_name, + prev_name_size > name_string_size ? + prev_name_size : name_string_size); + if (rv) + return rv; + } + + status = efi.get_next_variable(ns, name, vd); + + if (put_user(status, getnextvariablename.status)) { + rv = -EFAULT; + goto out; + } + + if (status != EFI_SUCCESS) { + if (status == EFI_BUFFER_TOO_SMALL) { + if (ns && put_user(*ns, + getnextvariablename.variable_name_size)) { + rv = -EFAULT; + goto out; + } + } + rv = -EINVAL; + goto out; + } + + if (name) { + if (copy_ucs2_to_user_len(getnextvariablename.variable_name, + name, prev_name_size)) { + rv = -EFAULT; + goto out; + } + } + + if (ns) { + if (put_user(*ns, getnextvariablename.variable_name_size)) { + rv = -EFAULT; + goto out; + } + } + + if (vd) { + if (copy_to_user(getnextvariablename.vendor_guid, vd, + sizeof(efi_guid_t))) + rv = -EFAULT; + } + +out: + kfree(name); + return rv; +} + +static long efi_runtime_get_nexthighmonocount(unsigned long arg) +{ + struct efi_getnexthighmonotoniccount __user *getnexthighmonocount_user; + struct efi_getnexthighmonotoniccount getnexthighmonocount; + efi_status_t status; + u32 count; + + getnexthighmonocount_user = (struct + efi_getnexthighmonotoniccount __user *)arg; + + if (copy_from_user(&getnexthighmonocount, + getnexthighmonocount_user, + sizeof(getnexthighmonocount))) + return -EFAULT; + + status = efi.get_next_high_mono_count( + getnexthighmonocount.high_count ? 
&count : NULL); + + if (put_user(status, getnexthighmonocount.status)) + return -EFAULT; + + if (status != EFI_SUCCESS) + return -EINVAL; + + if (getnexthighmonocount.high_count && + put_user(count, getnexthighmonocount.high_count)) + return -EFAULT; + + return 0; +} + +static long efi_runtime_query_variableinfo(unsigned long arg) +{ + struct efi_queryvariableinfo __user *queryvariableinfo_user; + struct efi_queryvariableinfo queryvariableinfo; + efi_status_t status; + u64 max_storage, remaining, max_size; + + queryvariableinfo_user = (struct efi_queryvariableinfo __user *)arg; + + if (copy_from_user(&queryvariableinfo, queryvariableinfo_user, + sizeof(queryvariableinfo))) + return -EFAULT; + + status = efi.query_variable_info(queryvariableinfo.attributes, + &max_storage, &remaining, &max_size); + + if (put_user(status, queryvariableinfo.status)) + return -EFAULT; + + if (status != EFI_SUCCESS) + return -EINVAL; + + if (put_user(max_storage, + queryvariableinfo.maximum_variable_storage_size)) + return -EFAULT; + + if (put_user(remaining, + queryvariableinfo.remaining_variable_storage_size)) + return -EFAULT; + + if (put_user(max_size, queryvariableinfo.maximum_variable_size)) + return -EFAULT; + + return 0; +} + +static long efi_runtime_query_capsulecaps(unsigned long arg) +{ + struct efi_querycapsulecapabilities __user *qcaps_user; + struct efi_querycapsulecapabilities qcaps; + efi_capsule_header_t *capsules; + efi_status_t status; + u64 max_size; + int i, reset_type; + int rv = 0; + + qcaps_user = (struct efi_querycapsulecapabilities __user *)arg; + + if (copy_from_user(&qcaps, qcaps_user, sizeof(qcaps))) + return -EFAULT; + + capsules = kcalloc(qcaps.capsule_count + 1, + sizeof(efi_capsule_header_t), GFP_KERNEL); + if (!capsules) + return -ENOMEM; + + for (i = 0; i < qcaps.capsule_count; i++) { + efi_capsule_header_t *c; + /* + * We cannot dereference qcaps.capsule_header_array directly to + * obtain the address of the capsule as it resides in the + * user space + */ + if (get_user(c, qcaps.capsule_header_array + i)) { + rv = -EFAULT; + goto out; + } + if (copy_from_user(&capsules[i], c, + sizeof(efi_capsule_header_t))) { + rv = -EFAULT; + goto out; + } + } + + qcaps.capsule_header_array = &capsules; + + status = efi.query_capsule_caps((efi_capsule_header_t **) + qcaps.capsule_header_array, + qcaps.capsule_count, + &max_size, &reset_type); + + if (put_user(status, qcaps.status)) { + rv = -EFAULT; + goto out; + } + + if (status != EFI_SUCCESS) { + rv = -EINVAL; + goto out; + } + + if (put_user(max_size, qcaps.maximum_capsule_size)) { + rv = -EFAULT; + goto out; + } + + if (put_user(reset_type, qcaps.reset_type)) + rv = -EFAULT; + +out: + kfree(capsules); + return rv; +} + +static long efi_test_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case EFI_RUNTIME_GET_VARIABLE: + return efi_runtime_get_variable(arg); + + case EFI_RUNTIME_SET_VARIABLE: + return efi_runtime_set_variable(arg); + + case EFI_RUNTIME_GET_TIME: + return efi_runtime_get_time(arg); + + case EFI_RUNTIME_SET_TIME: + return efi_runtime_set_time(arg); + + case EFI_RUNTIME_GET_WAKETIME: + return efi_runtime_get_waketime(arg); + + case EFI_RUNTIME_SET_WAKETIME: + return efi_runtime_set_waketime(arg); + + case EFI_RUNTIME_GET_NEXTVARIABLENAME: + return efi_runtime_get_nextvariablename(arg); + + case EFI_RUNTIME_GET_NEXTHIGHMONOTONICCOUNT: + return efi_runtime_get_nexthighmonocount(arg); + + case EFI_RUNTIME_QUERY_VARIABLEINFO: + return efi_runtime_query_variableinfo(arg); + + case 
EFI_RUNTIME_QUERY_CAPSULECAPABILITIES: + return efi_runtime_query_capsulecaps(arg); + } + + return -ENOTTY; +} + +static int efi_test_open(struct inode *inode, struct file *file) +{ + /* + * nothing special to do here + * We do accept multiple open files at the same time as we + * synchronize on the per call operation. + */ + return 0; +} + +static int efi_test_close(struct inode *inode, struct file *file) +{ + return 0; +} + +/* + * The various file operations we support. + */ +static const struct file_operations efi_test_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = efi_test_ioctl, + .open = efi_test_open, + .release = efi_test_close, + .llseek = no_llseek, +}; + +static struct miscdevice efi_test_dev = { + MISC_DYNAMIC_MINOR, + "efi_test", + &efi_test_fops +}; + +static int __init efi_test_init(void) +{ + int ret; + + ret = misc_register(&efi_test_dev); + if (ret) { + pr_err("efi_test: can't misc_register on minor=%d\n", + MISC_DYNAMIC_MINOR); + return ret; + } + + return 0; +} + +static void __exit efi_test_exit(void) +{ + misc_deregister(&efi_test_dev); +} + +module_init(efi_test_init); +module_exit(efi_test_exit); diff --git a/drivers/firmware/efi/test/efi_test.h b/drivers/firmware/efi/test/efi_test.h new file mode 100644 index 000000000000..a33a6c633852 --- /dev/null +++ b/drivers/firmware/efi/test/efi_test.h @@ -0,0 +1,110 @@ +/* + * EFI Test driver Header + * + * Copyright(C) 2012-2016 Canonical Ltd. + * + */ + +#ifndef _DRIVERS_FIRMWARE_EFI_TEST_H_ +#define _DRIVERS_FIRMWARE_EFI_TEST_H_ + +#include + +struct efi_getvariable { + efi_char16_t *variable_name; + efi_guid_t *vendor_guid; + u32 *attributes; + unsigned long *data_size; + void *data; + efi_status_t *status; +} __packed; + +struct efi_setvariable { + efi_char16_t *variable_name; + efi_guid_t *vendor_guid; + u32 attributes; + unsigned long data_size; + void *data; + efi_status_t *status; +} __packed; + +struct efi_getnextvariablename { + unsigned long *variable_name_size; + efi_char16_t *variable_name; + efi_guid_t *vendor_guid; + efi_status_t *status; +} __packed; + +struct efi_queryvariableinfo { + u32 attributes; + u64 *maximum_variable_storage_size; + u64 *remaining_variable_storage_size; + u64 *maximum_variable_size; + efi_status_t *status; +} __packed; + +struct efi_gettime { + efi_time_t *time; + efi_time_cap_t *capabilities; + efi_status_t *status; +} __packed; + +struct efi_settime { + efi_time_t *time; + efi_status_t *status; +} __packed; + +struct efi_getwakeuptime { + efi_bool_t *enabled; + efi_bool_t *pending; + efi_time_t *time; + efi_status_t *status; +} __packed; + +struct efi_setwakeuptime { + efi_bool_t enabled; + efi_time_t *time; + efi_status_t *status; +} __packed; + +struct efi_getnexthighmonotoniccount { + u32 *high_count; + efi_status_t *status; +} __packed; + +struct efi_querycapsulecapabilities { + efi_capsule_header_t **capsule_header_array; + unsigned long capsule_count; + u64 *maximum_capsule_size; + int *reset_type; + efi_status_t *status; +} __packed; + +#define EFI_RUNTIME_GET_VARIABLE \ + _IOWR('p', 0x01, struct efi_getvariable) +#define EFI_RUNTIME_SET_VARIABLE \ + _IOW('p', 0x02, struct efi_setvariable) + +#define EFI_RUNTIME_GET_TIME \ + _IOR('p', 0x03, struct efi_gettime) +#define EFI_RUNTIME_SET_TIME \ + _IOW('p', 0x04, struct efi_settime) + +#define EFI_RUNTIME_GET_WAKETIME \ + _IOR('p', 0x05, struct efi_getwakeuptime) +#define EFI_RUNTIME_SET_WAKETIME \ + _IOW('p', 0x06, struct efi_setwakeuptime) + +#define EFI_RUNTIME_GET_NEXTVARIABLENAME \ + _IOWR('p', 0x07, struct 
efi_getnextvariablename) + +#define EFI_RUNTIME_QUERY_VARIABLEINFO \ + _IOR('p', 0x08, struct efi_queryvariableinfo) + +#define EFI_RUNTIME_GET_NEXTHIGHMONOTONICCOUNT \ + _IOR('p', 0x09, struct efi_getnexthighmonotoniccount) + +#define EFI_RUNTIME_QUERY_CAPSULECAPABILITIES \ + _IOR('p', 0x0A, struct efi_querycapsulecapabilities) + +#endif /* _DRIVERS_FIRMWARE_EFI_TEST_H_ */ From cb82cce7035ec22a69ab3bd4d2fe6729527ce1ca Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 25 Aug 2016 18:17:09 +0200 Subject: [PATCH 247/538] efi/arm64: Treat regions with WT/WC set but WB cleared as memory Currently, memory regions are only recorded in the memblock memory table if they have the EFI_MEMORY_WB memory type attribute set. In case the region is of a reserved type, it is also marked as MEMBLOCK_NOMAP, which will leave it out of the linear mapping. However, memory regions may legally have the EFI_MEMORY_WT or EFI_MEMORY_WC attributes set, and the EFI_MEMORY_WB cleared, in which case the region in question is obviously backed by normal memory, but is not recorded in the memblock memory table at all. Since it would be useful to be able to identify any UEFI reported memory region using memblock_is_memory(), it makes sense to add all memory to the memblock memory table, and simply mark it as MEMBLOCK_NOMAP if it lacks the EFI_MEMORY_WB attribute. While implementing this, let's refactor the code slightly to make it easier to understand: replace is_normal_ram() with is_memory(), and make it return true for each region that has any of the WB|WT|WC bits set. (This follows the AArch64 bindings in the UEFI spec, which state that those are the attributes that map to normal memory) Also, replace is_reserve_region() with is_usable_memory(), and only invoke it if the region in question was identified as memory by is_memory() in the first place. The net result is the same (only reserved regions that are backed by memory end up in the memblock memory table with the MEMBLOCK_NOMAP flag set) but carried out in a more straightforward way. Finally, we remove the trailing asterisk in the EFI debug output. Keeping it clutters the code, and it serves no real purpose now that we no longer temporarily reserve BootServices code and data regions like we did in the early days of EFI support on arm64 Linux (which it inherited from the x86 implementation) Signed-off-by: Ard Biesheuvel Reviewed-by: Leif Lindholm Tested-by: James Morse Reviewed-by: James Morse Signed-off-by: Matt Fleming --- drivers/firmware/efi/arm-init.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index e0a511d4074f..8efe13075c92 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -26,9 +26,9 @@ u64 efi_system_table; -static int __init is_normal_ram(efi_memory_desc_t *md) +static int __init is_memory(efi_memory_desc_t *md) { - if (md->attribute & EFI_MEMORY_WB) + if (md->attribute & (EFI_MEMORY_WB|EFI_MEMORY_WT|EFI_MEMORY_WC)) return 1; return 0; } @@ -152,9 +152,9 @@ static int __init uefi_init(void) } /* - * Return true for RAM regions we want to permanently reserve. + * Return true for regions that can be used as System RAM. 
*/ -static __init int is_reserve_region(efi_memory_desc_t *md) +static __init int is_usable_memory(efi_memory_desc_t *md) { switch (md->type) { case EFI_LOADER_CODE: @@ -163,18 +163,22 @@ static __init int is_reserve_region(efi_memory_desc_t *md) case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: case EFI_PERSISTENT_MEMORY: - return 0; + /* + * According to the spec, these regions are no longer reserved + * after calling ExitBootServices(). However, we can only use + * them as System RAM if they can be mapped writeback cacheable. + */ + return (md->attribute & EFI_MEMORY_WB); default: break; } - return is_normal_ram(md); + return false; } static __init void reserve_regions(void) { efi_memory_desc_t *md; u64 paddr, npages, size; - int resv; if (efi_enabled(EFI_DBG)) pr_info("Processing EFI memory map:\n"); @@ -191,25 +195,23 @@ static __init void reserve_regions(void) paddr = md->phys_addr; npages = md->num_pages; - resv = is_reserve_region(md); if (efi_enabled(EFI_DBG)) { char buf[64]; - pr_info(" 0x%012llx-0x%012llx %s%s\n", + pr_info(" 0x%012llx-0x%012llx %s\n", paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, - efi_md_typeattr_format(buf, sizeof(buf), md), - resv ? "*" : ""); + efi_md_typeattr_format(buf, sizeof(buf), md)); } memrange_efi_to_native(&paddr, &npages); size = npages << PAGE_SHIFT; - if (is_normal_ram(md)) + if (is_memory(md)) { early_init_dt_add_memory_arch(paddr, size); - if (resv) - memblock_mark_nomap(paddr, size); - + if (!is_usable_memory(md)) + memblock_mark_nomap(paddr, size); + } } } From 20ebc15e6c8f9772804fa10110bf074a7b1d25fa Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 25 Aug 2016 11:34:03 +0200 Subject: [PATCH 248/538] x86/efi: Use kmalloc_array() in efi_call_phys_prolog() * A multiplication for the size determination of a memory allocation indicated that an array data structure should be processed. Thus reuse the corresponding function "kmalloc_array". This issue was detected by using the Coccinelle software. * Replace the specification of a data type by a pointer dereference to make the corresponding size determination a bit safer according to the Linux coding style convention. Signed-off-by: Markus Elfring Reviewed-by: Paolo Bonzini Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Julia Lawall Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index e1ca71259468..d65cdadaa6b6 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -85,7 +85,7 @@ pgd_t * __init efi_call_phys_prolog(void) early_code_mapping_set_exec(1); n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); - save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL); + save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL); for (pgd = 0; pgd < n_pgds; pgd++) { save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE); From 27571616385af9c2d6a3e570b06baf86f5aa04b1 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 6 Sep 2016 08:05:32 +0200 Subject: [PATCH 249/538] x86/efi: Optimize away setup_gop32/64 if unused Commit 2c23b73c2d02 ("x86/efi: Prepare GOP handling code for reuse as generic code") introduced an efi_is_64bit() macro to x86 which previously only existed for arm arches. The macro is used to choose between the 64 bit or 32 bit code path in gop.c at runtime. 
However the code path that's going to be taken is known at compile time when compiling for x86_32 or for x86_64 with mixed mode disabled. Amend the macro to eliminate the unused code path in those cases. Size of gop.o text section: CONFIG_X86_32: 1758 before, 1299 after CONFIG_X86_64 && !CONFIG_EFI_MIXED: 2201 before, 1406 after CONFIG_X86_64 && CONFIG_EFI_MIXED: 2201 before and after Signed-off-by: Lukas Wunner Reviewed-by: Ard Biesheuvel Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 4630e2bfa8fb..f14655e7726a 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -206,14 +206,23 @@ struct efi_config { __pure const struct efi_config *__efi_early(void); +static inline bool efi_is_64bit(void) +{ + if (!IS_ENABLED(CONFIG_X86_64)) + return false; + + if (!IS_ENABLED(CONFIG_EFI_MIXED)) + return true; + + return __efi_early()->is64; +} + #define efi_call_early(f, ...) \ __efi_early()->call(__efi_early()->f, __VA_ARGS__); #define __efi_call_early(f, ...) \ __efi_early()->call((unsigned long)f, __VA_ARGS__); -#define efi_is_64bit() __efi_early()->is64 - extern bool efi_reboot_required(void); #else From 0a637ee61247bd4bed9b2a07568ef7a1cfc76187 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 22 Aug 2016 12:01:21 +0200 Subject: [PATCH 250/538] x86/efi: Allow invocation of arbitrary boot services We currently allow invocation of 8 boot services with efi_call_early(). Not included are LocateHandleBuffer and LocateProtocol in particular. For graphics output or to retrieve PCI ROMs and Apple device properties, we're thus forced to use the LocateHandle + AllocatePool + LocateHandle combo, which is cumbersome and needs more code. The ARM folks allow invocation of the full set of boot services but are restricted to our 8 boot services in functions shared across arches. Thus, rather than adding just LocateHandleBuffer and LocateProtocol to struct efi_config, let's rework efi_call_early() to allow invocation of arbitrary boot services by selecting the 64 bit vs 32 bit code path in the macro itself. When compiling for 32 bit or for 64 bit without mixed mode, the unused code path is optimized away and the binary code is the same as before. But on 64 bit with mixed mode enabled, this commit adds one compare instruction to each invocation of a boot service and, depending on the code path selected, two jump instructions. (Most of the time gcc arranges the jumps in the 32 bit code path.) The result is a minuscule performance penalty and the binary code becomes slightly larger and more difficult to read when disassembled. This isn't a hot path, so these drawbacks are arguably outweighed by the attainable simplification of the C code. We have some overhead anyway for thunking or conversion between calling conventions. The 8 boot services can consequently be removed from struct efi_config. No functional change intended (for now). Example -- invocation of free_pool before (64 bit code path): 0x2d4 movq %ds:efi_early, %rdx ; efi_early 0x2db movq %ss:arg_0-0x20(%rsp), %rsi 0x2e0 xorl %eax, %eax 0x2e2 movq %ds:0x28(%rdx), %rdi ; efi_early->free_pool 0x2e6 callq *%ds:0x58(%rdx) ; efi_early->call() Example -- invocation of free_pool after (64 / 32 bit mixed code path): 0x0dc movq %ds:efi_early, %rax ; efi_early 0x0e3 cmpb $0, %ds:0x28(%rax) ; !efi_early->is64 ? 
0x0e7 movq %ds:0x20(%rax), %rdx ; efi_early->call() 0x0eb movq %ds:0x10(%rax), %rax ; efi_early->boot_services 0x0ef je $0x150 0x0f1 movq %ds:0x48(%rax), %rdi ; free_pool (64 bit) 0x0f5 xorl %eax, %eax 0x0f7 callq *%rdx ... 0x150 movl %ds:0x30(%rax), %edi ; free_pool (32 bit) 0x153 jmp $0x0f5 Size of eboot.o text section: CONFIG_X86_32: 6464 before, 6318 after CONFIG_X86_64 && !CONFIG_EFI_MIXED: 7670 before, 7573 after CONFIG_X86_64 && CONFIG_EFI_MIXED: 7670 before, 8319 after Signed-off-by: Lukas Wunner Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 13 +------------ arch/x86/boot/compressed/head_32.S | 6 +++--- arch/x86/boot/compressed/head_64.S | 8 ++++---- arch/x86/include/asm/efi.h | 15 ++++++--------- 4 files changed, 14 insertions(+), 28 deletions(-) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index f7fc85bf8221..447a6a2df5ae 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -29,22 +29,11 @@ __pure const struct efi_config *__efi_early(void) static void setup_boot_services##bits(struct efi_config *c) \ { \ efi_system_table_##bits##_t *table; \ - efi_boot_services_##bits##_t *bt; \ \ table = (typeof(table))sys_table; \ \ + c->boot_services = table->boottime; \ c->text_output = table->con_out; \ - \ - bt = (typeof(bt))(unsigned long)(table->boottime); \ - \ - c->allocate_pool = bt->allocate_pool; \ - c->allocate_pages = bt->allocate_pages; \ - c->get_memory_map = bt->get_memory_map; \ - c->free_pool = bt->free_pool; \ - c->free_pages = bt->free_pages; \ - c->locate_handle = bt->locate_handle; \ - c->handle_protocol = bt->handle_protocol; \ - c->exit_boot_services = bt->exit_boot_services; \ } BOOT_SERVICES(32); BOOT_SERVICES(64); diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 1038524270e7..fd0b6a272dd5 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -82,7 +82,7 @@ ENTRY(efi_pe_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 88(%eax) + add %esi, 32(%eax) pushl %eax call make_boot_params @@ -108,7 +108,7 @@ ENTRY(efi32_stub_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 88(%eax) + add %esi, 32(%eax) pushl %eax 2: call efi_main @@ -264,7 +264,7 @@ relocated: #ifdef CONFIG_EFI_STUB .data efi32_config: - .fill 11,8,0 + .fill 4,8,0 .long efi_call_phys .long 0 .byte 0 diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 0d80a7ad65cd..efdfba21a5b2 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -265,7 +265,7 @@ ENTRY(efi_pe_entry) /* * Relocate efi_config->call(). */ - addq %rbp, efi64_config+88(%rip) + addq %rbp, efi64_config+32(%rip) movq %rax, %rdi call make_boot_params @@ -285,7 +285,7 @@ handover_entry: * Relocate efi_config->call(). 
*/ movq efi_config(%rip), %rax - addq %rbp, 88(%rax) + addq %rbp, 32(%rax) 2: movq efi_config(%rip), %rdi call efi_main @@ -457,14 +457,14 @@ efi_config: #ifdef CONFIG_EFI_MIXED .global efi32_config efi32_config: - .fill 11,8,0 + .fill 4,8,0 .quad efi64_thunk .byte 0 #endif .global efi64_config efi64_config: - .fill 11,8,0 + .fill 4,8,0 .quad efi_call .byte 1 #endif /* CONFIG_EFI_STUB */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index f14655e7726a..389d700b961e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -191,14 +191,7 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( struct efi_config { u64 image_handle; u64 table; - u64 allocate_pool; - u64 allocate_pages; - u64 get_memory_map; - u64 free_pool; - u64 free_pages; - u64 locate_handle; - u64 handle_protocol; - u64 exit_boot_services; + u64 boot_services; u64 text_output; efi_status_t (*call)(unsigned long, ...); bool is64; @@ -218,7 +211,11 @@ static inline bool efi_is_64bit(void) } #define efi_call_early(f, ...) \ - __efi_early()->call(__efi_early()->f, __VA_ARGS__); + __efi_early()->call(efi_is_64bit() ? \ + ((efi_boot_services_64_t *)(unsigned long) \ + __efi_early()->boot_services)->f : \ + ((efi_boot_services_32_t *)(unsigned long) \ + __efi_early()->boot_services)->f, __VA_ARGS__) #define __efi_call_early(f, ...) \ __efi_early()->call((unsigned long)f, __VA_ARGS__); From de58af878d9146e5decc0cdd7acabaa82881cbe4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Sep 2016 10:29:05 +0200 Subject: [PATCH 251/538] Revert "sched/fair: Make update_min_vruntime() more readable" There's a bug in this commit: 97a7142f157a ("sched/fair: Make update_min_vruntime() more readable") ... when !rb_leftmost && curr we fail to advance min_vruntime. So revert it. Reported-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a6820b3771e2..986c10c25176 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -464,17 +464,20 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = cfs_rq->min_vruntime; + if (cfs_rq->curr) + vruntime = cfs_rq->curr->vruntime; + if (cfs_rq->rb_leftmost) { struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, struct sched_entity, run_node); - vruntime = se->vruntime; + if (!cfs_rq->curr) + vruntime = se->vruntime; + else + vruntime = min_vruntime(vruntime, se->vruntime); } - if (cfs_rq->curr) - vruntime = min_vruntime(vruntime, cfs_rq->curr->vruntime); - /* ensure we never gain time by being placed backwards. */ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); #ifndef CONFIG_64BIT From b95202a3b6bb8715a716dbdb15cdb82bf622260b Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Fri, 9 Sep 2016 19:45:17 +0200 Subject: [PATCH 252/538] sched/deadline: Document behavior of sched_yield() This is a documentation only patch, explaining the behavior of sched_yield() when a SCHED_DEADLINE task calls it (give up remaining runtime and be throttled until next period begins). 
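For readers unfamiliar with the interface, the intended usage pattern looks roughly like the sketch below. It is illustrative only: the 10 ms runtime / 100 ms period values are arbitrary, do_work() is a placeholder for the periodic job, and struct sched_attr is declared locally because glibc does not wrap sched_setattr() (on older C libraries the __NR_sched_setattr number may have to be supplied by hand).

    #define _GNU_SOURCE
    #include <stdint.h>
    #include <sched.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    struct sched_attr {
            uint32_t size;
            uint32_t sched_policy;
            uint64_t sched_flags;
            int32_t  sched_nice;
            uint32_t sched_priority;
            uint64_t sched_runtime;
            uint64_t sched_deadline;
            uint64_t sched_period;
    };

    #ifndef SCHED_DEADLINE
    #define SCHED_DEADLINE 6
    #endif

    static void do_work(void)
    {
            /* the periodic job body goes here */
    }

    int main(void)
    {
            struct sched_attr attr = {
                    .size           = sizeof(attr),
                    .sched_policy   = SCHED_DEADLINE,
                    .sched_runtime  = 10 * 1000 * 1000,     /*  10 ms */
                    .sched_deadline = 100 * 1000 * 1000,    /* 100 ms */
                    .sched_period   = 100 * 1000 * 1000,    /* 100 ms */
            };

            if (syscall(SYS_sched_setattr, 0, &attr, 0))
                    return 1;

            for (;;) {
                    do_work();      /* consume part of the 10 ms budget */
                    sched_yield();  /* give up the leftover runtime; the task
                                       is throttled until the next period */
            }
    }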
Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Luca Abeni Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-dl@retis.sssup.it Link: http://lkml.kernel.org/r/1473443117-11794-2-git-send-email-tommaso.cucinotta@sssup.it Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-deadline.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index 53a2fe1ae8b8..8e37b0ba2c9d 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -16,6 +16,7 @@ CONTENTS 4.1 System-wide settings 4.2 Task interface 4.3 Default behavior + 4.4 Behavior of sched_yield() 5. Tasks CPU affinity 5.1 SCHED_DEADLINE and cpusets HOWTO 6. Future plans @@ -426,6 +427,23 @@ CONTENTS Finally, notice that in order not to jeopardize the admission control a -deadline task cannot fork. + +4.4 Behavior of sched_yield() +----------------------------- + + When a SCHED_DEADLINE task calls sched_yield(), it gives up its + remaining runtime and is immediately throttled, until the next + period, when its runtime will be replenished (a special flag + dl_yielded is set and used to handle correctly throttling and runtime + replenishment after a call to sched_yield()). + + This behavior of sched_yield() allows the task to wake-up exactly at + the beginning of the next period. Also, this may be useful in the + future with bandwidth reclaiming mechanisms, where sched_yield() will + make the leftoever runtime available for reclamation by other + SCHED_DEADLINE tasks. + + 5. Tasks CPU affinity ===================== From 43ba588346455dcc984dc98a49af1c2eb1e9aa75 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 7 Sep 2016 19:25:37 -0700 Subject: [PATCH 253/538] Input: silead_gsl1680 - document firmware-name, fix implementation The driver has supported touchscreen-fw-name to specify the firmware to load since it has been merged, but this was omitted from the dt-binding documentation. During review of adding touchscreen-fw-name to the binding documentation it was brought up that there is a standard property name called "firmware-name" for this, which should be used. Since there are no users of touchscreen-fw-name yet, this commit adds documentation of "firmware-name" to the dt-binding documentation and switches the driver over to use this. This commit also makes the driver add a "silead/" prefix to the firmware name from dt before calling request_firmware. That the firmware files are stored under /lib/firmware/silead under Linux is an implementation detail and does not belong in devicetree. 
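A board DTS fragment using the new property could then look like this (the node address, interrupt and GPIO wiring, screen size and the firmware file name are made-up placeholders):

    touchscreen@40 {
            compatible = "silead,gsl1680";
            reg = <0x40>;
            interrupt-parent = <&gpio1>;
            interrupts = <3 IRQ_TYPE_EDGE_FALLING>;
            power-gpios = <&gpio1 4 GPIO_ACTIVE_HIGH>;
            firmware-name = "gsl1680-example-board.fw";
            touchscreen-size-x = <960>;
            touchscreen-size-y = <640>;
    };

With this patch the driver then requests "silead/gsl1680-example-board.fw", i.e. the file is expected at /lib/firmware/silead/gsl1680-example-board.fw.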
Signed-off-by: Hans de Goede Acked-by: Rob Herring Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/touchscreen/silead_gsl1680.txt | 1 + drivers/input/touchscreen/silead.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt b/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt index 1112e0d794e1..820fee4b77b6 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt +++ b/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt @@ -13,6 +13,7 @@ Required properties: - touchscreen-size-y : See touchscreen.txt Optional properties: +- firmware-name : File basename (string) for board specific firmware - touchscreen-inverted-x : See touchscreen.txt - touchscreen-inverted-y : See touchscreen.txt - touchscreen-swapped-x-y : See touchscreen.txt diff --git a/drivers/input/touchscreen/silead.c b/drivers/input/touchscreen/silead.c index b2744a64e933..c7ab116a16b3 100644 --- a/drivers/input/touchscreen/silead.c +++ b/drivers/input/touchscreen/silead.c @@ -390,9 +390,10 @@ static void silead_ts_read_props(struct i2c_client *client) data->max_fingers = 5; /* Most devices handle up-to 5 fingers */ } - error = device_property_read_string(dev, "touchscreen-fw-name", &str); + error = device_property_read_string(dev, "firmware-name", &str); if (!error) - snprintf(data->fw_name, sizeof(data->fw_name), "%s", str); + snprintf(data->fw_name, sizeof(data->fw_name), + "silead/%s", str); else dev_dbg(dev, "Firmware file name read error. Using default."); } From 4af2ff91ec3f42b538a65cf12df5f9faf6aaa914 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 7 Sep 2016 19:32:14 -0700 Subject: [PATCH 254/538] Input: silead_gsl1680 - use "silead/" prefix for firmware loading The silead touch-controller ICs use a different firmware per digitizer / tablet model. So there are going to be quite a few of then and they really should be under a separate subdir. This commit prefixes the default firmware names with "silead/" just like we are already doing for devicetree specified firmware names. 
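For example (device IDs shown for illustration only; the ACPI ID is lower-cased by the driver before the name is built):

    ACPI enumeration, ID "MSSL1680"  ->  firmware "silead/mssl1680.fw"
    I2C enumeration,  ID "gsl1680"   ->  firmware "silead/gsl1680.fw"

Both resolve to files under /lib/firmware/silead/ on a standard installation.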
Signed-off-by: Hans de Goede Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/silead.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/input/touchscreen/silead.c b/drivers/input/touchscreen/silead.c index c7ab116a16b3..f502c8488be8 100644 --- a/drivers/input/touchscreen/silead.c +++ b/drivers/input/touchscreen/silead.c @@ -411,14 +411,14 @@ static int silead_ts_set_default_fw_name(struct silead_ts_data *data, if (!acpi_id) return -ENODEV; - snprintf(data->fw_name, sizeof(data->fw_name), "%s.fw", - acpi_id->id); + snprintf(data->fw_name, sizeof(data->fw_name), + "silead/%s.fw", acpi_id->id); for (i = 0; i < strlen(data->fw_name); i++) data->fw_name[i] = tolower(data->fw_name[i]); } else { - snprintf(data->fw_name, sizeof(data->fw_name), "%s.fw", - id->name); + snprintf(data->fw_name, sizeof(data->fw_name), + "silead/%s.fw", id->name); } return 0; @@ -427,7 +427,8 @@ static int silead_ts_set_default_fw_name(struct silead_ts_data *data, static int silead_ts_set_default_fw_name(struct silead_ts_data *data, const struct i2c_device_id *id) { - snprintf(data->fw_name, sizeof(data->fw_name), "%s.fw", id->name); + snprintf(data->fw_name, sizeof(data->fw_name), + "silead/%s.fw", id->name); return 0; } #endif From d9c149d6ce1a94de578a4e323f6881fcb6b986ab Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Sat, 10 Sep 2016 23:40:45 +0800 Subject: [PATCH 255/538] x86/ioapic: Ignore root bridges without a companion ACPI device Some PCI root bridges don't have a corresponding ACPI device. This can be the case on some old platforms. Don't call acpi_ioapic_add() on these bridges because they can't support ioapic hotplug. Reported-and-tested-by: Borislav Petkov Signed-off-by: Rui Wang Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhelgaas@google.com Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1473522046-31329-1-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/pci/setup-bus.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index ec538d3d2bd5..f30ca75b5b6c 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1855,7 +1855,10 @@ void __init pci_assign_unassigned_resources(void) list_for_each_entry(root_bus, &pci_root_buses, node) { pci_assign_unassigned_root_bus_resources(root_bus); - acpi_ioapic_add(ACPI_HANDLE(root_bus->bridge)); + + /* Make sure the root bridge has a companion ACPI device: */ + if (ACPI_HANDLE(root_bus->bridge)) + acpi_ioapic_add(ACPI_HANDLE(root_bus->bridge)); } } From cf1e929c8a389bc0be63f86100f962217ea99455 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 8 Sep 2016 14:17:00 +0200 Subject: [PATCH 256/538] clocksource/drivers/moxart: Replace setup_irq by request_irq Save memory space and line of code by replacing setup_irq by request_irq. 
Signed-off-by: Daniel Lezcano Acked-by: Joel Stanley --- drivers/clocksource/moxart_timer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/clocksource/moxart_timer.c b/drivers/clocksource/moxart_timer.c index ad2bead9ce45..cba25b75c29b 100644 --- a/drivers/clocksource/moxart_timer.c +++ b/drivers/clocksource/moxart_timer.c @@ -79,7 +79,6 @@ struct moxart_timer { unsigned int t1_enable_val; unsigned int count_per_tick; struct clock_event_device clkevt; - struct irqaction act; }; static inline struct moxart_timer *to_moxart(struct clock_event_device *evt) @@ -201,10 +200,6 @@ static int __init moxart_timer_init(struct device_node *node) timer->clkevt.set_next_event = moxart_clkevt_next_event; timer->clkevt.cpumask = cpumask_of(0); timer->clkevt.irq = irq; - timer->act.name = node->name; - timer->act.flags = IRQF_TIMER; - timer->act.handler = moxart_timer_interrupt; - timer->act.dev_id = &timer->clkevt; ret = clocksource_mmio_init(timer->base + TIMER2_BASE + REG_COUNT, "moxart_timer", pclk, 200, 32, @@ -214,7 +209,8 @@ static int __init moxart_timer_init(struct device_node *node) return ret; } - ret = setup_irq(irq, &timer->act); + ret = request_irq(irq, moxart_timer_interrupt, IRQF_TIMER, + node->name, &timer->clkevt); if (ret) { pr_err("%s: setup_irq failed\n", node->full_name); return ret; From e2a2d38501cb759333342d97442b3742830752ca Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 8 Sep 2016 14:25:40 +0200 Subject: [PATCH 257/538] clocksource/drivers/moxart: Replace panic by pr_err The clksrc-of code is supposed to catch the return code and fail gracefully. Don't panic on error, but print the error and exit with a relevant error code. Signed-off-by: Daniel Lezcano Acked-by: Joel Stanley --- drivers/clocksource/moxart_timer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/moxart_timer.c b/drivers/clocksource/moxart_timer.c index cba25b75c29b..2a8f4705c734 100644 --- a/drivers/clocksource/moxart_timer.c +++ b/drivers/clocksource/moxart_timer.c @@ -184,8 +184,10 @@ static int __init moxart_timer_init(struct device_node *node) } else if (of_device_is_compatible(node, "aspeed,ast2400-timer")) { timer->t1_enable_val = ASPEED_TIMER1_ENABLE; timer->t1_disable_val = ASPEED_TIMER1_DISABLE; - } else - panic("%s: unknown platform\n", node->full_name); + } else { + pr_err("%s: unknown platform\n", node->full_name); + return -EINVAL; + } timer->count_per_tick = DIV_ROUND_CLOSEST(pclk, HZ); From a17686c462441a01973c18c4c47986a320a5ebe3 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 9 Sep 2016 13:13:48 +0200 Subject: [PATCH 258/538] clocksource/drivers/timer-atmel-pit: Drop at91sam926x_pit_common_init Merge at91sam926x_pit_common_init in at91sam926x_pit_dt_init as this is the only initialization method now. Acked-by: Nicolas Ferre Signed-off-by: Alexandre Belloni Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-atmel-pit.c | 79 ++++++++++++--------------- 1 file changed, 34 insertions(+), 45 deletions(-) diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c index 7f0f5b26d8c5..da7e6d4eef4d 100644 --- a/drivers/clocksource/timer-atmel-pit.c +++ b/drivers/clocksource/timer-atmel-pit.c @@ -177,11 +177,41 @@ static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id) /* * Set up both clocksource and clockevent support. 
*/ -static int __init at91sam926x_pit_common_init(struct pit_data *data) +static int __init at91sam926x_pit_dt_init(struct device_node *node) { - unsigned long pit_rate; - unsigned bits; - int ret; + unsigned long pit_rate; + unsigned bits; + int ret; + struct pit_data *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->base = of_iomap(node, 0); + if (!data->base) { + pr_err("Could not map PIT address\n"); + return -ENXIO; + } + + data->mck = of_clk_get(node, 0); + if (IS_ERR(data->mck)) { + pr_err("Unable to get mck clk\n"); + return PTR_ERR(data->mck); + } + + ret = clk_prepare_enable(data->mck); + if (ret) { + pr_err("Unable to enable mck\n"); + return ret; + } + + /* Get the interrupts property */ + data->irq = irq_of_parse_and_map(node, 0); + if (!data->irq) { + pr_err("Unable to get IRQ from DT\n"); + return -EINVAL; + } /* * Use our actual MCK to figure out how many MCK/16 ticks per @@ -236,46 +266,5 @@ static int __init at91sam926x_pit_common_init(struct pit_data *data) return 0; } - -static int __init at91sam926x_pit_dt_init(struct device_node *node) -{ - struct pit_data *data; - int ret; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->base = of_iomap(node, 0); - if (!data->base) { - pr_err("Could not map PIT address\n"); - return -ENXIO; - } - - data->mck = of_clk_get(node, 0); - if (IS_ERR(data->mck)) - /* Fallback on clkdev for !CCF-based boards */ - data->mck = clk_get(NULL, "mck"); - - if (IS_ERR(data->mck)) { - pr_err("Unable to get mck clk\n"); - return PTR_ERR(data->mck); - } - - ret = clk_prepare_enable(data->mck); - if (ret) { - pr_err("Unable to enable mck\n"); - return ret; - } - - /* Get the interrupts property */ - data->irq = irq_of_parse_and_map(node, 0); - if (!data->irq) { - pr_err("Unable to get IRQ from DT\n"); - return -EINVAL; - } - - return at91sam926x_pit_common_init(data); -} CLOCKSOURCE_OF_DECLARE(at91sam926x_pit, "atmel,at91sam9260-pit", at91sam926x_pit_dt_init); From 0d41ec8dbf6766ae2f2b9622cc125be118638ccc Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 9 Sep 2016 13:13:49 +0200 Subject: [PATCH 259/538] clocksource/drivers/timer-atmel-pit: Remove uselesss WARN_ON_ONCE IRQ handlers are running with IRQ disabled for a while, remove wrong comment and useless test. Acked-by: Nicolas Ferre Signed-off-by: Alexandre Belloni Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-atmel-pit.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c index da7e6d4eef4d..91cf04704ed1 100644 --- a/drivers/clocksource/timer-atmel-pit.c +++ b/drivers/clocksource/timer-atmel-pit.c @@ -149,12 +149,6 @@ static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id) { struct pit_data *data = dev_id; - /* - * irqs should be disabled here, but as the irq is shared they are only - * guaranteed to be off if the timer irq is registered first. - */ - WARN_ON_ONCE(!irqs_disabled()); - /* The PIT interrupt may be disabled, and is shared */ if (clockevent_state_periodic(&data->clkevt) && (pit_read(data->base, AT91_PIT_SR) & AT91_PIT_PITS)) { From 2783e5d63a9a38b5246c75b29f0dce4cf995a1b3 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 9 Sep 2016 13:13:50 +0200 Subject: [PATCH 260/538] clocksource/drivers/timer-atmel-pit: Simplify IRQ handler Because the PIT is also a proper clocksource, the timekeeping code is already able to handle lost ticks. 
Reported-by: Thomas Gleixner Acked-by: Nicolas Ferre Signed-off-by: Alexandre Belloni Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-atmel-pit.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c index 91cf04704ed1..6555821bbdae 100644 --- a/drivers/clocksource/timer-atmel-pit.c +++ b/drivers/clocksource/timer-atmel-pit.c @@ -152,15 +152,10 @@ static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id) /* The PIT interrupt may be disabled, and is shared */ if (clockevent_state_periodic(&data->clkevt) && (pit_read(data->base, AT91_PIT_SR) & AT91_PIT_PITS)) { - unsigned nr_ticks; - /* Get number of ticks performed before irq, and ack it */ - nr_ticks = PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR)); - do { - data->cnt += data->cycle; - data->clkevt.event_handler(&data->clkevt); - nr_ticks--; - } while (nr_ticks); + data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, + AT91_PIT_PIVR)); + data->clkevt.event_handler(&data->clkevt); return IRQ_HANDLED; } From 2ea3401e2a84eed3f5f55b2075706f88df160d85 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Fri, 9 Sep 2016 15:26:08 +0200 Subject: [PATCH 261/538] clocksource/drivers/oxnas: Add OX820 compatible In order to support the Oxford Semiconductor OX820 SoC, add new compatible string to rps timer driver. Also add new string in the dt-bindings. Signed-off-by: Neil Armstrong Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/timer/oxsemi,rps-timer.txt | 2 +- drivers/clocksource/timer-oxnas-rps.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/timer/oxsemi,rps-timer.txt b/Documentation/devicetree/bindings/timer/oxsemi,rps-timer.txt index 3ca89cd1caef..d191612539e8 100644 --- a/Documentation/devicetree/bindings/timer/oxsemi,rps-timer.txt +++ b/Documentation/devicetree/bindings/timer/oxsemi,rps-timer.txt @@ -2,7 +2,7 @@ Oxford Semiconductor OXNAS SoCs Family RPS Timer ================================================ Required properties: -- compatible: Should be "oxsemi,ox810se-rps-timer" +- compatible: Should be "oxsemi,ox810se-rps-timer" or "oxsemi,ox820-rps-timer" - reg : Specifies base physical address and size of the registers. 
- interrupts : The interrupts of the two timers - clocks : The phandle of the timer clock source diff --git a/drivers/clocksource/timer-oxnas-rps.c b/drivers/clocksource/timer-oxnas-rps.c index bd887e2a8cf8..d630bf417773 100644 --- a/drivers/clocksource/timer-oxnas-rps.c +++ b/drivers/clocksource/timer-oxnas-rps.c @@ -295,3 +295,5 @@ static int __init oxnas_rps_timer_init(struct device_node *np) CLOCKSOURCE_OF_DECLARE(ox810se_rps, "oxsemi,ox810se-rps-timer", oxnas_rps_timer_init); +CLOCKSOURCE_OF_DECLARE(ox820_rps, + "oxsemi,ox820se-rps-timer", oxnas_rps_timer_init); From 83843c80dcf11a78995d167255b03072a1e49c2c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 28 Aug 2016 13:10:37 +0200 Subject: [PATCH 262/538] mac80211: fix tim recalculation after PS response Handle the case where the mac80211 intermediate queues are empty and the driver has buffered frames Fixes: ba8c3d6f16a1 ("mac80211: add an intermediate software queue implementation") Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 76b737dcc36f..aa58df80ede0 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1616,7 +1616,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, sta_info_recalc_tim(sta); } else { - unsigned long tids = sta->txq_buffered_tids & driver_release_tids; int tid; /* @@ -1648,7 +1647,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - if (!(tids & BIT(tid)) || txqi->tin.backlog_packets) + if (!(driver_release_tids & BIT(tid)) || + txqi->tin.backlog_packets) continue; sta_info_recalc_tim(sta); From df6ef5d8a87ace995d5c10a7bd684be05911a321 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 4 Sep 2016 18:00:59 +0200 Subject: [PATCH 263/538] mac80211: fix sequence number assignment for PS response frames When using intermediate queues, sequence number allocation is deferred until dequeue. This doesn't work for PS response frames, which bypass those queues. 
Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 65 ++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 502396694f47..cc8e95554b48 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -796,6 +796,36 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid) return ret; } +static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_txq *txq = NULL; + + if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || + (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) + return NULL; + + if (!ieee80211_is_data(hdr->frame_control)) + return NULL; + + if (pubsta) { + u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; + + txq = pubsta->txq[tid]; + } else if (vif) { + txq = vif->txq; + } + + if (!txq) + return NULL; + + return to_txq_info(txq); +} + static ieee80211_tx_result debug_noinline ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) { @@ -853,7 +883,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) tid = *qc & IEEE80211_QOS_CTL_TID_MASK; tx->sta->tx_stats.msdu[tid]++; - if (!tx->sta->sta.txq[0]) + if (!ieee80211_get_txq(tx->local, info->control.vif, &tx->sta->sta, + tx->skb)) hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); return TX_CONTINUE; @@ -1243,36 +1274,6 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, return TX_CONTINUE; } -static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, - struct ieee80211_vif *vif, - struct ieee80211_sta *pubsta, - struct sk_buff *skb) -{ - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_txq *txq = NULL; - - if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || - (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) - return NULL; - - if (!ieee80211_is_data(hdr->frame_control)) - return NULL; - - if (pubsta) { - u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; - - txq = pubsta->txq[tid]; - } else if (vif) { - txq = vif->txq; - } - - if (!txq) - return NULL; - - return to_txq_info(txq); -} - static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) { IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); @@ -3264,7 +3265,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { *ieee80211_get_qos_ctl(hdr) = tid; - if (!sta->sta.txq[0]) + if (!ieee80211_get_txq(local, &sdata->vif, &sta->sta, skb)) hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); } else { info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; From 5df20f2141eadb5430caaad20eceac61cfe0f139 Mon Sep 17 00:00:00 2001 From: "Pedersen, Thomas" Date: Tue, 6 Sep 2016 11:59:00 -0700 Subject: [PATCH 264/538] mac80211: make mpath path fixing more robust A fixed mpath was not quite being treated as such: 1) if a PERR frame was received, a fixed mpath was deactivated. 2) queued path discovery for fixed mpath was potentially being considered, changing mpath state. 3) other mpath flags were potentially being inherited when fixing the mpath. Just assign PATH_FIXED and SN_VALID. This solves several issues when fixing a mesh path in one direction. 
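The difference between OR-ing the flag in and assigning a fresh flag word is easy to see in isolation; a small standalone example with stand-in flag values (not the mac80211 definitions):

#include <stdio.h>

#define PATH_ACTIVE    0x01u   /* stand-in values, not the mac80211 flags */
#define PATH_RESOLVING 0x02u
#define PATH_SN_VALID  0x04u
#define PATH_FIXED     0x08u

int main(void)
{
	unsigned int flags = PATH_ACTIVE | PATH_RESOLVING;    /* stale state */

	unsigned int or_in  = flags | PATH_FIXED;             /* keeps stale bits */
	unsigned int assign = PATH_FIXED | PATH_SN_VALID;     /* known-good state */

	printf("OR-in : 0x%02x (RESOLVING still set)\n", or_in);
	printf("assign: 0x%02x\n", assign);
	return 0;
}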
The reverse direction mpath should probably also be fixed, or root announcements at least be enabled. Signed-off-by: Thomas Pedersen Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 3 ++- net/mac80211/mesh_pathtbl.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 8f9c3bde835f..faccef977670 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -746,6 +746,7 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata, sta = next_hop_deref_protected(mpath); if (mpath->flags & MESH_PATH_ACTIVE && ether_addr_equal(ta, sta->sta.addr) && + !(mpath->flags & MESH_PATH_FIXED) && (!(mpath->flags & MESH_PATH_SN_VALID) || SN_GT(target_sn, mpath->sn) || target_sn == 0)) { mpath->flags &= ~MESH_PATH_ACTIVE; @@ -1012,7 +1013,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) goto enddiscovery; spin_lock_bh(&mpath->state_lock); - if (mpath->flags & MESH_PATH_DELETED) { + if (mpath->flags & (MESH_PATH_DELETED | MESH_PATH_FIXED)) { spin_unlock_bh(&mpath->state_lock); goto enddiscovery; } diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 6db2ddfa0695..f0e6175a9821 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -826,7 +826,7 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop) mpath->metric = 0; mpath->hop_count = 0; mpath->exp_time = 0; - mpath->flags |= MESH_PATH_FIXED; + mpath->flags = MESH_PATH_FIXED | MESH_PATH_SN_VALID; mesh_path_activate(mpath); spin_unlock_bh(&mpath->state_lock); mesh_path_tx_pending(mpath); From ecfcdfec7e0cc64215a194044305f02a5a836e6d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 15:38:12 +0200 Subject: [PATCH 265/538] netfilter: nf_nat: handle NF_DROP from nfnetlink_parse_nat_setup() nf_nat_setup_info() returns NF_* verdicts, so convert them to error codes that is what ctnelink expects. This has passed overlook without having any impact since this nf_nat_setup_info() has always returned NF_ACCEPT so far. Since 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable"), this is problem. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index de31818417b8..19c081e1b328 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -807,7 +807,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, if (err < 0) return err; - return nf_nat_setup_info(ct, &range, manip); + return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } #else static int From 04c8b0f82c7d5a9a1c296eef914ae3bb820bcb85 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Jun 2016 18:11:43 +0100 Subject: [PATCH 266/538] irqchip/gic: Make locking a BL_SWITCHER only feature The BL switcher code manipulates the logical/physical CPU mapping, forcing a lock to be taken on the IPI path. With an IPI heavy load, this single lock becomes contended. But when CONFIG_BL_SWITCHER is not enabled, there is no reason to take this lock at all since the CPU mapping is immutable. 
This patch allows the lock to be entierely removed when BL_SWITCHER is not enabled (which is the case in most configurations), leading to a small improvement of "perf bench sched pipe" (measured on an 8 core AMD Seattle system): Before: 101370 ops/sec After: 103680 ops/sec Take this opportunity to remove a useless lock being taken when handling an interrupt on a secondary GIC. Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 390fac59c6bc..d108fe6a32d9 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -91,7 +91,27 @@ struct gic_chip_data { #endif }; -static DEFINE_RAW_SPINLOCK(irq_controller_lock); +#ifdef CONFIG_BL_SWITCHER + +static DEFINE_RAW_SPINLOCK(cpu_map_lock); + +#define gic_lock_irqsave(f) \ + raw_spin_lock_irqsave(&cpu_map_lock, (f)) +#define gic_unlock_irqrestore(f) \ + raw_spin_unlock_irqrestore(&cpu_map_lock, (f)) + +#define gic_lock() raw_spin_lock(&cpu_map_lock) +#define gic_unlock() raw_spin_unlock(&cpu_map_lock) + +#else + +#define gic_lock_irqsave(f) do { (void)(f); } while(0) +#define gic_unlock_irqrestore(f) do { (void)(f); } while(0) + +#define gic_lock() do { } while(0) +#define gic_unlock() do { } while(0) + +#endif /* * The GIC mapping of CPU interfaces does not necessarily match @@ -317,12 +337,12 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids) return -EINVAL; - raw_spin_lock_irqsave(&irq_controller_lock, flags); + gic_lock_irqsave(flags); mask = 0xff << shift; bit = gic_cpu_map[cpu] << shift; val = readl_relaxed(reg) & ~mask; writel_relaxed(val | bit, reg); - raw_spin_unlock_irqrestore(&irq_controller_lock, flags); + gic_unlock_irqrestore(flags); return IRQ_SET_MASK_OK_DONE; } @@ -374,9 +394,7 @@ static void gic_handle_cascade_irq(struct irq_desc *desc) chained_irq_enter(chip, desc); - raw_spin_lock(&irq_controller_lock); status = readl_relaxed(gic_data_cpu_base(chip_data) + GIC_CPU_INTACK); - raw_spin_unlock(&irq_controller_lock); gic_irq = (status & GICC_IAR_INT_ID_MASK); if (gic_irq == GICC_INT_SPURIOUS) @@ -776,7 +794,7 @@ static void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) return; } - raw_spin_lock_irqsave(&irq_controller_lock, flags); + gic_lock_irqsave(flags); /* Convert our logical CPU mask into a physical one. 
*/ for_each_cpu(cpu, mask) @@ -791,7 +809,7 @@ static void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) /* this always happens on GIC0 */ writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT); - raw_spin_unlock_irqrestore(&irq_controller_lock, flags); + gic_unlock_irqrestore(flags); } #endif @@ -859,7 +877,7 @@ void gic_migrate_target(unsigned int new_cpu_id) cur_target_mask = 0x01010101 << cur_cpu_id; ror_val = (cur_cpu_id - new_cpu_id) & 31; - raw_spin_lock(&irq_controller_lock); + gic_lock(); /* Update the target interface for this logical CPU */ gic_cpu_map[cpu] = 1 << new_cpu_id; @@ -879,7 +897,7 @@ void gic_migrate_target(unsigned int new_cpu_id) } } - raw_spin_unlock(&irq_controller_lock); + gic_unlock(); /* * Now let's migrate and clear any potential SGIs that might be From 91ef84428a86b75a52e15c6fe4f56b446ba75f93 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 19 Aug 2016 17:13:09 +0100 Subject: [PATCH 267/538] irqchip/gic-v3: Reset BPR during initialization Currently, when running on FVP, CPU 0 boots up with its BPR changed from the reset value. This renders it impossible to (preemptively) prioritize interrupts on CPU 0. This is harmless on normal systems since Linux typically does not support preemptive interrupts. It does however cause problems in systems with additional changes (such as patches for NMI simulation). Many thanks to Andrew Thoelke for suggesting the BPR as having the potential to harm preemption. Suggested-by: Andrew Thoelke Signed-off-by: Daniel Thompson Signed-off-by: Marc Zyngier --- arch/arm/include/asm/arch_gicv3.h | 6 ++++++ arch/arm64/include/asm/arch_gicv3.h | 6 ++++++ drivers/irqchip/irq-gic-v3.c | 8 ++++++++ 3 files changed, 20 insertions(+) diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index e08d15184056..dfe4002812da 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -34,6 +34,7 @@ #define ICC_CTLR __ACCESS_CP15(c12, 0, c12, 4) #define ICC_SRE __ACCESS_CP15(c12, 0, c12, 5) #define ICC_IGRPEN1 __ACCESS_CP15(c12, 0, c12, 7) +#define ICC_BPR1 __ACCESS_CP15(c12, 0, c12, 3) #define ICC_HSRE __ACCESS_CP15(c12, 4, c9, 5) @@ -157,6 +158,11 @@ static inline void gic_write_sre(u32 val) isb(); } +static inline void gic_write_bpr1(u32 val) +{ + asm volatile("mcr " __stringify(ICC_BPR1) : : "r" (val)); +} + /* * Even in 32bit systems that use LPAE, there is no guarantee that the I/O * interface provides true 64bit atomic accesses, so using strd/ldrd doesn't diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 8ec88e5b290f..fc2a0cb47b2c 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -28,6 +28,7 @@ #define ICC_CTLR_EL1 sys_reg(3, 0, 12, 12, 4) #define ICC_SRE_EL1 sys_reg(3, 0, 12, 12, 5) #define ICC_GRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) +#define ICC_BPR1_EL1 sys_reg(3, 0, 12, 12, 3) #define ICC_SRE_EL2 sys_reg(3, 4, 12, 9, 5) @@ -165,6 +166,11 @@ static inline void gic_write_sre(u32 val) isb(); } +static inline void gic_write_bpr1(u32 val) +{ + asm volatile("msr_s " __stringify(ICC_BPR1_EL1) ", %0" : : "r" (val)); +} + #define gic_read_typer(c) readq_relaxed(c) #define gic_write_irouter(v, c) writeq_relaxed(v, c) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index ede5672ab34d..ecc5b2360c7a 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -495,6 +495,14 @@ static void 
gic_cpu_sys_reg_init(void) /* Set priority mask register */ gic_write_pmr(DEFAULT_PMR_VALUE); + /* + * Some firmwares hand over to the kernel with the BPR changed from + * its reset value (and with a value large enough to prevent + * any pre-emptive interrupts from working at all). Writing a zero + * to BPR restores is reset value. + */ + gic_write_bpr1(0); + if (static_key_true(&supports_deactivate)) { /* EOI drops priority only (mode 1) */ gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop); From 39a342b25252f34d1dbe758b2b99bd055ff3b885 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 16 Aug 2016 11:14:10 +0100 Subject: [PATCH 268/538] irqchip/gic-pm: Update driver to use of_pm_clk_add_clk Commit 498b5fdd40dd ("PM / clk: Add support for adding a specific clock from device-tree") add a new helper function for adding a clock from device-tree to a device. Update the GIC-PM driver to use this new function to simplify the driver. Signed-off-by: Jon Hunter Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-pm.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/drivers/irqchip/irq-gic-pm.c b/drivers/irqchip/irq-gic-pm.c index 4cbffba3ff13..ecafd295c31c 100644 --- a/drivers/irqchip/irq-gic-pm.c +++ b/drivers/irqchip/irq-gic-pm.c @@ -64,7 +64,6 @@ static int gic_runtime_suspend(struct device *dev) static int gic_get_clocks(struct device *dev, const struct gic_clk_data *data) { - struct clk *clk; unsigned int i; int ret; @@ -76,28 +75,16 @@ static int gic_get_clocks(struct device *dev, const struct gic_clk_data *data) return ret; for (i = 0; i < data->num_clocks; i++) { - clk = of_clk_get_by_name(dev->of_node, data->clocks[i]); - if (IS_ERR(clk)) { - dev_err(dev, "failed to get clock %s\n", - data->clocks[i]); - ret = PTR_ERR(clk); - goto error; - } - - ret = pm_clk_add_clk(dev, clk); + ret = of_pm_clk_add_clk(dev, data->clocks[i]); if (ret) { - dev_err(dev, "failed to add clock at index %d\n", i); - clk_put(clk); - goto error; + dev_err(dev, "failed to add clock %s\n", + data->clocks[i]); + pm_clk_destroy(dev); + return ret; } } return 0; - -error: - pm_clk_destroy(dev); - - return ret; } static int gic_probe(struct platform_device *pdev) From 89c59cca48f04ede7e63944418af1794a8e31da6 Mon Sep 17 00:00:00 2001 From: Baoyou Xie Date: Wed, 7 Sep 2016 19:26:45 +0800 Subject: [PATCH 269/538] irqchip/gic: Mark gic_init_physaddr() static We get 1 warning when building kernel with W=1: drivers/irqchip/irq-gic.c:917:13: warning: no previous prototype for 'gic_init_physaddr' [-Wmissing-prototypes] In fact, this function is only used in the file in which it is declared and don't need a declaration, but can be made static. so this patch marks this function with 'static'. 
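The warning comes from -Wmissing-prototypes, which W=1 builds enable; it fires for any external (non-static) function definition that has no prior prototype. A minimal out-of-tree reproduction with made-up function names:

/* demo.c -- compile with: gcc -Wmissing-prototypes -c demo.c */

/* Non-static definition with no prior prototype:
 * gcc warns "no previous prototype for 'helper'". */
void helper(void) { }

/* Making it static marks the symbol file-local, so no external
 * prototype is expected and the warning goes away. */
static void helper2(void) { }

/* Referenced here so the static function is not reported as unused. */
void use_helper2(void);
void use_helper2(void) { helper2(); }

Marking a file-local function static both silences the warning and gives the symbol internal linkage, so the compiler knows it cannot be referenced from other translation units.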
Signed-off-by: Baoyou Xie Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index d108fe6a32d9..58e5b4e87056 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -939,7 +939,7 @@ unsigned long gic_get_sgir_physaddr(void) return gic_dist_physaddr + GIC_DIST_SOFTINT; } -void __init gic_init_physaddr(struct device_node *node) +static void __init gic_init_physaddr(struct device_node *node) { struct resource res; if (of_address_to_resource(node, 0, &res) == 0) { From 88ef16d888a094587b2ac77de60927df5da5d56d Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Mon, 12 Sep 2016 20:54:20 +0200 Subject: [PATCH 270/538] ACPI: I/O Remapping Table (IORT) initial support IORT shows representation of IO topology for ARM based systems. It describes how various components are connected together on parent-child basis e.g. PCI RC -> SMMU -> ITS. Also see IORT spec. http://infocenter.arm.com/help/topic/com.arm.doc.den0049b/DEN0049B_IO_Remapping_Table.pdf Initial support allows to detect IORT table presence and save its root pointer obtained through acpi_get_table(). The pointer validity depends on acpi_gbl_permanent_mmap because if acpi_gbl_permanent_mmap is not set while using IORT nodes we would dereference unmapped pointers. For the aforementioned reason call acpi_iort_init() from acpi_init() which guarantees acpi_gbl_permanent_mmap to be set at that point. Add generic helpers which are helpful for scanning and retrieving information from IORT table content. List of the most important helpers: - iort_find_dev_node() finds IORT node for a given device - iort_node_map_rid() maps device RID and returns IORT node which provides final translation IORT support is placed under drivers/acpi/arm64/ new directory due to its ARM64 specific nature. The code there is considered only for ARM64. The long term plan is to keep all ARM64 specific tables support in this place e.g. GTDT table. Signed-off-by: Tomasz Nowicki Acked-by: Rafael J. Wysocki Reviewed-by: Hanjun Guo Reviewed-by: Lorenzo Pieralisi Signed-off-by: Marc Zyngier --- drivers/acpi/Kconfig | 4 + drivers/acpi/Makefile | 2 + drivers/acpi/arm64/Kconfig | 6 + drivers/acpi/arm64/Makefile | 1 + drivers/acpi/arm64/iort.c | 216 ++++++++++++++++++++++++++++++++++++ drivers/acpi/bus.c | 2 + include/linux/acpi_iort.h | 30 +++++ 7 files changed, 261 insertions(+) create mode 100644 drivers/acpi/arm64/Kconfig create mode 100644 drivers/acpi/arm64/Makefile create mode 100644 drivers/acpi/arm64/iort.c create mode 100644 include/linux/acpi_iort.h diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 445ce28475b3..d5c06145d07f 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -521,4 +521,8 @@ config ACPI_CONFIGFS userspace. The configurable ACPI groups will be visible under /config/acpi, assuming configfs is mounted under /config. 
+if ARM64 +source "drivers/acpi/arm64/Kconfig" +endif + endif # ACPI diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 5ae9d85c5159..e5ada7895697 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -105,3 +105,5 @@ obj-$(CONFIG_ACPI_CONFIGFS) += acpi_configfs.o video-objs += acpi_video.o video_detect.o obj-y += dptf/ + +obj-$(CONFIG_ARM64) += arm64/ diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig new file mode 100644 index 000000000000..4616da4c15be --- /dev/null +++ b/drivers/acpi/arm64/Kconfig @@ -0,0 +1,6 @@ +# +# ACPI Configuration for ARM64 +# + +config ACPI_IORT + bool diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile new file mode 100644 index 000000000000..72331f2ce0e9 --- /dev/null +++ b/drivers/acpi/arm64/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_ACPI_IORT) += iort.o diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c new file mode 100644 index 000000000000..5279a358924a --- /dev/null +++ b/drivers/acpi/arm64/iort.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2016, Semihalf + * Author: Tomasz Nowicki + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * This file implements early detection/parsing of I/O mapping + * reported to OS through firmware via I/O Remapping Table (IORT) + * IORT document number: ARM DEN 0049A + */ + +#define pr_fmt(fmt) "ACPI: IORT: " fmt + +#include +#include +#include + +typedef acpi_status (*iort_find_node_callback) + (struct acpi_iort_node *node, void *context); + +/* Root pointer to the mapped IORT table */ +static struct acpi_table_header *iort_table; + +static LIST_HEAD(iort_msi_chip_list); +static DEFINE_SPINLOCK(iort_msi_chip_lock); + +static struct acpi_iort_node *iort_scan_node(enum acpi_iort_node_type type, + iort_find_node_callback callback, + void *context) +{ + struct acpi_iort_node *iort_node, *iort_end; + struct acpi_table_iort *iort; + int i; + + if (!iort_table) + return NULL; + + /* Get the first IORT node */ + iort = (struct acpi_table_iort *)iort_table; + iort_node = ACPI_ADD_PTR(struct acpi_iort_node, iort, + iort->node_offset); + iort_end = ACPI_ADD_PTR(struct acpi_iort_node, iort_table, + iort_table->length); + + for (i = 0; i < iort->node_count; i++) { + if (WARN_TAINT(iort_node >= iort_end, TAINT_FIRMWARE_WORKAROUND, + "IORT node pointer overflows, bad table!\n")) + return NULL; + + if (iort_node->type == type && + ACPI_SUCCESS(callback(iort_node, context))) + return iort_node; + + iort_node = ACPI_ADD_PTR(struct acpi_iort_node, iort_node, + iort_node->length); + } + + return NULL; +} + +static acpi_status iort_match_node_callback(struct acpi_iort_node *node, + void *context) +{ + struct device *dev = context; + acpi_status status; + + if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT) { + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_device *adev = to_acpi_device_node(dev->fwnode); + struct acpi_iort_named_component *ncomp; + + if (!adev) { + status = AE_NOT_FOUND; + goto out; + } + + status = acpi_get_name(adev->handle, ACPI_FULL_PATHNAME, &buf); + if (ACPI_FAILURE(status)) { + dev_warn(dev, "Can't get device full path 
name\n"); + goto out; + } + + ncomp = (struct acpi_iort_named_component *)node->node_data; + status = !strcmp(ncomp->device_name, buf.pointer) ? + AE_OK : AE_NOT_FOUND; + acpi_os_free(buf.pointer); + } else if (node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) { + struct acpi_iort_root_complex *pci_rc; + struct pci_bus *bus; + + bus = to_pci_bus(dev); + pci_rc = (struct acpi_iort_root_complex *)node->node_data; + + /* + * It is assumed that PCI segment numbers maps one-to-one + * with root complexes. Each segment number can represent only + * one root complex. + */ + status = pci_rc->pci_segment_number == pci_domain_nr(bus) ? + AE_OK : AE_NOT_FOUND; + } else { + status = AE_NOT_FOUND; + } +out: + return status; +} + +static int iort_id_map(struct acpi_iort_id_mapping *map, u8 type, u32 rid_in, + u32 *rid_out) +{ + /* Single mapping does not care for input id */ + if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) { + if (type == ACPI_IORT_NODE_NAMED_COMPONENT || + type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) { + *rid_out = map->output_base; + return 0; + } + + pr_warn(FW_BUG "[map %p] SINGLE MAPPING flag not allowed for node type %d, skipping ID map\n", + map, type); + return -ENXIO; + } + + if (rid_in < map->input_base || + (rid_in >= map->input_base + map->id_count)) + return -ENXIO; + + *rid_out = map->output_base + (rid_in - map->input_base); + return 0; +} + +static struct acpi_iort_node *iort_node_map_rid(struct acpi_iort_node *node, + u32 rid_in, u32 *rid_out, + u8 type) +{ + u32 rid = rid_in; + + /* Parse the ID mapping tree to find specified node type */ + while (node) { + struct acpi_iort_id_mapping *map; + int i; + + if (node->type == type) { + if (rid_out) + *rid_out = rid; + return node; + } + + if (!node->mapping_offset || !node->mapping_count) + goto fail_map; + + map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, node, + node->mapping_offset); + + /* Firmware bug! 
*/ + if (!map->output_reference) { + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n", + node, node->type); + goto fail_map; + } + + /* Do the RID translation */ + for (i = 0; i < node->mapping_count; i++, map++) { + if (!iort_id_map(map, node->type, rid, &rid)) + break; + } + + if (i == node->mapping_count) + goto fail_map; + + node = ACPI_ADD_PTR(struct acpi_iort_node, iort_table, + map->output_reference); + } + +fail_map: + /* Map input RID to output RID unchanged on mapping failure*/ + if (rid_out) + *rid_out = rid_in; + + return NULL; +} + +static struct acpi_iort_node *iort_find_dev_node(struct device *dev) +{ + struct pci_bus *pbus; + + if (!dev_is_pci(dev)) + return iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT, + iort_match_node_callback, dev); + + /* Find a PCI root bus */ + pbus = to_pci_dev(dev)->bus; + while (!pci_is_root_bus(pbus)) + pbus = pbus->parent; + + return iort_scan_node(ACPI_IORT_NODE_PCI_ROOT_COMPLEX, + iort_match_node_callback, &pbus->dev); +} + +void __init acpi_iort_init(void) +{ + acpi_status status; + + status = acpi_get_table(ACPI_SIG_IORT, 0, &iort_table); + if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) { + const char *msg = acpi_format_exception(status); + pr_err("Failed to get table, %s\n", msg); + } +} diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 85b7d07fe5c8..e56e6438515a 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -36,6 +36,7 @@ #ifdef CONFIG_X86 #include #endif +#include #include #include #include @@ -1186,6 +1187,7 @@ static int __init acpi_init(void) } pci_mmcfg_late_init(); + acpi_iort_init(); acpi_scan_init(); acpi_ec_init(); acpi_debugfs_init(); diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h new file mode 100644 index 000000000000..fcacaf7ed64d --- /dev/null +++ b/include/linux/acpi_iort.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2016, Semihalf + * Author: Tomasz Nowicki + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ + +#ifndef __ACPI_IORT_H__ +#define __ACPI_IORT_H__ + +#include + +#ifdef CONFIG_ACPI_IORT +void acpi_iort_init(void); +#else +static inline void acpi_iort_init(void) { } +#endif + +#endif /* __ACPI_IORT_H__ */ From 4bf2efd26d7624372fb7adff8745b4c2e8407004 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Mon, 12 Sep 2016 20:32:21 +0200 Subject: [PATCH 271/538] ACPI: Add new IORT functions to support MSI domain handling For ITS, MSI functionality consists on building domain stack and during that process we need to reference to domain stack components e.g. before we create new DOMAIN_BUS_PCI_MSI domain we need to specify its DOMAIN_BUS_NEXUS parent domain. In order to manage that process properly, maintain list which elements contain domain token (unique for MSI domain stack) and ITS ID: iort_register_domain_token() and iort_deregister_domain_token(). 
Then retrieve domain token any time later with ITS ID being key off: iort_find_domain_token(). With domain token and domain type we are able to find corresponding IRQ domain. Since IORT is prepared to describe MSI domain on a per-device basis, use existing IORT helpers and implement two calls: 1. iort_msi_map_rid() to map MSI RID for a device 2. iort_get_device_domain() to find domain token for a device Signed-off-by: Tomasz Nowicki Acked-by: Rafael J. Wysocki Reviewed-by: Hanjun Guo Signed-off-by: Marc Zyngier --- drivers/acpi/arm64/iort.c | 152 ++++++++++++++++++++++++++++++++++++++ include/linux/acpi_iort.h | 12 +++ 2 files changed, 164 insertions(+) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 5279a358924a..6b81746cd13c 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -22,6 +22,12 @@ #include #include +struct iort_its_msi_chip { + struct list_head list; + struct fwnode_handle *fw_node; + u32 translation_id; +}; + typedef acpi_status (*iort_find_node_callback) (struct acpi_iort_node *node, void *context); @@ -31,6 +37,76 @@ static struct acpi_table_header *iort_table; static LIST_HEAD(iort_msi_chip_list); static DEFINE_SPINLOCK(iort_msi_chip_lock); +/** + * iort_register_domain_token() - register domain token and related ITS ID + * to the list from where we can get it back later on. + * @trans_id: ITS ID. + * @fw_node: Domain token. + * + * Returns: 0 on success, -ENOMEM if no memory when allocating list element + */ +int iort_register_domain_token(int trans_id, struct fwnode_handle *fw_node) +{ + struct iort_its_msi_chip *its_msi_chip; + + its_msi_chip = kzalloc(sizeof(*its_msi_chip), GFP_KERNEL); + if (!its_msi_chip) + return -ENOMEM; + + its_msi_chip->fw_node = fw_node; + its_msi_chip->translation_id = trans_id; + + spin_lock(&iort_msi_chip_lock); + list_add(&its_msi_chip->list, &iort_msi_chip_list); + spin_unlock(&iort_msi_chip_lock); + + return 0; +} + +/** + * iort_deregister_domain_token() - Deregister domain token based on ITS ID + * @trans_id: ITS ID. + * + * Returns: none. + */ +void iort_deregister_domain_token(int trans_id) +{ + struct iort_its_msi_chip *its_msi_chip, *t; + + spin_lock(&iort_msi_chip_lock); + list_for_each_entry_safe(its_msi_chip, t, &iort_msi_chip_list, list) { + if (its_msi_chip->translation_id == trans_id) { + list_del(&its_msi_chip->list); + kfree(its_msi_chip); + break; + } + } + spin_unlock(&iort_msi_chip_lock); +} + +/** + * iort_find_domain_token() - Find domain token based on given ITS ID + * @trans_id: ITS ID. + * + * Returns: domain token when find on the list, NULL otherwise + */ +struct fwnode_handle *iort_find_domain_token(int trans_id) +{ + struct fwnode_handle *fw_node = NULL; + struct iort_its_msi_chip *its_msi_chip; + + spin_lock(&iort_msi_chip_lock); + list_for_each_entry(its_msi_chip, &iort_msi_chip_list, list) { + if (its_msi_chip->translation_id == trans_id) { + fw_node = its_msi_chip->fw_node; + break; + } + } + spin_unlock(&iort_msi_chip_lock); + + return fw_node; +} + static struct acpi_iort_node *iort_scan_node(enum acpi_iort_node_type type, iort_find_node_callback callback, void *context) @@ -204,6 +280,82 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev) iort_match_node_callback, &pbus->dev); } +/** + * iort_msi_map_rid() - Map a MSI requester ID for a device + * @dev: The device for which the mapping is to be done. + * @req_id: The device requester ID. 
+ * + * Returns: mapped MSI RID on success, input requester ID otherwise + */ +u32 iort_msi_map_rid(struct device *dev, u32 req_id) +{ + struct acpi_iort_node *node; + u32 dev_id; + + node = iort_find_dev_node(dev); + if (!node) + return req_id; + + iort_node_map_rid(node, req_id, &dev_id, ACPI_IORT_NODE_ITS_GROUP); + return dev_id; +} + +/** + * iort_dev_find_its_id() - Find the ITS identifier for a device + * @dev: The device. + * @idx: Index of the ITS identifier list. + * @its_id: ITS identifier. + * + * Returns: 0 on success, appropriate error value otherwise + */ +static int iort_dev_find_its_id(struct device *dev, u32 req_id, + unsigned int idx, int *its_id) +{ + struct acpi_iort_its_group *its; + struct acpi_iort_node *node; + + node = iort_find_dev_node(dev); + if (!node) + return -ENXIO; + + node = iort_node_map_rid(node, req_id, NULL, ACPI_IORT_NODE_ITS_GROUP); + if (!node) + return -ENXIO; + + /* Move to ITS specific data */ + its = (struct acpi_iort_its_group *)node->node_data; + if (idx > its->its_count) { + dev_err(dev, "requested ITS ID index [%d] is greater than available [%d]\n", + idx, its->its_count); + return -ENXIO; + } + + *its_id = its->identifiers[idx]; + return 0; +} + +/** + * iort_get_device_domain() - Find MSI domain related to a device + * @dev: The device. + * @req_id: Requester ID for the device. + * + * Returns: the MSI domain for this device, NULL otherwise + */ +struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id) +{ + struct fwnode_handle *handle; + int its_id; + + if (iort_dev_find_its_id(dev, req_id, 0, &its_id)) + return NULL; + + handle = iort_find_domain_token(its_id); + if (!handle) + return NULL; + + return irq_find_matching_fwnode(handle, DOMAIN_BUS_PCI_MSI); +} + void __init acpi_iort_init(void) { acpi_status status; diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index fcacaf7ed64d..0e32dac8fd03 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -20,11 +20,23 @@ #define __ACPI_IORT_H__ #include +#include +#include +int iort_register_domain_token(int trans_id, struct fwnode_handle *fw_node); +void iort_deregister_domain_token(int trans_id); +struct fwnode_handle *iort_find_domain_token(int trans_id); #ifdef CONFIG_ACPI_IORT void acpi_iort_init(void); +u32 iort_msi_map_rid(struct device *dev, u32 req_id); +struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id); #else static inline void acpi_iort_init(void) { } +static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id) +{ return req_id; } +static inline struct irq_domain *iort_get_device_domain(struct device *dev, + u32 req_id) +{ return NULL; } #endif #endif /* __ACPI_IORT_H__ */ From be2021baeed64d8947a56529fc383308918ecc41 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Mon, 12 Sep 2016 20:32:22 +0200 Subject: [PATCH 272/538] PCI/MSI: Setup MSI domain on a per-device basis using IORT ACPI table It is possible to provide information about which MSI controller to use on a per-device basis for DT. This patch supply this with ACPI support. Currently, IORT is the only one ACPI table which can provide such mapping. In order to plug IORT into MSI infrastructure we are adding ACPI equivalents for finding PCI device domain and its RID translation (pci_msi_domain_get_msi_rid and pci_msi_domain_get_msi_rid calls). 
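The requester-ID side of this reduces to the window translation added earlier in the series (iort_id_map()): an input RID that falls inside [input_base, input_base + id_count) is offset into the output ID space. A standalone sketch of that arithmetic, with illustrative values rather than real IORT table contents:

#include <stdio.h>
#include <stdint.h>

struct id_window {
	uint32_t input_base;
	uint32_t id_count;
	uint32_t output_base;
};

/* Translate rid_in through one mapping window; returns 0 on success. */
static int map_rid(const struct id_window *w, uint32_t rid_in, uint32_t *rid_out)
{
	if (rid_in < w->input_base || rid_in >= w->input_base + w->id_count)
		return -1;                  /* not covered by this window */
	*rid_out = w->output_base + (rid_in - w->input_base);
	return 0;
}

int main(void)
{
	struct id_window w = { .input_base = 0x0000, .id_count = 0x100,
			       .output_base = 0x8000 };
	uint32_t out;

	if (!map_rid(&w, 0x0010, &out))
		printf("RID 0x0010 -> device ID 0x%04x\n", (unsigned)out);
	return 0;
}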
Signed-off-by: Tomasz Nowicki Reviewed-by: Hanjun Guo Acked-by: Marc Zyngier Acked-by: Bjorn Helgaas Signed-off-by: Marc Zyngier --- drivers/pci/msi.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 98f12223c734..137b4c5fb638 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1502,8 +1503,8 @@ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev) pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); of_node = irq_domain_get_of_node(domain); - if (of_node) - rid = of_msi_map_rid(&pdev->dev, of_node, rid); + rid = of_node ? of_msi_map_rid(&pdev->dev, of_node, rid) : + iort_msi_map_rid(&pdev->dev, rid); return rid; } @@ -1519,9 +1520,13 @@ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev) */ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) { + struct irq_domain *dom; u32 rid = 0; pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); - return of_msi_map_get_device_domain(&pdev->dev, rid); + dom = of_msi_map_get_device_domain(&pdev->dev, rid); + if (!dom) + dom = iort_get_device_domain(&pdev->dev, rid); + return dom; } #endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */ From d14ae5e6bac36e88cd5deeee411104da424bc73d Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Mon, 12 Sep 2016 20:32:23 +0200 Subject: [PATCH 273/538] irqchip/gicv3-its: Cleanup for ITS domain initialization There is no point to initialize ITS without having msi-controller property in corresponding DT node. However, its_probe is checking msi-controller presence at the end, so we can save our time and do that check prior to its_probe call. Also, for the code clarity purpose, we put domain initialization to separate function. 
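The new helper in the diff below either finishes all of its setup or frees what it already allocated, so the caller keeps a single error path. A generic standalone sketch of that shape (names and sizes are placeholders, not the ITS code):

#include <stdlib.h>
#include <string.h>

struct domain { char name[16]; void *priv; };

/* Either returns a fully set-up object or NULL with nothing leaked. */
static struct domain *setup_domain(const char *name)
{
	struct domain *d = calloc(1, sizeof(*d));
	if (!d)
		return NULL;

	d->priv = calloc(1, 64);
	if (!d->priv) {                 /* second step failed: undo the first */
		free(d);
		return NULL;
	}

	strncpy(d->name, name, sizeof(d->name) - 1);
	return d;
}

int main(void)
{
	struct domain *d = setup_domain("its0");
	if (!d)
		return 1;                   /* caller needs only one error check */
	free(d->priv);
	free(d);
	return 0;
}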
Signed-off-by: Tomasz Nowicki Acked-by: Marc Zyngier Reviewed-by: Hanjun Guo Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 57 +++++++++++++++++++------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 36b9c28a5c91..943442d689d8 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1614,13 +1614,37 @@ static void its_enable_quirks(struct its_node *its) gic_enable_quirks(iidr, its_quirks, its); } +static int its_init_domain(struct device_node *node, struct its_node *its, + struct irq_domain *parent) +{ + struct irq_domain *inner_domain; + struct msi_domain_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + inner_domain = irq_domain_add_tree(node, &its_domain_ops, its); + if (!inner_domain) { + kfree(info); + return -ENOMEM; + } + + inner_domain->parent = parent; + inner_domain->bus_token = DOMAIN_BUS_NEXUS; + info->ops = &its_msi_domain_ops; + info->data = its; + inner_domain->host_data = info; + + return 0; +} + static int __init its_probe(struct device_node *node, struct irq_domain *parent) { struct resource res; struct its_node *its; void __iomem *its_base; - struct irq_domain *inner_domain; u32 val; u64 baser, tmp; int err; @@ -1712,28 +1736,9 @@ static int __init its_probe(struct device_node *node, writeq_relaxed(0, its->base + GITS_CWRITER); writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR); - if (of_property_read_bool(node, "msi-controller")) { - struct msi_domain_info *info; - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) { - err = -ENOMEM; - goto out_free_tables; - } - - inner_domain = irq_domain_add_tree(node, &its_domain_ops, its); - if (!inner_domain) { - err = -ENOMEM; - kfree(info); - goto out_free_tables; - } - - inner_domain->parent = parent; - inner_domain->bus_token = DOMAIN_BUS_NEXUS; - info->ops = &its_msi_domain_ops; - info->data = its; - inner_domain->host_data = info; - } + err = its_init_domain(node, its, parent); + if (err) + goto out_free_tables; spin_lock(&its_lock); list_add(&its->entry, &its_nodes); @@ -1784,6 +1789,12 @@ int __init its_init(struct device_node *node, struct rdists *rdists, for (np = of_find_matching_node(node, its_device_id); np; np = of_find_matching_node(np, its_device_id)) { + if (!of_property_read_bool(np, "msi-controller")) { + pr_warn("%s: no msi-controller property, ITS ignored\n", + np->full_name); + continue; + } + its_probe(np, parent_domain); } From db40f0a7aea5e03ef044ef5dbc51a364e1ff7991 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Mon, 12 Sep 2016 20:32:24 +0200 Subject: [PATCH 274/538] irqchip/gicv3-its: Refactor ITS DT init code to prepare for ACPI In order to add ACPI support we need to isolate ACPI&DT common code and move DT logic to corresponding functions. To achieve this we are using firmware agnostic handle which can be unpacked to either DT or ACPI node. No functional changes other than a very minor one: 1. Terminate its_init call with -ENODEV for non-DT case which allows to remove hack from its-gic-v3.c. 2. Fix ITS base register address type (from 'unsigned long' to 'phys_addr_t'), as a bonus we get nice string formatting. 3. Since there is only one of ITS parent domain convert it to static global variable and drop the parameter from its_probe_one. Users can refer to it in more convenient way then. 
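The firmware-agnostic handle described above lets a single its_init() entry point dispatch to either the DT walk or, in the follow-up patch, the ACPI MADT scan, depending on what the handle unpacks to. A simplified standalone sketch of that dispatch, using a tagged stand-in type instead of the kernel's fwnode API (node name below is illustrative):

#include <stdio.h>

enum fw_type { FW_DT, FW_ACPI };

struct fw_handle {
	enum fw_type type;
	const char *name;
};

static int probe_dt(const struct fw_handle *h)
{
	printf("DT probe of %s\n", h->name);
	return 0;
}

static int probe_acpi(void)
{
	printf("ACPI (MADT) probe\n");
	return 0;
}

/* One init entry point; the handle decides which firmware path to take. */
static int its_like_init(const struct fw_handle *h)
{
	if (h->type == FW_DT)
		return probe_dt(h);
	return probe_acpi();
}

int main(void)
{
	struct fw_handle dt   = { FW_DT, "/interrupt-controller@2f000000" };
	struct fw_handle acpi = { FW_ACPI, NULL };

	its_like_init(&dt);
	its_like_init(&acpi);
	return 0;
}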
Signed-off-by: Hanjun Guo Signed-off-by: Tomasz Nowicki Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 65 +++++++++++++++++------------- drivers/irqchip/irq-gic-v3.c | 7 +--- include/linux/irqchip/arm-gic-v3.h | 4 +- 3 files changed, 42 insertions(+), 34 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 943442d689d8..c7518c7b48bc 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -75,7 +75,7 @@ struct its_node { raw_spinlock_t lock; struct list_head entry; void __iomem *base; - unsigned long phys_base; + phys_addr_t phys_base; struct its_cmd_block *cmd_base; struct its_cmd_block *cmd_write; struct its_baser tables[GITS_BASER_NR_REGS]; @@ -115,6 +115,7 @@ struct its_device { static LIST_HEAD(its_nodes); static DEFINE_SPINLOCK(its_lock); static struct rdists *gic_rdists; +static struct irq_domain *its_parent; #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) @@ -1614,8 +1615,7 @@ static void its_enable_quirks(struct its_node *its) gic_enable_quirks(iidr, its_quirks, its); } -static int its_init_domain(struct device_node *node, struct its_node *its, - struct irq_domain *parent) +static int its_init_domain(struct fwnode_handle *handle, struct its_node *its) { struct irq_domain *inner_domain; struct msi_domain_info *info; @@ -1624,13 +1624,13 @@ static int its_init_domain(struct device_node *node, struct its_node *its, if (!info) return -ENOMEM; - inner_domain = irq_domain_add_tree(node, &its_domain_ops, its); + inner_domain = irq_domain_create_tree(handle, &its_domain_ops, its); if (!inner_domain) { kfree(info); return -ENOMEM; } - inner_domain->parent = parent; + inner_domain->parent = its_parent; inner_domain->bus_token = DOMAIN_BUS_NEXUS; info->ops = &its_msi_domain_ops; info->data = its; @@ -1639,43 +1639,35 @@ static int its_init_domain(struct device_node *node, struct its_node *its, return 0; } -static int __init its_probe(struct device_node *node, - struct irq_domain *parent) +static int __init its_probe_one(struct resource *res, + struct fwnode_handle *handle, int numa_node) { - struct resource res; struct its_node *its; void __iomem *its_base; u32 val; u64 baser, tmp; int err; - err = of_address_to_resource(node, 0, &res); - if (err) { - pr_warn("%s: no regs?\n", node->full_name); - return -ENXIO; - } - - its_base = ioremap(res.start, resource_size(&res)); + its_base = ioremap(res->start, resource_size(res)); if (!its_base) { - pr_warn("%s: unable to map registers\n", node->full_name); + pr_warn("ITS@%pa: Unable to map ITS registers\n", &res->start); return -ENOMEM; } val = readl_relaxed(its_base + GITS_PIDR2) & GIC_PIDR2_ARCH_MASK; if (val != 0x30 && val != 0x40) { - pr_warn("%s: no ITS detected, giving up\n", node->full_name); + pr_warn("ITS@%pa: No ITS detected, giving up\n", &res->start); err = -ENODEV; goto out_unmap; } err = its_force_quiescent(its_base); if (err) { - pr_warn("%s: failed to quiesce, giving up\n", - node->full_name); + pr_warn("ITS@%pa: Failed to quiesce, giving up\n", &res->start); goto out_unmap; } - pr_info("ITS: %s\n", node->full_name); + pr_info("ITS %pR\n", res); its = kzalloc(sizeof(*its), GFP_KERNEL); if (!its) { @@ -1687,9 +1679,9 @@ static int __init its_probe(struct device_node *node, INIT_LIST_HEAD(&its->entry); INIT_LIST_HEAD(&its->its_device_list); its->base = its_base; - its->phys_base = res.start; + its->phys_base = res->start; its->ite_size = ((readl_relaxed(its_base + 
GITS_TYPER) >> 4) & 0xf) + 1; - its->numa_node = of_node_to_nid(node); + its->numa_node = numa_node; its->cmd_base = kzalloc(ITS_CMD_QUEUE_SZ, GFP_KERNEL); if (!its->cmd_base) { @@ -1736,7 +1728,7 @@ static int __init its_probe(struct device_node *node, writeq_relaxed(0, its->base + GITS_CWRITER); writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR); - err = its_init_domain(node, its, parent); + err = its_init_domain(handle, its); if (err) goto out_free_tables; @@ -1754,7 +1746,7 @@ static int __init its_probe(struct device_node *node, kfree(its); out_unmap: iounmap(its_base); - pr_err("ITS: failed probing %s (%d)\n", node->full_name, err); + pr_err("ITS@%pa: failed probing (%d)\n", &res->start, err); return err; } @@ -1782,10 +1774,10 @@ static struct of_device_id its_device_id[] = { {}, }; -int __init its_init(struct device_node *node, struct rdists *rdists, - struct irq_domain *parent_domain) +static int __init its_of_probe(struct device_node *node) { struct device_node *np; + struct resource res; for (np = of_find_matching_node(node, its_device_id); np; np = of_find_matching_node(np, its_device_id)) { @@ -1795,8 +1787,27 @@ int __init its_init(struct device_node *node, struct rdists *rdists, continue; } - its_probe(np, parent_domain); + if (of_address_to_resource(np, 0, &res)) { + pr_warn("%s: no regs?\n", np->full_name); + continue; + } + + its_probe_one(&res, &np->fwnode, of_node_to_nid(np)); } + return 0; +} + +int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, + struct irq_domain *parent_domain) +{ + struct device_node *of_node; + + its_parent = parent_domain; + of_node = to_of_node(handle); + if (of_node) + its_of_probe(of_node); + else + return -ENODEV; if (list_empty(&its_nodes)) { pr_warn("ITS: No ITS available, not enabling LPIs\n"); diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index ecc5b2360c7a..850f9c422f24 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -918,7 +918,6 @@ static int __init gic_init_bases(void __iomem *dist_base, u64 redist_stride, struct fwnode_handle *handle) { - struct device_node *node; u32 typer; int gic_irqs; int err; @@ -959,10 +958,8 @@ static int __init gic_init_bases(void __iomem *dist_base, set_handle_irq(gic_handle_irq); - node = to_of_node(handle); - if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis() && - node) /* Temp hack to prevent ITS init for ACPI */ - its_init(node, &gic_data.rdists, gic_data.domain); + if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis()) + its_init(handle, &gic_data.rdists, gic_data.domain); gic_smp_init(); gic_dist_init(); diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 99ac022edc60..8361c8d3edd1 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -430,9 +430,9 @@ struct rdists { }; struct irq_domain; -struct device_node; +struct fwnode_handle; int its_cpu_init(void); -int its_init(struct device_node *node, struct rdists *rdists, +int its_init(struct fwnode_handle *handle, struct rdists *rdists, struct irq_domain *domain); static inline bool gic_enable_sre(void) From 3f010cf197324b6c1e87f472e64b87c5f909735e Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Mon, 12 Sep 2016 20:32:25 +0200 Subject: [PATCH 275/538] irqchip/gicv3-its: Probe ITS in the ACPI way ITS is prepared for being initialized different than DT, therefore we can initialize it in ACPI way. 
We collect register base address from MADT table and pass mandatory info to firmware-agnostic ITS init call. Use here IORT lib to register ITS domain which then can be found and used on to build another PCI MSI domain in hierarchical stack domain. NOTE: Waiting for proper ITS and NUMA node relation description in IORT table, we pass around NUMA_NO_NODE to the its_probe_one init call. This means that Cavium ThunderX erratum 23144 (pass1.1 only) is not supported for ACPI boot method yet. Signed-off-by: Tomasz Nowicki Acked-by: Marc Zyngier Reviewed-by: Hanjun Guo Signed-off-by: Marc Zyngier --- drivers/irqchip/Kconfig | 1 + drivers/irqchip/irq-gic-v3-its.c | 61 +++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 7f8728984f44..9aeea1d8a579 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -39,6 +39,7 @@ config ARM_GIC_V3_ITS bool depends on PCI depends on PCI_MSI + select ACPI_IORT if ACPI config ARM_NVIC bool diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index c7518c7b48bc..35c851c14e49 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -15,10 +15,13 @@ * along with this program. If not, see . */ +#include #include #include #include #include +#include +#include #include #include #include @@ -1438,6 +1441,11 @@ static int its_irq_gic_domain_alloc(struct irq_domain *domain, fwspec.param[0] = GIC_IRQ_TYPE_LPI; fwspec.param[1] = hwirq; fwspec.param[2] = IRQ_TYPE_EDGE_RISING; + } else if (is_fwnode_irqchip(domain->parent->fwnode)) { + fwspec.fwnode = domain->parent->fwnode; + fwspec.param_count = 2; + fwspec.param[0] = hwirq; + fwspec.param[1] = IRQ_TYPE_EDGE_RISING; } else { return -EINVAL; } @@ -1797,6 +1805,57 @@ static int __init its_of_probe(struct device_node *node) return 0; } +#ifdef CONFIG_ACPI + +#define ACPI_GICV3_ITS_MEM_SIZE (SZ_128K) + +static int __init gic_acpi_parse_madt_its(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_translator *its_entry; + struct fwnode_handle *dom_handle; + struct resource res; + int err; + + its_entry = (struct acpi_madt_generic_translator *)header; + memset(&res, 0, sizeof(res)); + res.start = its_entry->base_address; + res.end = its_entry->base_address + ACPI_GICV3_ITS_MEM_SIZE - 1; + res.flags = IORESOURCE_MEM; + + dom_handle = irq_domain_alloc_fwnode((void *)its_entry->base_address); + if (!dom_handle) { + pr_err("ITS@%pa: Unable to allocate GICv3 ITS domain token\n", + &res.start); + return -ENOMEM; + } + + err = iort_register_domain_token(its_entry->translation_id, dom_handle); + if (err) { + pr_err("ITS@%pa: Unable to register GICv3 ITS domain token (ITS ID %d) to IORT\n", + &res.start, its_entry->translation_id); + goto dom_err; + } + + err = its_probe_one(&res, dom_handle, NUMA_NO_NODE); + if (!err) + return 0; + + iort_deregister_domain_token(its_entry->translation_id); +dom_err: + irq_domain_free_fwnode(dom_handle); + return err; +} + +static void __init its_acpi_probe(void) +{ + acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_TRANSLATOR, + gic_acpi_parse_madt_its, 0); +} +#else +static void __init its_acpi_probe(void) { } +#endif + int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, struct irq_domain *parent_domain) { @@ -1807,7 +1866,7 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, if (of_node) its_of_probe(of_node); else - return -ENODEV; + its_acpi_probe(); if 
(list_empty(&its_nodes)) { pr_warn("ITS: No ITS available, not enabling LPIs\n"); From db744aaa279fa1d2a06dd6b95f9599acf3557885 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Mon, 12 Sep 2016 20:32:26 +0200 Subject: [PATCH 276/538] irqchip/gicv3-its: Factor out PCI-MSI part that might be reused for ACPI Firmware agnostic code lands in common functions which do necessary domain initialization based on unique domain handler. DT specific code goes to DT specific init call. Signed-off-by: Tomasz Nowicki Acked-by: Marc Zyngier Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its-pci-msi.c | 44 ++++++++++++++++-------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its-pci-msi.c b/drivers/irqchip/irq-gic-v3-its-pci-msi.c index aee60ed025dc..d2c2496d61e9 100644 --- a/drivers/irqchip/irq-gic-v3-its-pci-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-pci-msi.c @@ -106,34 +106,48 @@ static struct of_device_id its_device_id[] = { {}, }; -static int __init its_pci_msi_init(void) +static int __init its_pci_msi_init_one(struct fwnode_handle *handle, + const char *name) { - struct device_node *np; struct irq_domain *parent; + parent = irq_find_matching_fwnode(handle, DOMAIN_BUS_NEXUS); + if (!parent || !msi_get_domain_info(parent)) { + pr_err("%s: Unable to locate ITS domain\n", name); + return -ENXIO; + } + + if (!pci_msi_create_irq_domain(handle, &its_pci_msi_domain_info, + parent)) { + pr_err("%s: Unable to create PCI domain\n", name); + return -ENOMEM; + } + + return 0; +} + +static int __init its_pci_of_msi_init(void) +{ + struct device_node *np; + for (np = of_find_matching_node(NULL, its_device_id); np; np = of_find_matching_node(np, its_device_id)) { if (!of_property_read_bool(np, "msi-controller")) continue; - parent = irq_find_matching_host(np, DOMAIN_BUS_NEXUS); - if (!parent || !msi_get_domain_info(parent)) { - pr_err("%s: unable to locate ITS domain\n", - np->full_name); + if (its_pci_msi_init_one(of_node_to_fwnode(np), np->full_name)) continue; - } - - if (!pci_msi_create_irq_domain(of_node_to_fwnode(np), - &its_pci_msi_domain_info, - parent)) { - pr_err("%s: unable to create PCI domain\n", - np->full_name); - continue; - } pr_info("PCI/MSI: %s domain created\n", np->full_name); } return 0; } + +static int __init its_pci_msi_init(void) +{ + its_pci_of_msi_init(); + + return 0; +} early_initcall(its_pci_msi_init); From 723344dd0b2aa10ef9d28fe7f35d594d3e64f0f9 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Mon, 12 Sep 2016 20:32:27 +0200 Subject: [PATCH 277/538] irqchip/gicv3-its: Use MADT ITS subtable to do PCI/MSI domain initialization Let ACPI build ITS PCI MSI domain. ACPI code is responsible for retrieving inner domain token and passing it on to its_pci_msi_init_one generic init call. IORT maintains list of registered domain tokens and allows to find corresponding domain based on MADT ITS subtable ID info. Signed-off-by: Tomasz Nowicki Acked-by: Marc Zyngier Reviewed-by: Hanjun Guo Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its-pci-msi.c | 44 ++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its-pci-msi.c b/drivers/irqchip/irq-gic-v3-its-pci-msi.c index d2c2496d61e9..aee1c60d7ab5 100644 --- a/drivers/irqchip/irq-gic-v3-its-pci-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-pci-msi.c @@ -15,6 +15,7 @@ * along with this program. If not, see . 
*/ +#include #include #include #include @@ -144,9 +145,52 @@ static int __init its_pci_of_msi_init(void) return 0; } +#ifdef CONFIG_ACPI + +static int __init +its_pci_msi_parse_madt(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_translator *its_entry; + struct fwnode_handle *dom_handle; + const char *node_name; + int err = -ENXIO; + + its_entry = (struct acpi_madt_generic_translator *)header; + node_name = kasprintf(GFP_KERNEL, "ITS@0x%lx", + (long)its_entry->base_address); + dom_handle = iort_find_domain_token(its_entry->translation_id); + if (!dom_handle) { + pr_err("%s: Unable to locate ITS domain handle\n", node_name); + goto out; + } + + err = its_pci_msi_init_one(dom_handle, node_name); + if (!err) + pr_info("PCI/MSI: %s domain created\n", node_name); + +out: + kfree(node_name); + return err; +} + +static int __init its_pci_acpi_msi_init(void) +{ + acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_TRANSLATOR, + its_pci_msi_parse_madt, 0); + return 0; +} +#else +static int __init its_pci_acpi_msi_init(void) +{ + return 0; +} +#endif + static int __init its_pci_msi_init(void) { its_pci_of_msi_init(); + its_pci_acpi_msi_init(); return 0; } From 4440a2ab3b9f40dddbe006331ef0659c76859296 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 13 Sep 2016 08:49:18 +0800 Subject: [PATCH 278/538] netfilter: synproxy: Check oom when adding synproxy and seqadj ct extensions When memory is exhausted, nfct_seqadj_ext_add may fail to add the synproxy and seqadj extensions. The function nf_ct_seqadj_init doesn't check if get valid seqadj pointer by the nfct_seqadj. Now drop the packet directly when fail to add seqadj extension to avoid dereference NULL pointer in nf_ct_seqadj_init from init_conntrack(). Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_synproxy.h | 14 ++++++++++++++ net/netfilter/nf_conntrack_core.c | 6 +++--- net/netfilter/nf_nat_core.c | 3 ++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index 6793614e6502..e6937318546c 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -27,6 +27,20 @@ static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct) #endif } +static inline bool nf_ct_add_synproxy(struct nf_conn *ct, + const struct nf_conn *tmpl) +{ + if (tmpl && nfct_synproxy(tmpl)) { + if (!nfct_seqadj_ext_add(ct)) + return false; + + if (!nfct_synproxy_ext_add(ct)) + return false; + } + + return true; +} + struct synproxy_stats { unsigned int syn_received; unsigned int cookie_invalid; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index dd2c43abf9e2..9934b0c93c1e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1035,9 +1035,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; - if (tmpl && nfct_synproxy(tmpl)) { - nfct_seqadj_ext_add(ct); - nfct_synproxy_ext_add(ct); + if (!nf_ct_add_synproxy(ct, tmpl)) { + nf_conntrack_free(ct); + return ERR_PTR(-ENOMEM); } timeout_ext = tmpl ? 
nf_ct_timeout_find(tmpl) : NULL; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 19c081e1b328..ecee105bbada 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -441,7 +441,8 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; if (nfct_help(ct)) - nfct_seqadj_ext_add(ct); + if (!nfct_seqadj_ext_add(ct)) + return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { From 53a5d5ddccf849dbc27a8c1bba0b43c3a45fb792 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 7 Sep 2016 18:42:08 +0800 Subject: [PATCH 279/538] crypto: echainiv - Replace chaining with multiplication The current implementation uses a global per-cpu array to store data which are used to derive the next IV. This is insecure as the attacker may change the stored data. This patch removes all traces of chaining and replaces it with multiplication of the salt and the sequence number. Fixes: a10f554fa7e0 ("crypto: echainiv - Add encrypted chain IV...") Cc: stable@vger.kernel.org Reported-by: Mathias Krause Signed-off-by: Herbert Xu --- crypto/echainiv.c | 115 ++++++++++------------------------------------ 1 file changed, 24 insertions(+), 91 deletions(-) diff --git a/crypto/echainiv.c b/crypto/echainiv.c index 1b01fe98e91f..e3d889b122e0 100644 --- a/crypto/echainiv.c +++ b/crypto/echainiv.c @@ -1,8 +1,8 @@ /* * echainiv: Encrypted Chain IV Generator * - * This generator generates an IV based on a sequence number by xoring it - * with a salt and then encrypting it with the same key as used to encrypt + * This generator generates an IV based on a sequence number by multiplying + * it with a salt and then encrypting it with the same key as used to encrypt * the plain text. This algorithm requires that the block size be equal * to the IV size. It is mainly useful for CBC. * @@ -24,81 +24,17 @@ #include #include #include -#include #include -#include -#include +#include #include -#define MAX_IV_SIZE 16 - -static DEFINE_PER_CPU(u32 [MAX_IV_SIZE / sizeof(u32)], echainiv_iv); - -/* We don't care if we get preempted and read/write IVs from the next CPU. 
*/ -static void echainiv_read_iv(u8 *dst, unsigned size) -{ - u32 *a = (u32 *)dst; - u32 __percpu *b = echainiv_iv; - - for (; size >= 4; size -= 4) { - *a++ = this_cpu_read(*b); - b++; - } -} - -static void echainiv_write_iv(const u8 *src, unsigned size) -{ - const u32 *a = (const u32 *)src; - u32 __percpu *b = echainiv_iv; - - for (; size >= 4; size -= 4) { - this_cpu_write(*b, *a); - a++; - b++; - } -} - -static void echainiv_encrypt_complete2(struct aead_request *req, int err) -{ - struct aead_request *subreq = aead_request_ctx(req); - struct crypto_aead *geniv; - unsigned int ivsize; - - if (err == -EINPROGRESS) - return; - - if (err) - goto out; - - geniv = crypto_aead_reqtfm(req); - ivsize = crypto_aead_ivsize(geniv); - - echainiv_write_iv(subreq->iv, ivsize); - - if (req->iv != subreq->iv) - memcpy(req->iv, subreq->iv, ivsize); - -out: - if (req->iv != subreq->iv) - kzfree(subreq->iv); -} - -static void echainiv_encrypt_complete(struct crypto_async_request *base, - int err) -{ - struct aead_request *req = base->data; - - echainiv_encrypt_complete2(req, err); - aead_request_complete(req, err); -} - static int echainiv_encrypt(struct aead_request *req) { struct crypto_aead *geniv = crypto_aead_reqtfm(req); struct aead_geniv_ctx *ctx = crypto_aead_ctx(geniv); struct aead_request *subreq = aead_request_ctx(req); - crypto_completion_t compl; - void *data; + __be64 nseqno; + u64 seqno; u8 *info; unsigned int ivsize = crypto_aead_ivsize(geniv); int err; @@ -108,8 +44,6 @@ static int echainiv_encrypt(struct aead_request *req) aead_request_set_tfm(subreq, ctx->child); - compl = echainiv_encrypt_complete; - data = req; info = req->iv; if (req->src != req->dst) { @@ -127,29 +61,30 @@ static int echainiv_encrypt(struct aead_request *req) return err; } - if (unlikely(!IS_ALIGNED((unsigned long)info, - crypto_aead_alignmask(geniv) + 1))) { - info = kmalloc(ivsize, req->base.flags & - CRYPTO_TFM_REQ_MAY_SLEEP ? 
GFP_KERNEL: - GFP_ATOMIC); - if (!info) - return -ENOMEM; - - memcpy(info, req->iv, ivsize); - } - - aead_request_set_callback(subreq, req->base.flags, compl, data); + aead_request_set_callback(subreq, req->base.flags, + req->base.complete, req->base.data); aead_request_set_crypt(subreq, req->dst, req->dst, req->cryptlen, info); aead_request_set_ad(subreq, req->assoclen); - crypto_xor(info, ctx->salt, ivsize); + memcpy(&nseqno, info + ivsize - 8, 8); + seqno = be64_to_cpu(nseqno); + memset(info, 0, ivsize); + scatterwalk_map_and_copy(info, req->dst, req->assoclen, ivsize, 1); - echainiv_read_iv(info, ivsize); - err = crypto_aead_encrypt(subreq); - echainiv_encrypt_complete2(req, err); - return err; + do { + u64 a; + + memcpy(&a, ctx->salt + ivsize - 8, 8); + + a |= 1; + a *= seqno; + + memcpy(info + ivsize - 8, &a, 8); + } while ((ivsize -= 8)); + + return crypto_aead_encrypt(subreq); } static int echainiv_decrypt(struct aead_request *req) @@ -196,8 +131,7 @@ static int echainiv_aead_create(struct crypto_template *tmpl, alg = crypto_spawn_aead_alg(spawn); err = -EINVAL; - if (inst->alg.ivsize & (sizeof(u32) - 1) || - inst->alg.ivsize > MAX_IV_SIZE) + if (inst->alg.ivsize & (sizeof(u64) - 1) || !inst->alg.ivsize) goto free_inst; inst->alg.encrypt = echainiv_encrypt; @@ -206,7 +140,6 @@ static int echainiv_aead_create(struct crypto_template *tmpl, inst->alg.init = aead_init_geniv; inst->alg.exit = aead_exit_geniv; - inst->alg.base.cra_alignmask |= __alignof__(u32) - 1; inst->alg.base.cra_ctxsize = sizeof(struct aead_geniv_ctx); inst->alg.base.cra_ctxsize += inst->alg.ivsize; From acdb04d0b36769b3e05990c488dc74d8b7ac8060 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 13 Sep 2016 14:43:29 +0800 Subject: [PATCH 280/538] crypto: skcipher - Fix blkcipher walk OOM crash When we need to allocate a temporary blkcipher_walk_next and it fails, the code is supposed to take the slow path of processing the data block by block. However, due to an unrelated change we instead end up dereferencing the NULL pointer. This patch fixes it by moving the unrelated bsize setting out of the way so that we enter the slow path as inteded. 
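A reduced sketch of the affected control flow (illustrative only; "need_copy" stands in for the real BLKCIPHER_WALK_COPY/alignment handling, and the actual function is blkcipher_walk_next() shown in the diff below):

	n = walk->total;
	bsize = min(walk->walk_blocksize, n);	/* the fix: compute before the OOM path */

	if (need_copy && !walk->page) {
		walk->page = (void *)__get_free_page(GFP_ATOMIC);
		if (!walk->page)
			n = 0;			/* temporary buffer allocation failed */
	}

	if (n < bsize)				/* true on OOM now that bsize != 0 */
		return blkcipher_next_slow(desc, walk, bsize, walk->alignmask);

	return blkcipher_next_fast(desc, walk);	/* previously reached on OOM with walk->page == NULL */

With the old ordering, bsize was computed after n had already been zeroed, so the "n < bsize" test could not divert the allocation failure into the slow path.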
Fixes: 7607bd8ff03b ("[CRYPTO] blkcipher: Added blkcipher_walk_virt_block") Cc: stable@vger.kernel.org Reported-by: xiakaixu Reported-by: Ard Biesheuvel Signed-off-by: Herbert Xu Tested-by: Ard Biesheuvel --- crypto/blkcipher.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c index 369999530108..a832426820e8 100644 --- a/crypto/blkcipher.c +++ b/crypto/blkcipher.c @@ -233,6 +233,8 @@ static int blkcipher_walk_next(struct blkcipher_desc *desc, return blkcipher_walk_done(desc, walk, -EINVAL); } + bsize = min(walk->walk_blocksize, n); + walk->flags &= ~(BLKCIPHER_WALK_SLOW | BLKCIPHER_WALK_COPY | BLKCIPHER_WALK_DIFF); if (!scatterwalk_aligned(&walk->in, walk->alignmask) || @@ -245,7 +247,6 @@ static int blkcipher_walk_next(struct blkcipher_desc *desc, } } - bsize = min(walk->walk_blocksize, n); n = scatterwalk_clamp(&walk->in, n); n = scatterwalk_clamp(&walk->out, n); From f82e90b28654804ab72881d577d87c3d5c65e2bc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 13 Sep 2016 09:48:52 +0100 Subject: [PATCH 281/538] crypto: arm/aes-ctr - fix NULL dereference in tail processing The AES-CTR glue code avoids calling into the blkcipher API for the tail portion of the walk, by comparing the remainder of walk.nbytes modulo AES_BLOCK_SIZE with the residual nbytes, and jumping straight into the tail processing block if they are equal. This tail processing block checks whether nbytes != 0, and does nothing otherwise. However, in case of an allocation failure in the blkcipher layer, we may enter this code with walk.nbytes == 0, while nbytes > 0. In this case, we should not dereference the source and destination pointers, since they may be NULL. So instead of checking for nbytes != 0, check for (walk.nbytes % AES_BLOCK_SIZE) != 0, which implies the former in non-error conditions. Fixes: 86464859cc77 ("crypto: arm - AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions") Cc: stable@vger.kernel.org Reported-by: xiakaixu Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/aes-ce-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index da3c0428507b..aef022a87c53 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -284,7 +284,7 @@ static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); } - if (nbytes) { + if (walk.nbytes % AES_BLOCK_SIZE) { u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; u8 __aligned(8) tail[AES_BLOCK_SIZE]; From 2db34e78f126c6001d79d3b66ab1abb482dc7caa Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 13 Sep 2016 09:48:53 +0100 Subject: [PATCH 282/538] crypto: arm64/aes-ctr - fix NULL dereference in tail processing The AES-CTR glue code avoids calling into the blkcipher API for the tail portion of the walk, by comparing the remainder of walk.nbytes modulo AES_BLOCK_SIZE with the residual nbytes, and jumping straight into the tail processing block if they are equal. This tail processing block checks whether nbytes != 0, and does nothing otherwise. However, in case of an allocation failure in the blkcipher layer, we may enter this code with walk.nbytes == 0, while nbytes > 0. In this case, we should not dereference the source and destination pointers, since they may be NULL. 
So instead of checking for nbytes != 0, check for (walk.nbytes % AES_BLOCK_SIZE) != 0, which implies the former in non-error conditions. Fixes: 49788fe2a128 ("arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions") Cc: stable@vger.kernel.org Reported-by: xiakaixu Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 5c888049d061..6b2aa0fd6cd0 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -216,7 +216,7 @@ static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); } - if (nbytes) { + if (walk.nbytes % AES_BLOCK_SIZE) { u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; u8 __aligned(8) tail[AES_BLOCK_SIZE]; From e2753293ac4bce8623650bb2d610b7e657bc869f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 12 Sep 2016 13:38:42 -0700 Subject: [PATCH 283/538] x86/pkeys: Fix pkeys build breakage for some non-x86 arches Guenter Roeck reported breakage on the h8300 and c6x architectures (among others) caused by the new memory protection keys syscalls. This patch does what Arnd suggested and adds them to kernel/sys_ni.c. Fixes: a60f7b69d92c ("generic syscalls: Wire up memory protection keys syscalls") Reported-and-tested-by: Guenter Roeck Signed-off-by: Dave Hansen Acked-by: Arnd Bergmann Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20160912203842.48E7AC50@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- kernel/sys_ni.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2c5e3a8e00d7..635482e60ca3 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -250,3 +250,8 @@ cond_syscall(sys_execveat); /* membarrier */ cond_syscall(sys_membarrier); + +/* memory protection keys */ +cond_syscall(sys_pkey_mprotect); +cond_syscall(sys_pkey_alloc); +cond_syscall(sys_pkey_free); From 74ab0e7a836a7df772af50cac21267eb43688841 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:27 +0200 Subject: [PATCH 284/538] x86/mce/AMD: Use msr_ops.misc() in allocate_threshold_blocks() Change MSR_IA32_MCx_MISC() macro to msr_ops.misc() because SMCA machines define a different set of MSRs and msr_ops will give you the correct MISC register. 
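To illustrate the idea (the struct and function names below are made up for brevity; the real accessors live in arch/x86/kernel/cpu/mcheck/mce.c and are selected during early MCE init), the MSR address for a bank is obtained through function pointers, so legacy and SMCA register layouts resolve transparently:

	struct mca_msr_accessors {
		u32 (*status)(int bank);
		u32 (*misc)(int bank);
		u32 (*addr)(int bank);
	};

	static u32 legacy_misc(int bank) { return MSR_IA32_MCx_MISC(bank); }
	static u32 smca_misc(int bank)   { return MSR_AMD64_SMCA_MCx_MISC(bank); }

	/* roughly: msr_ops.misc = mce_flags.smca ? smca_misc : legacy_misc; */

threshold_create_bank() then passes msr_ops.misc(bank) down into allocate_threshold_blocks() instead of hard-coding the legacy macro.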
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1468269447-8808-1-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 7b7f3be783d4..78b7681f7f66 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -869,7 +869,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } } - err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank)); + err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank)); if (!err) goto out; From db819d60f6720080150a365080ff656cf239f88f Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:28 +0200 Subject: [PATCH 285/538] x86/mce: Add support for new MCA_SYND register Syndrome information is no longer contained in MCA_STATUS for SMCA systems but in a new register - MCA_SYND. Add a synd field to struct mce to hold MCA_SYND register value. Add it to the end of struct mce to maintain compatibility with old versions of mcelog. Also, add it to the respective tracepoint. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1467633035-32080-1-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mce.h | 5 ++++- arch/x86/include/uapi/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 3 +++ include/trace/events/mce.h | 6 ++++-- 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8bf766ef0e18..21bc5a3a4c89 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,9 +40,10 @@ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ /* AMD-specific bits */ +#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ +#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. 
valid */ #define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ -#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ /* * McaX field if set indicates a given bank supports MCA extensions: @@ -110,6 +111,7 @@ #define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003 #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 #define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 +#define MSR_AMD64_SMCA_MC0_SYND 0xc0002006 #define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 #define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a @@ -119,6 +121,7 @@ #define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_SYND(x) (MSR_AMD64_SMCA_MC0_SYND + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 2184943341bf..8c75fbc94c3f 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -26,6 +26,7 @@ struct mce { __u32 socketid; /* CPU socket ID */ __u32 apicid; /* CPU initial apic ID */ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ + __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7f3f0e147242..91a179b95fd0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -569,6 +569,7 @@ static void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(msr_ops.misc(i)); + if (m->status & MCI_STATUS_ADDRV) { m->addr = mce_rdmsrl(msr_ops.addr(i)); @@ -581,6 +582,9 @@ static void mce_read_aux(struct mce *m, int i) m->addr <<= shift; } } + + if (mce_flags.smca && (m->status & MCI_STATUS_SYNDV)) + m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); } static bool memory_error(struct mce *m) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 78b7681f7f66..419e0ee3b12f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -479,6 +479,9 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) if (m.status & MCI_STATUS_ADDRV) rdmsrl(msr_addr, m.addr); + if (mce_flags.smca && (m.status & MCI_STATUS_SYNDV)) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + mce_log(&m); wrmsrl(msr_status, 0); diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index 4cbbcef6baa8..8be5268caf28 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -20,6 +20,7 @@ TRACE_EVENT(mce_record, __field( u64, status ) __field( u64, addr ) __field( u64, misc ) + __field( u64, synd ) __field( u64, ip ) __field( u64, tsc ) __field( u64, walltime ) @@ -38,6 +39,7 @@ TRACE_EVENT(mce_record, __entry->status = m->status; __entry->addr = m->addr; __entry->misc = m->misc; + __entry->synd = m->synd; __entry->ip = m->ip; __entry->tsc = m->tsc; __entry->walltime = m->time; @@ -50,11 +52,11 @@ TRACE_EVENT(mce_record, __entry->cpuvendor = m->cpuvendor; ), - TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, 
ADDR/MISC: %016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x", + TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x", __entry->cpu, __entry->mcgcap, __entry->mcgstatus, __entry->bank, __entry->status, - __entry->addr, __entry->misc, + __entry->addr, __entry->misc, __entry->synd, __entry->cs, __entry->ip, __entry->tsc, __entry->cpuvendor, __entry->cpuid, From b300e87300b68120aa5374341b252875a1cb6ea1 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:29 +0200 Subject: [PATCH 286/538] EDAC/mce_amd: Print syndrome register value on SMCA systems Print SyndV bit status and print the raw value of the MCA_SYND register. Further decoding of the syndrome from struct mce.synd can be done in other places where appropriate, e.g. DRAM ECC. Boris: make the error stanza more compact by putting the error address and syndrome on the same line: [Hardware Error]: Corrected error, no action required. [Hardware Error]: CPU:2 (17:0:0) MC4_STATUS[-|CE|-|PCC|AddrV|-|-|SyndV|CECC]: 0x96204100001e0117 [Hardware Error]: Error Addr: 0x000000007f4c52e3, Syndrome: 0x0000000000000000 [Hardware Error]: Invalid IP block specified. [Hardware Error]: cache level: L3/GEN, tx: DATA, mem-tx: RD Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1467633035-32080-2-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- drivers/edac/mce_amd.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 9b6800a79c7f..057ece577800 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -927,7 +927,7 @@ static void decode_smca_errors(struct mce *m) size_t len; if (rdmsr_safe(addr, &low, &high)) { - pr_emerg("Invalid IP block specified, error information is unreliable.\n"); + pr_emerg(HW_ERR "Invalid IP block specified.\n"); return; } @@ -1078,6 +1078,8 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) u32 low, high; u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); + if (!rdmsr_safe(addr, &low, &high) && (low & MCI_CONFIG_MCAX)) pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); @@ -1091,12 +1093,18 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) pr_cont("]: 0x%016llx\n", m->status); if (m->status & MCI_STATUS_ADDRV) - pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); + pr_emerg(HW_ERR "Error Addr: 0x%016llx", m->addr); if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (m->status & MCI_STATUS_SYNDV) + pr_cont(", Syndrome: 0x%016llx", m->synd); + + pr_cont("\n"); + decode_smca_errors(m); goto err_code; - } + } else + pr_cont("\n"); if (!fam_ops) goto err_code; From bad744b7f29d264c2c2ad8fb723dd480e6c9b007 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:30 +0200 Subject: [PATCH 287/538] x86/RAS: Add syndrome support to mce_amd_inj Add a debugfs file which holds the error syndrome (written into MCA_SYND) of an injected error. Only write it on SMCA systems. Update README file, while at it. 
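Condensed flow of the new field, stitched together from the hunks below (only the SMCA, non-deferred injection path is shown):

	/* a write to the new debugfs "synd" file lands in i_mce.synd
	 * via MCE_INJECT_SET(synd) / synd_fops */

	if (i_mce.synd)				/* do_inject(): mark the syndrome valid */
		i_mce.status |= MCI_STATUS_SYNDV;

	/* prepare_msrs(), SMCA branch: program the bank's MCA_SYND register */
	wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), i_mce.synd);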
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1467633035-32080-3-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/ras/mce_amd_inj.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 1104515d5ad2..ff8eb1a9ce6d 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -68,6 +68,7 @@ static int inj_##reg##_set(void *data, u64 val) \ MCE_INJECT_SET(status); MCE_INJECT_SET(misc); MCE_INJECT_SET(addr); +MCE_INJECT_SET(synd); #define MCE_INJECT_GET(reg) \ static int inj_##reg##_get(void *data, u64 *val) \ @@ -81,10 +82,12 @@ static int inj_##reg##_get(void *data, u64 *val) \ MCE_INJECT_GET(status); MCE_INJECT_GET(misc); MCE_INJECT_GET(addr); +MCE_INJECT_GET(synd); DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); /* * Caller needs to be make sure this cpu doesn't disappear @@ -258,6 +261,7 @@ static void prepare_msrs(void *info) } wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), i_mce.synd); } else { wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status); wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr); @@ -275,6 +279,9 @@ static void do_inject(void) if (i_mce.misc) i_mce.status |= MCI_STATUS_MISCV; + if (i_mce.synd) + i_mce.status |= MCI_STATUS_SYNDV; + if (inj_type == SW_INJ) { mce_inject_log(&i_mce); return; @@ -371,6 +378,9 @@ static const char readme_msg[] = "\t used for error thresholding purposes and its validity is indicated by\n" "\t MCi_STATUS[MiscV].\n" "\n" +"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n" +"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" +"\n" "addr:\t Error address value to be written to MCi_ADDR. Log address information\n" "\t associated with the error.\n" "\n" @@ -420,6 +430,7 @@ static struct dfs_node { { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, From cfee4f6f0b2026380c6bc6913dbd27943df17371 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:31 +0200 Subject: [PATCH 288/538] x86/mce/AMD: Read MSRs on the CPU allocating the threshold blocks Scalable MCA systems allow non-core MCA banks to only be accessible by certain CPUs. The MSRs for these banks are Read-as-Zero on other CPUs. During allocate_threshold_blocks(), get_block_address() can be scheduled on CPUs other than the one allocating the block. This causes the MSRs to be read on the wrong CPU and results in incorrect behavior. Add a @cpu parameter to get_block_address() and pass this in to ensure that the MSRs are only read on the CPU that is allocating the block. 
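The difference is only in where the read executes; a minimal sketch (error handling elided; rdmsr_safe_on_cpu() is the existing helper from arch/x86/lib/msr-smp.c):

	u32 low, high;

	/* Reads on whichever CPU this code happens to be scheduled on;
	 * a non-core SMCA bank MSR can read as zero here. */
	rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high);

	/* Routes the read to @cpu, the CPU that owns the bank being
	 * allocated, so the returned value is the real register contents. */
	rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high);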
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1472673994-12235-2-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 419e0ee3b12f..9da92fb2e073 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -293,7 +293,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 get_block_address(u32 current_addr, u32 low, u32 high, +static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block) { u32 addr = 0, offset = 0; @@ -309,13 +309,13 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, */ u32 low, high; - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) return addr; if (!(low & MCI_CONFIG_MCAX)) return addr; - if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && (low & MASK_BLKPTR_LO)) addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); } @@ -421,12 +421,12 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, void mce_amd_feature_init(struct cpuinfo_x86 *c) { u32 low = 0, high = 0, address = 0; - unsigned int bank, block; + unsigned int bank, block, cpu = smp_processor_id(); int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(cpu, address, low, high, bank, block); if (!address) break; @@ -544,15 +544,14 @@ static void amd_deferred_error_interrupt(void) static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; - int cpu = smp_processor_id(); - unsigned int bank, block; + unsigned int bank, block, cpu = smp_processor_id(); /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(cpu, address, low, high, bank, block); if (!address) break; @@ -774,7 +773,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, if (err) goto out_free; recurse: - address = get_block_address(address, low, high, bank, ++block); + address = get_block_address(cpu, address, low, high, bank, ++block); if (!address) return 0; From c019b951e1f9f1de0c5b0726032e3adf34c523a7 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:32 +0200 Subject: [PATCH 289/538] EDAC/mce_amd: Add missing SMCA error descriptions Add missing SMCA error descriptions to the error descriptions arrays. 
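The description arrays are indexed directly by the extended error code, so a missing entry means an architecturally defined error cannot be decoded. Roughly (a simplified sketch; the decoder's real lookup, masking and bank dispatch are in drivers/edac/mce_amd.c):

	u8 xec = (m->status >> 16) & 0x3f;	/* SMCA extended error code field */

	if (xec < ARRAY_SIZE(f17h_if_mce_desc))
		pr_cont("%s.\n", f17h_if_mce_desc[xec]);
	else
		pr_cont("Unrecognized error code.\n");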
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1472673994-12235-3-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- drivers/edac/mce_amd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 057ece577800..455cd49d6253 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -185,6 +185,8 @@ static const char * const f17h_if_mce_desc[] = { "BPQ snoop parity on Thread 1", "L1 BTB multi-match error", "L2 BTB multi-match error", + "L2 Cache Response Poison error", + "System Read Data error", }; static const char * const f17h_l2_mce_desc[] = { @@ -198,6 +200,7 @@ static const char * const f17h_de_mce_desc[] = { "uop cache tag parity error", "uop cache data parity error", "Insn buffer parity error", + "uop queue parity error", "Insn dispatch queue parity error", "Fetch address FIFO parity", "Patch RAM data parity", @@ -214,6 +217,9 @@ static const char * const f17h_ex_mce_desc[] = { "EX payload parity", "Checkpoint queue parity", "Retire dispatch queue parity", + "Retire status queue parity error", + "Scheduling queue parity error", + "Branch buffer queue parity error", }; static const char * const f17h_fp_mce_desc[] = { @@ -223,6 +229,7 @@ static const char * const f17h_fp_mce_desc[] = { "NSQ parity error", "Retire queue parity", "Status register file parity", + "Hardware assertion", }; static const char * const f17h_l3_mce_desc[] = { From 856095b1794be487527771dbd2fe28e34e94b266 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:33 +0200 Subject: [PATCH 290/538] EDAC/mce_amd: Use SMCA prefix for error descriptions arrays The error descriptions defined for Fam17h can be reused for other SMCA systems, so their names should reflect this. Change f17h prefix to smca for error descriptions. 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1472673994-12235-4-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- drivers/edac/mce_amd.c | 80 +++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 455cd49d6253..ea549a94361b 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -148,12 +148,12 @@ static const char * const mc6_mce_desc[] = { }; /* Scalable MCA error strings */ -static const char * const f17h_ls_mce_desc[] = { +static const char * const smca_ls_mce_desc[] = { "Load queue parity", "Store queue parity", "Miss address buffer payload parity", "L1 TLB parity", - "", /* reserved */ + "Reserved", "DC tag error type 6", "DC tag error type 1", "Internal error type 1", @@ -172,7 +172,7 @@ static const char * const f17h_ls_mce_desc[] = { "L2 fill data error", }; -static const char * const f17h_if_mce_desc[] = { +static const char * const smca_if_mce_desc[] = { "microtag probe port parity error", "IC microtag or full tag multi-hit error", "IC full tag parity", @@ -189,14 +189,14 @@ static const char * const f17h_if_mce_desc[] = { "System Read Data error", }; -static const char * const f17h_l2_mce_desc[] = { +static const char * const smca_l2_mce_desc[] = { "L2M tag multi-way-hit error", "L2M tag ECC error", "L2M data ECC error", "HW assert", }; -static const char * const f17h_de_mce_desc[] = { +static const char * const smca_de_mce_desc[] = { "uop cache tag parity error", "uop cache data parity error", "Insn buffer parity error", @@ -208,7 +208,7 @@ static const char * const f17h_de_mce_desc[] = { "uop buffer parity" }; -static const char * const f17h_ex_mce_desc[] = { +static const char * const smca_ex_mce_desc[] = { "Watchdog timeout error", "Phy register file parity", "Flag register file parity", @@ -222,7 +222,7 @@ static const char * const f17h_ex_mce_desc[] = { "Branch buffer queue parity error", }; -static const char * const f17h_fp_mce_desc[] = { +static const char * const smca_fp_mce_desc[] = { "Physical register file parity", "Freelist parity error", "Schedule queue parity", @@ -232,7 +232,7 @@ static const char * const f17h_fp_mce_desc[] = { "Hardware assertion", }; -static const char * const f17h_l3_mce_desc[] = { +static const char * const smca_l3_mce_desc[] = { "Shadow tag macro ECC error", "Shadow tag macro multi-way-hit error", "L3M tag ECC error", @@ -243,7 +243,7 @@ static const char * const f17h_l3_mce_desc[] = { "L3 HW assert", }; -static const char * const f17h_cs_mce_desc[] = { +static const char * const smca_cs_mce_desc[] = { "Illegal request from transport layer", "Address violation", "Security violation", @@ -255,14 +255,14 @@ static const char * const f17h_cs_mce_desc[] = { "ECC error on probe filter access", }; -static const char * const f17h_pie_mce_desc[] = { +static const char * const smca_pie_mce_desc[] = { "HW assert", "Internal PIE register security violation", "Error on GMI link", "Poison data written to internal PIE register", }; -static const char * const f17h_umc_mce_desc[] = { +static const char * const smca_umc_mce_desc[] = { "DRAM ECC error", "Data poison error on DRAM", "SDP parity error", @@ -271,15 +271,15 @@ static const char * const f17h_umc_mce_desc[] = { "Write data CRC error", }; -static const char * const f17h_pb_mce_desc[] = { +static const char * const smca_pb_mce_desc[] = { "Parameter Block RAM ECC error", }; -static const char * const 
f17h_psp_mce_desc[] = { +static const char * const smca_psp_mce_desc[] = { "PSP RAM ECC or parity error", }; -static const char * const f17h_smu_mce_desc[] = { +static const char * const smca_smu_mce_desc[] = { "SMU RAM ECC or parity error", }; @@ -837,8 +837,8 @@ static void decode_f17h_core_errors(const char *ip_name, u8 xec, switch (mca_type) { case SMCA_LS: - error_desc_array = f17h_ls_mce_desc; - len = ARRAY_SIZE(f17h_ls_mce_desc) - 1; + error_desc_array = smca_ls_mce_desc; + len = ARRAY_SIZE(smca_ls_mce_desc) - 1; if (xec == 0x4) { pr_cont("Unrecognized LS MCA error code.\n"); @@ -847,33 +847,33 @@ static void decode_f17h_core_errors(const char *ip_name, u8 xec, break; case SMCA_IF: - error_desc_array = f17h_if_mce_desc; - len = ARRAY_SIZE(f17h_if_mce_desc) - 1; + error_desc_array = smca_if_mce_desc; + len = ARRAY_SIZE(smca_if_mce_desc) - 1; break; case SMCA_L2_CACHE: - error_desc_array = f17h_l2_mce_desc; - len = ARRAY_SIZE(f17h_l2_mce_desc) - 1; + error_desc_array = smca_l2_mce_desc; + len = ARRAY_SIZE(smca_l2_mce_desc) - 1; break; case SMCA_DE: - error_desc_array = f17h_de_mce_desc; - len = ARRAY_SIZE(f17h_de_mce_desc) - 1; + error_desc_array = smca_de_mce_desc; + len = ARRAY_SIZE(smca_de_mce_desc) - 1; break; case SMCA_EX: - error_desc_array = f17h_ex_mce_desc; - len = ARRAY_SIZE(f17h_ex_mce_desc) - 1; + error_desc_array = smca_ex_mce_desc; + len = ARRAY_SIZE(smca_ex_mce_desc) - 1; break; case SMCA_FP: - error_desc_array = f17h_fp_mce_desc; - len = ARRAY_SIZE(f17h_fp_mce_desc) - 1; + error_desc_array = smca_fp_mce_desc; + len = ARRAY_SIZE(smca_fp_mce_desc) - 1; break; case SMCA_L3_CACHE: - error_desc_array = f17h_l3_mce_desc; - len = ARRAY_SIZE(f17h_l3_mce_desc) - 1; + error_desc_array = smca_l3_mce_desc; + len = ARRAY_SIZE(smca_l3_mce_desc) - 1; break; default: @@ -899,13 +899,13 @@ static void decode_df_errors(u8 xec, unsigned int mca_type) switch (mca_type) { case SMCA_CS: - error_desc_array = f17h_cs_mce_desc; - len = ARRAY_SIZE(f17h_cs_mce_desc) - 1; + error_desc_array = smca_cs_mce_desc; + len = ARRAY_SIZE(smca_cs_mce_desc) - 1; break; case SMCA_PIE: - error_desc_array = f17h_pie_mce_desc; - len = ARRAY_SIZE(f17h_pie_mce_desc) - 1; + error_desc_array = smca_pie_mce_desc; + len = ARRAY_SIZE(smca_pie_mce_desc) - 1; break; default: @@ -963,23 +963,23 @@ static void decode_smca_errors(struct mce *m) break; case SMCA_UMC: - error_desc_array = f17h_umc_mce_desc; - len = ARRAY_SIZE(f17h_umc_mce_desc) - 1; + error_desc_array = smca_umc_mce_desc; + len = ARRAY_SIZE(smca_umc_mce_desc) - 1; break; case SMCA_PB: - error_desc_array = f17h_pb_mce_desc; - len = ARRAY_SIZE(f17h_pb_mce_desc) - 1; + error_desc_array = smca_pb_mce_desc; + len = ARRAY_SIZE(smca_pb_mce_desc) - 1; break; case SMCA_PSP: - error_desc_array = f17h_psp_mce_desc; - len = ARRAY_SIZE(f17h_psp_mce_desc) - 1; + error_desc_array = smca_psp_mce_desc; + len = ARRAY_SIZE(smca_psp_mce_desc) - 1; break; case SMCA_SMU: - error_desc_array = f17h_smu_mce_desc; - len = ARRAY_SIZE(f17h_smu_mce_desc) - 1; + error_desc_array = smca_smu_mce_desc; + len = ARRAY_SIZE(smca_smu_mce_desc) - 1; break; default: From 5896820e0aa32572ad03b30563c539655b6c6375 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:34 +0200 Subject: [PATCH 291/538] x86/mce/AMD, EDAC/mce_amd: Define and use tables for known SMCA IP types Scalable MCA defines a number of IP types. An MCA bank on an SMCA system is defined as one of these IP types. 
A bank's type is uniquely identified by the combination of the HWID and MCATYPE values read from its MCA_IPID register. Add the required tables in order to be able to lookup error descriptions based on a bank's type and the error's extended error code. [ bp: Align comments, simplify a bit. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1472741832-1690-1-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mce.h | 61 +++++---- arch/x86/kernel/cpu/mcheck/mce_amd.c | 104 ++++++++++---- drivers/edac/mce_amd.c | 194 +++++---------------------- 3 files changed, 147 insertions(+), 212 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 21bc5a3a4c89..9bd7ff5ffbcc 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -337,44 +337,47 @@ extern void apei_mce_report_mem_error(int corrected, * Scalable MCA. */ #ifdef CONFIG_X86_MCE_AMD -enum amd_ip_types { - SMCA_F17H_CORE = 0, /* Core errors */ - SMCA_DF, /* Data Fabric */ - SMCA_UMC, /* Unified Memory Controller */ - SMCA_PB, /* Parameter Block */ - SMCA_PSP, /* Platform Security Processor */ - SMCA_SMU, /* System Management Unit */ - N_AMD_IP_TYPES -}; - -struct amd_hwid { - const char *name; - unsigned int hwid; -}; -extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES]; - -enum amd_core_mca_blocks { +/* These may be used by multiple smca_hwid_mcatypes */ +enum smca_bank_types { SMCA_LS = 0, /* Load Store */ SMCA_IF, /* Instruction Fetch */ - SMCA_L2_CACHE, /* L2 cache */ - SMCA_DE, /* Decoder unit */ - RES, /* Reserved */ - SMCA_EX, /* Execution unit */ + SMCA_L2_CACHE, /* L2 Cache */ + SMCA_DE, /* Decoder Unit */ + SMCA_EX, /* Execution Unit */ SMCA_FP, /* Floating Point */ - SMCA_L3_CACHE, /* L3 cache */ - N_CORE_MCA_BLOCKS + SMCA_L3_CACHE, /* L3 Cache */ + SMCA_CS, /* Coherent Slave */ + SMCA_PIE, /* Power, Interrupts, etc. */ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ + SMCA_SMU, /* System Management Unit */ + N_SMCA_BANK_TYPES +}; + +struct smca_bank_name { + const char *name; /* Short name for sysfs */ + const char *long_name; /* Long name for pretty-printing */ }; -extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS]; +extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES]; + +#define HWID_MCATYPE(hwid, mcatype) ((hwid << 16) | mcatype) -enum amd_df_mca_blocks { - SMCA_CS = 0, /* Coherent Slave */ - SMCA_PIE, /* Power management, Interrupts, etc */ - N_DF_BLOCKS +struct smca_hwid_mcatype { + unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ + u32 hwid_mcatype; /* (hwid,mcatype) tuple */ + u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. 
*/ }; -extern const char * const amd_df_mcablock_names[N_DF_BLOCKS]; +struct smca_bank_info { + struct smca_hwid_mcatype *type; + u32 type_instance; +}; + +extern struct smca_bank_info smca_banks[MAX_NR_BANKS]; + #endif #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9da92fb2e073..3b74b62d0808 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -63,34 +63,55 @@ static const char * const th_names[] = { "execution_unit", }; -/* Define HWID to IP type mappings for Scalable MCA */ -struct amd_hwid amd_hwids[] = { - [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, - [SMCA_DF] = { "data_fabric", 0x2E }, - [SMCA_UMC] = { "umc", 0x96 }, - [SMCA_PB] = { "param_block", 0x5 }, - [SMCA_PSP] = { "psp", 0xFF }, - [SMCA_SMU] = { "smu", 0x1 }, +struct smca_bank_name smca_bank_names[] = { + [SMCA_LS] = { "load_store", "Load Store Unit" }, + [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, + [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_EX] = { "execution_unit", "Execution Unit" }, + [SMCA_FP] = { "floating_point", "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, + [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, + [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, + [SMCA_UMC] = { "umc", "Unified Memory Controller" }, + [SMCA_PB] = { "param_block", "Parameter Block" }, + [SMCA_PSP] = { "psp", "Platform Security Processor" }, + [SMCA_SMU] = { "smu", "System Management Unit" }, }; -EXPORT_SYMBOL_GPL(amd_hwids); - -const char * const amd_core_mcablock_names[] = { - [SMCA_LS] = "load_store", - [SMCA_IF] = "insn_fetch", - [SMCA_L2_CACHE] = "l2_cache", - [SMCA_DE] = "decode_unit", - [RES] = "", - [SMCA_EX] = "execution_unit", - [SMCA_FP] = "floating_point", - [SMCA_L3_CACHE] = "l3_cache", -}; -EXPORT_SYMBOL_GPL(amd_core_mcablock_names); +EXPORT_SYMBOL_GPL(smca_bank_names); + +static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { + /* { bank_type, hwid_mcatype, xec_bitmap } */ + + /* ZN Core (HWID=0xB0) MCA types */ + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, + /* HWID 0xB0 MCATYPE 0x4 is Reserved */ + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, + + /* Data Fabric MCA types */ + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF }, + + /* Unified Memory Controller MCA type */ + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F }, + + /* Parameter Block MCA type */ + { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, -const char * const amd_df_mcablock_names[] = { - [SMCA_CS] = "coherent_slave", - [SMCA_PIE] = "pie", + /* Platform Security Processor MCA type */ + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, + + /* System Management Unit MCA type */ + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, }; -EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + +struct smca_bank_info smca_banks[MAX_NR_BANKS]; +EXPORT_SYMBOL_GPL(smca_banks); static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ @@ -108,6 +129,36 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; * CPU Initialization */ +static void get_smca_bank_info(unsigned int bank) +{ + unsigned int 
i, hwid_mcatype, cpu = smp_processor_id(); + struct smca_hwid_mcatype *type; + u32 high, instanceId; + u16 hwid, mcatype; + + /* Collect bank_info using CPU 0 for now. */ + if (cpu) + return; + + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) { + pr_warn("Failed to read MCA_IPID for bank %d\n", bank); + return; + } + + hwid = high & MCI_IPID_HWID; + mcatype = (high & MCI_IPID_MCATYPE) >> 16; + hwid_mcatype = HWID_MCATYPE(hwid, mcatype); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { + type = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == type->hwid_mcatype) { + smca_banks[bank].type = type; + smca_banks[bank].type_instance = instanceId; + break; + } + } +} + struct thresh_restart { struct threshold_block *b; int reset; @@ -425,6 +476,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (mce_flags.smca) + get_smca_bank_info(bank); + for (block = 0; block < NR_BLOCKS; ++block) { address = get_block_address(cpu, address, low, high, bank, block); if (!address) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index ea549a94361b..99b3bf3f4182 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -283,6 +283,27 @@ static const char * const smca_smu_mce_desc[] = { "SMU RAM ECC or parity error", }; +struct smca_mce_desc { + const char * const *descs; + unsigned int num_descs; +}; + +static struct smca_mce_desc smca_mce_descs[] = { + [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, + [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, + [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, + [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, + [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) }, + [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, + [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, + [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, + [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, + [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, + [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, + [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, + [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, +}; + static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; @@ -827,175 +848,32 @@ static void decode_mc6_mce(struct mce *m) pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); } -static void decode_f17h_core_errors(const char *ip_name, u8 xec, - unsigned int mca_type) -{ - const char * const *error_desc_array; - size_t len; - - pr_emerg(HW_ERR "%s Error: ", ip_name); - - switch (mca_type) { - case SMCA_LS: - error_desc_array = smca_ls_mce_desc; - len = ARRAY_SIZE(smca_ls_mce_desc) - 1; - - if (xec == 0x4) { - pr_cont("Unrecognized LS MCA error code.\n"); - return; - } - break; - - case SMCA_IF: - error_desc_array = smca_if_mce_desc; - len = ARRAY_SIZE(smca_if_mce_desc) - 1; - break; - - case SMCA_L2_CACHE: - error_desc_array = smca_l2_mce_desc; - len = ARRAY_SIZE(smca_l2_mce_desc) - 1; - break; - - case SMCA_DE: - error_desc_array = smca_de_mce_desc; - len = ARRAY_SIZE(smca_de_mce_desc) - 1; - break; - - case SMCA_EX: - error_desc_array = smca_ex_mce_desc; - len = ARRAY_SIZE(smca_ex_mce_desc) - 1; - break; - - case SMCA_FP: - error_desc_array = smca_fp_mce_desc; - len = ARRAY_SIZE(smca_fp_mce_desc) - 1; - break; - - case SMCA_L3_CACHE: - error_desc_array = 
smca_l3_mce_desc; - len = ARRAY_SIZE(smca_l3_mce_desc) - 1; - break; - - default: - pr_cont("Corrupted MCA core error info.\n"); - return; - } - - if (xec > len) { - pr_cont("Unrecognized %s MCA bank error code.\n", - amd_core_mcablock_names[mca_type]); - return; - } - - pr_cont("%s.\n", error_desc_array[xec]); -} - -static void decode_df_errors(u8 xec, unsigned int mca_type) -{ - const char * const *error_desc_array; - size_t len; - - pr_emerg(HW_ERR "Data Fabric Error: "); - - switch (mca_type) { - case SMCA_CS: - error_desc_array = smca_cs_mce_desc; - len = ARRAY_SIZE(smca_cs_mce_desc) - 1; - break; - - case SMCA_PIE: - error_desc_array = smca_pie_mce_desc; - len = ARRAY_SIZE(smca_pie_mce_desc) - 1; - break; - - default: - pr_cont("Corrupted MCA Data Fabric info.\n"); - return; - } - - if (xec > len) { - pr_cont("Unrecognized %s MCA bank error code.\n", - amd_df_mcablock_names[mca_type]); - return; - } - - pr_cont("%s.\n", error_desc_array[xec]); -} - /* Decode errors according to Scalable MCA specification */ static void decode_smca_errors(struct mce *m) { - u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank); - unsigned int hwid, mca_type, i; - u8 xec = XEC(m->status, xec_mask); - const char * const *error_desc_array; + struct smca_hwid_mcatype *type; + unsigned int bank_type; const char *ip_name; - u32 low, high; - size_t len; + u8 xec = XEC(m->status, xec_mask); - if (rdmsr_safe(addr, &low, &high)) { - pr_emerg(HW_ERR "Invalid IP block specified.\n"); + if (m->bank >= ARRAY_SIZE(smca_banks)) return; - } - - hwid = high & MCI_IPID_HWID; - mca_type = (high & MCI_IPID_MCATYPE) >> 16; - pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low); - - /* - * Based on hwid and mca_type values, decode errors from respective IPs. - * Note: mca_type values make sense only in the context of an hwid. - */ - for (i = 0; i < ARRAY_SIZE(amd_hwids); i++) - if (amd_hwids[i].hwid == hwid) - break; - - switch (i) { - case SMCA_F17H_CORE: - ip_name = (mca_type == SMCA_L3_CACHE) ? 
- "L3 Cache" : "F17h Core"; - return decode_f17h_core_errors(ip_name, xec, mca_type); - break; - - case SMCA_DF: - return decode_df_errors(xec, mca_type); - break; - - case SMCA_UMC: - error_desc_array = smca_umc_mce_desc; - len = ARRAY_SIZE(smca_umc_mce_desc) - 1; - break; - - case SMCA_PB: - error_desc_array = smca_pb_mce_desc; - len = ARRAY_SIZE(smca_pb_mce_desc) - 1; - break; - - case SMCA_PSP: - error_desc_array = smca_psp_mce_desc; - len = ARRAY_SIZE(smca_psp_mce_desc) - 1; - break; - - case SMCA_SMU: - error_desc_array = smca_smu_mce_desc; - len = ARRAY_SIZE(smca_smu_mce_desc) - 1; - break; - - default: - pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid); + type = smca_banks[m->bank].type; + if (!type) return; - } - ip_name = amd_hwids[i].name; - pr_emerg(HW_ERR "%s Error: ", ip_name); + bank_type = type->bank_type; + ip_name = smca_bank_names[bank_type].long_name; - if (xec > len) { - pr_cont("Unrecognized %s MCA bank error code.\n", ip_name); - return; - } + pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec); - pr_cont("%s.\n", error_desc_array[xec]); + /* Only print the decode of valid error codes */ + if (xec < smca_mce_descs[bank_type].num_descs && + (type->xec_bitmap & BIT_ULL(xec))) { + pr_emerg(HW_ERR "%s Error: ", ip_name); + pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]); + } } static inline void amd_decode_err_code(u16 ec) From 87a6d4091bd795b43d684bfc87253e04a263af1c Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:35 +0200 Subject: [PATCH 292/538] x86/mce/AMD: Update sysfs bank names for SMCA systems Define a bank's sysfs filename based on its IP type and InstanceId. Credits go to Aravind for: * The general idea and proto- get_name(). * Defining smca_umc_block_names[] and buf_mcatype[]. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Aravind Gopalakrishnan Link: http://lkml.kernel.org/r/1473193490-3291-2-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 49 ++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 3b74b62d0808..0f9d0786bc97 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -63,6 +64,11 @@ static const char * const th_names[] = { "execution_unit", }; +static const char * const smca_umc_block_names[] = { + "dram_ecc", + "misc_umc" +}; + struct smca_bank_name smca_bank_names[] = { [SMCA_LS] = { "load_store", "Load Store Unit" }, [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, @@ -113,6 +119,17 @@ static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { struct smca_bank_info smca_banks[MAX_NR_BANKS]; EXPORT_SYMBOL_GPL(smca_banks); +/* + * In SMCA enabled processors, we can have multiple banks for a given IP type. + * So to define a unique name for each bank, we use a temp c-string to append + * the MCA_IPID[InstanceId] to type's name in get_name(). + * + * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN + * is greater than 8 plus 1 (for underscore) plus length of longest type name. 
+ */ +#define MAX_MCATYPE_NAME_LEN 30 +static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ @@ -769,6 +786,34 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; +static const char *get_name(unsigned int bank, struct threshold_block *b) +{ + unsigned int bank_type; + + if (!mce_flags.smca) { + if (b && bank == 4) + return bank4_names(b); + + return th_names[bank]; + } + + if (!smca_banks[bank].type) + return NULL; + + bank_type = smca_banks[bank].type->bank_type; + + if (b && bank_type == SMCA_UMC) { + if (b->block < ARRAY_SIZE(smca_umc_block_names)) + return smca_umc_block_names[b->block]; + return NULL; + } + + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, + "%s_%x", smca_bank_names[bank_type].name, + smca_banks[bank].type_instance); + return buf_mcatype; +} + static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, unsigned int block, u32 address) { @@ -823,7 +868,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, - (bank == 4 ? bank4_names(b) : th_names[bank])); + get_name(bank, b)); if (err) goto out_free; recurse: @@ -878,7 +923,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) struct device *dev = per_cpu(mce_device, cpu); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; - const char *name = th_names[bank]; + const char *name = get_name(bank, NULL); int err = 0; if (is_shared_bank(bank)) { From 66ef269dbbe45e264ccf7146d5db32b04478d148 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:36 +0200 Subject: [PATCH 293/538] x86/mce/AMD: Ensure the deferred error interrupt is of type APIC on SMCA systems The Deferred Error Interrupt Type is set per bank on Scalable MCA systems. This is done in a bitfield in the MCA_CONFIG register of each bank. We should set its type to APIC-based interrupt and not assume BIOS has set it for us. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1472737486-1720-1-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 0f9d0786bc97..16766e09c2b7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -463,6 +463,20 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, */ smca_high &= ~BIT(2); + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. 
+ */ + if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3)) + smca_high |= BIT(5); + wrmsr(smca_addr, smca_low, smca_high); } From 5828c46f2c07b97d758da6dc6afd5c374768d44d Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:37 +0200 Subject: [PATCH 294/538] x86/mce/AMD: Save MCA_IPID in MCE struct on SMCA systems The MCA_IPID register uniquely identifies a bank's type and instance on Scalable MCA systems. We should save the value of this register in struct mce along with the other relevant error information. This ensures that we can decode errors without relying on system software to correlate the bank to the type. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1472680624-34221-1-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/include/uapi/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++++-- arch/x86/kernel/cpu/mcheck/mce_amd.c | 8 ++++++-- include/trace/events/mce.h | 5 ++++- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 8c75fbc94c3f..69a6e07e3149 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -27,6 +27,7 @@ struct mce { __u32 apicid; /* CPU initial apic ID */ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ + __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 91a179b95fd0..17e9ff011c0e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -583,8 +583,12 @@ static void mce_read_aux(struct mce *m, int i) } } - if (mce_flags.smca && (m->status & MCI_STATUS_SYNDV)) - m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + if (mce_flags.smca) { + m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + + if (m->status & MCI_STATUS_SYNDV) + m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + } } static bool memory_error(struct mce *m) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 16766e09c2b7..d2f92ab5322f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -564,8 +564,12 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) if (m.status & MCI_STATUS_ADDRV) rdmsrl(msr_addr, m.addr); - if (mce_flags.smca && (m.status & MCI_STATUS_SYNDV)) - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + if (mce_flags.smca) { + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + + if (m.status & MCI_STATUS_SYNDV) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + } mce_log(&m); diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index 8be5268caf28..70f02149808c 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -21,6 +21,7 @@ TRACE_EVENT(mce_record, __field( u64, addr ) __field( u64, misc ) __field( u64, synd ) + __field( u64, ipid ) __field( u64, ip ) __field( u64, tsc ) __field( u64, walltime ) @@ -40,6 +41,7 @@ TRACE_EVENT(mce_record, __entry->addr = m->addr; __entry->misc = m->misc; __entry->synd = m->synd; + __entry->ipid = m->ipid; __entry->ip = m->ip; __entry->tsc = m->tsc; __entry->walltime = m->time; @@ -52,10 +54,11 @@ TRACE_EVENT(mce_record, __entry->cpuvendor = m->cpuvendor; ), - TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, 
RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x", + TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x", __entry->cpu, __entry->mcgcap, __entry->mcgstatus, __entry->bank, __entry->status, + __entry->ipid, __entry->addr, __entry->misc, __entry->synd, __entry->cs, __entry->ip, __entry->tsc, From 4b711f92c9b21878794597997ecda1428acc334c Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:38 +0200 Subject: [PATCH 295/538] x86/mce, EDAC/mce_amd: Print MCA_SYND and MCA_IPID during MCE on SMCA systems The MCA_SYND and MCA_IPID registers contain valuable information and should be included in MCE output. The MCA_SYND register contains syndrome and other error information, and the MCA_IPID register will uniquely identify the MCA bank's type without having to rely on system software. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1472680624-34221-2-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce.c | 7 +++++++ drivers/edac/mce_amd.c | 2 ++ 2 files changed, 9 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 17e9ff011c0e..7d905e3d58a2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -293,6 +293,13 @@ static void print_mce(struct mce *m) if (m->misc) pr_cont("MISC %llx ", m->misc); + if (mce_flags.smca) { + if (m->synd) + pr_cont("SYND %llx ", m->synd); + if (m->ipid) + pr_cont("IPID %llx ", m->ipid); + } + pr_cont("\n"); /* * Note this output is parsed by external tools and old fields diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 99b3bf3f4182..e8855a4f92d9 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -984,6 +984,8 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_SYNDV) pr_cont(", Syndrome: 0x%016llx", m->synd); + pr_cont(", IPID: 0x%016llx", m->ipid); + pr_cont("\n"); decode_smca_errors(m); From 4f29b73bae158e3635b8f289f77376b054904ef5 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:39 +0200 Subject: [PATCH 296/538] x86/mce/AMD: Extract the error address on SMCA systems The MCA_ADDR registers on Scalable MCA systems contain the ErrorAddr in bits [55:0] and the least significant bit of the address in bits [61:56]. We should extract the valid ErrorAddr bits from the MCA_ADDR register rather than saving the raw value to struct mce. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1473275643-1721-1-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce.c | 10 ++++++++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 13 ++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7d905e3d58a2..a7fdf453d895 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -588,6 +588,16 @@ static void mce_read_aux(struct mce *m, int i) m->addr >>= shift; m->addr <<= shift; } + + /* + * Extract [55:] where lsb is the least significant + * *valid* bit of the address bits. 
+ */ + if (mce_flags.smca) { + u8 lsb = (m->addr >> 56) & 0x3f; + + m->addr &= GENMASK_ULL(55, lsb); + } } if (mce_flags.smca) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index d2f92ab5322f..9b5403462936 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -561,9 +561,20 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) if (threshold_err) m.misc = misc; - if (m.status & MCI_STATUS_ADDRV) + if (m.status & MCI_STATUS_ADDRV) { rdmsrl(msr_addr, m.addr); + /* + * Extract [55:] where lsb is the least significant + * *valid* bit of the address bits. + */ + if (mce_flags.smca) { + u8 lsb = (m.addr >> 56) & 0x3f; + + m.addr &= GENMASK_ULL(55, lsb); + } + } + if (mce_flags.smca) { rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); From a884675b873a0185d2626d1f304987c94cef6d74 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 12 Sep 2016 09:59:40 +0200 Subject: [PATCH 297/538] x86/MCE/AMD, EDAC: Handle reserved bank 4 on Fam17h properly Bank 4 is reserved on family 0x17 and shouldn't generate any MCE records. However, broken hardware and software is not something unheard of so warn about bank 4 errors. They shouldn't be coming from bank 4 naturally but users can still use mce_amd_inj to simulate errors from it for testing purposed. Also, avoid special handling in the injector mce_amd_inj like it is being done on the older families. [ bp: Rewrite commit message and merge into one patch. Use boot_cpu_data. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Reviewed-by: Aravind Gopalakrishnan Link: http://lkml.kernel.org/r/1473384591-5323-1-git-send-email-Yazen.Ghannam@amd.com Link: http://lkml.kernel.org/r/1473384591-5323-2-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/ras/mce_amd_inj.c | 4 +++- drivers/edac/mce_amd.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index ff8eb1a9ce6d..f4b442cc8a3e 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -308,7 +308,9 @@ static void do_inject(void) * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for * Fam10h and later BKDGs. 
*/ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) { + if (static_cpu_has(X86_FEATURE_AMD_DCM) && + b == 4 && + boot_cpu_data.x86 < 0x17) { toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); cpu = get_nbc_for_node(amd_get_nb_id(cpu)); } diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e8855a4f92d9..daaac2c79ca7 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -859,6 +859,9 @@ static void decode_smca_errors(struct mce *m) if (m->bank >= ARRAY_SIZE(smca_banks)) return; + if (boot_cpu_data.x86 >= 0x17 && m->bank == 4) + pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n"); + type = smca_banks[m->bank].type; if (!type) return; From 7cc4ef8ed132e72ba44804cae3ddb2587ff757d6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 12 Sep 2016 09:59:41 +0200 Subject: [PATCH 298/538] x86/RAS/mce_amd_inj: Fix some W= warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In particular: arch/x86/ras/mce_amd_inj.c: In function ‘prepare_msrs’: arch/x86/ras/mce_amd_inj.c:249:13: warning: declaration of ‘i_mce’ shadows a global declaration [-Wshadow] struct mce i_mce = *(struct mce *)info; ^~~~~ arch/x86/ras/mce_amd_inj.c: In function ‘init_mce_inject’: arch/x86/ras/mce_amd_inj.c:453:16: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20160912075941.24699-16-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/ras/mce_amd_inj.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index f4b442cc8a3e..cd318d93099e 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -246,28 +246,27 @@ static void toggle_nb_mca_mst_cpu(u16 nid) static void prepare_msrs(void *info) { - struct mce i_mce = *(struct mce *)info; - u8 b = i_mce.bank; + struct mce m = *(struct mce *)info; + u8 b = m.bank; - wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus); + wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); if (boot_cpu_has(X86_FEATURE_SMCA)) { - if (i_mce.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr); + if (m.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr); + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); } - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc); - wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), i_mce.synd); + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); } else { - wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr); - wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc); + wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); } - } static void do_inject(void) @@ -441,7 +440,7 @@ static struct dfs_node { static int __init init_mce_inject(void) { - int i; + unsigned int i; u64 cap; rdmsrl(MSR_IA32_MCG_CAP, cap); From ba6d018e3d2f6a0fad58a668cadf66b2d1f80f59 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sat, 10 Sep 2016 20:30:45 +0200 Subject: [PATCH 299/538] x86/mm/pkeys: Do not skip PKRU register if debug registers are not used 
__show_regs() fails to dump the PKRU state when the debug registers are in their default state because there is a return statement on the debug register state. Change the logic to report PKRU value even when debug registers are in their default state. Fixes:c0b17b5bd4b7 ("x86/mm/pkeys: Dump PKRU with other kernel registers") Signed-off-by: Nicolas Iooss Acked-by: Dave Hansen Link: http://lkml.kernel.org/r/20160910183045.4618-1-nicolas.iooss_linux@m4x.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_64.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 63236d8f84bf..a21068e49dac 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -110,12 +110,13 @@ void __show_regs(struct pt_regs *regs, int all) get_debugreg(d7, 7); /* Only print out debug registers if they are in their non-default state. */ - if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && - (d6 == DR6_RESERVED) && (d7 == 0x400)) - return; - - printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); - printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && + (d6 == DR6_RESERVED) && (d7 == 0x400))) { + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", + d0, d1, d2); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", + d3, d6, d7); + } if (boot_cpu_has(X86_FEATURE_OSPKE)) printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); From 3e1be7ad2d38c6bd6aeef96df9bd0a7822f4e51c Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 9 Sep 2016 22:43:12 +0800 Subject: [PATCH 300/538] bnx2: Reset device during driver initialization When system enters into kdump kernel because of kernel panic, it won't shutdown devices. On-flight DMA will continue transferring data until device driver initializes. All devices are supposed to reset during driver initialization. And this property is used to fix the kdump failure in system with intel iommu. Other systems with hardware iommu should be similar. Please check commit 091d42e ("iommu/vt-d: Copy translation tables from old kernel") and those commits around. But bnx2 driver doesn't reset device during driver initialization. The device resetting is deferred to net device up stage. This will cause hardware iommu handling failure on bnx2 device. And its resetting relies on firmware. So in this patch move the firmware requesting code to earlier bnx2_init_one(), then next call bnx2_reset_chip to reset device. Signed-off-by: Baoquan He Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnx2.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index 8fc3f3c137f8..505ceaf451e2 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -6356,10 +6356,6 @@ bnx2_open(struct net_device *dev) struct bnx2 *bp = netdev_priv(dev); int rc; - rc = bnx2_request_firmware(bp); - if (rc < 0) - goto out; - netif_carrier_off(dev); bnx2_disable_int(bp); @@ -6428,7 +6424,6 @@ bnx2_open(struct net_device *dev) bnx2_free_irq(bp); bnx2_free_mem(bp); bnx2_del_napi(bp); - bnx2_release_firmware(bp); goto out; } @@ -8575,6 +8570,12 @@ bnx2_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, dev); + rc = bnx2_request_firmware(bp); + if (rc < 0) + goto error; + + + bnx2_reset_chip(bp, BNX2_DRV_MSG_CODE_RESET); memcpy(dev->dev_addr, bp->mac_addr, ETH_ALEN); dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_SG | @@ -8607,6 +8608,7 @@ bnx2_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; error: + bnx2_release_firmware(bp); pci_iounmap(pdev, bp->regview); pci_release_regions(pdev); pci_disable_device(pdev); From 715f5552b1e90ba3eecf6d1a6d044d0d5226663f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 10 Sep 2016 23:11:23 +0800 Subject: [PATCH 301/538] sctp: hold the transport before using it in sctp_hash_cmp Since commit 4f0087812648 ("sctp: apply rhashtable api to send/recv path"), sctp uses transport rhashtable with .obj_cmpfn sctp_hash_cmp, in which it compares the members of the transport with the rhashtable args to check if it's the right transport. But sctp uses the transport without holding it in sctp_hash_cmp, it can cause a use-after-free panic. As after it gets transport from hashtable, another CPU may close the sk and free the asoc. In sctp_association_free, it frees all the transports, meanwhile, the assoc's refcnt may be reduced to 0, assoc can be destroyed by sctp_association_destroy. So after that, transport->assoc is actually an unavailable memory address in sctp_hash_cmp. Although sctp_hash_cmp is under rcu_read_lock, it still can not avoid this, as assoc is not freed by RCU. This patch is to hold the transport before checking it's members with sctp_transport_hold, in which it checks the refcnt first, holds it if it's not 0. Fixes: 4f0087812648 ("sctp: apply rhashtable api to send/recv path") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- net/sctp/input.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index 69444d32ecda..1555fb8c68e0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -796,27 +796,34 @@ struct sctp_hash_cmp_arg { static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { + struct sctp_transport *t = (struct sctp_transport *)ptr; const struct sctp_hash_cmp_arg *x = arg->key; - const struct sctp_transport *t = ptr; - struct sctp_association *asoc = t->asoc; - const struct net *net = x->net; + struct sctp_association *asoc; + int err = 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) - return 1; - if (!net_eq(sock_net(asoc->base.sk), net)) - return 1; + return err; + if (!sctp_transport_hold(t)) + return err; + + asoc = t->asoc; + if (!net_eq(sock_net(asoc->base.sk), x->net)) + goto out; if (x->ep) { if (x->ep != asoc->ep) - return 1; + goto out; } else { if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) - return 1; + goto out; if (!sctp_bind_addr_match(&asoc->base.bind_addr, x->laddr, sctp_sk(asoc->base.sk))) - return 1; + goto out; } - return 0; + err = 0; +out: + sctp_transport_put(t); + return err; } static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed) From 57ccdf449f962ab5fc8cbf26479402f13bdb8be7 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 7 Sep 2016 18:51:13 +0800 Subject: [PATCH 302/538] tick/nohz: Prevent stopping the tick on an offline CPU can_stop_full_tick() has no check for offline cpus. So it allows to stop the tick on an offline cpu from the interrupt return path, which is wrong and subsequently makes irq_work_needs_cpu() warn about being called for an offline cpu. Commit f7ea0fd639c2c4 ("tick: Don't invoke tick_nohz_stop_sched_tick() if the cpu is offline") added prevention for can_stop_idle_tick(), but forgot to do the same in can_stop_full_tick(). Add it. [ tglx: Massaged changelog ] Signed-off-by: Wanpeng Li Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1473245473-4463-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2ec7c00228f3..3bcb61b52f6c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -186,10 +186,13 @@ static bool check_tick_dependency(atomic_t *dep) return false; } -static bool can_stop_full_tick(struct tick_sched *ts) +static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { WARN_ON_ONCE(!irqs_disabled()); + if (unlikely(!cpu_online(cpu))) + return false; + if (check_tick_dependency(&tick_dep_mask)) return false; @@ -843,7 +846,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (can_stop_full_tick(ts)) + if (can_stop_full_tick(cpu, ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get()); From 440f895aa97f81a2bdc02993da5360a1f6da2fb5 Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Sun, 11 Sep 2016 21:43:34 +0200 Subject: [PATCH 303/538] drivers: net: phy: xgene: Fix 'remove' function If 'IS_ERR(pdata->clk)' is true, then 'clk_disable_unprepare(pdata->clk)' will do nothing. It is likely that 'if (!IS_ERR(pdata->clk))' was expected here. 
In fact, the test can even be removed because 'clk_disable_unprepare' already handles such cases. Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/phy/mdio-xgene.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/mdio-xgene.c b/drivers/net/phy/mdio-xgene.c index 775674808249..92af182951be 100644 --- a/drivers/net/phy/mdio-xgene.c +++ b/drivers/net/phy/mdio-xgene.c @@ -424,10 +424,8 @@ static int xgene_mdio_remove(struct platform_device *pdev) mdiobus_unregister(mdio_bus); mdiobus_free(mdio_bus); - if (dev->of_node) { - if (IS_ERR(pdata->clk)) - clk_disable_unprepare(pdata->clk); - } + if (dev->of_node) + clk_disable_unprepare(pdata->clk); return 0; } From ad5987b47e96a0fb6d13fea250e936aed000093c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 15:53:55 +0200 Subject: [PATCH 304/538] nl80211: validate number of probe response CSA counters Due to an apparent copy/paste bug, the number of counters for the beacon configuration were checked twice, instead of checking the number of probe response counters. Fix this to check the number of probe response counters before parsing those. Cc: stable@vger.kernel.org Fixes: 9a774c78e211 ("cfg80211: Support multiple CSA counters") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f02653a08993..4809f4d2cdcc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6978,7 +6978,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) params.n_counter_offsets_presp = len / sizeof(u16); if (rdev->wiphy.max_num_csa_counters && - (params.n_counter_offsets_beacon > + (params.n_counter_offsets_presp > rdev->wiphy.max_num_csa_counters)) return -EINVAL; From d59dc7bcfa649ef2128a76b6487b16f4b3f14d23 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 8 Sep 2016 21:30:53 -0400 Subject: [PATCH 305/538] sched/numa, mm: Revert to checking pmd/pte_write instead of VMA flags Commit: 4d9424669946 ("mm: convert p[te|md]_mknonnuma and remaining page table manipulations") changed NUMA balancing from _PAGE_NUMA to using PROT_NONE, and was quickly found to introduce a regression with NUMA grouping. It was followed up by these commits: 53da3bc2ba9e ("mm: fix up numa read-only thread grouping logic") bea66fbd11af ("mm: numa: group related processes based on VMA flags instead of page table flags") b191f9b106ea ("mm: numa: preserve PTE write permissions across a NUMA hinting fault") The first of those two commits try alternate approaches to NUMA grouping, which apparently do not work as well as looking at the PTE write permissions. The latter patch preserves the PTE write permissions across a NUMA protection fault. However, it forgets to revert the condition for whether or not to group tasks together back to what it was before v3.19, even though the information is now preserved in the page tables once again. This patch brings the NUMA grouping heuristic back to what it was before commit 4d9424669946, which the changelogs of subsequent commits suggest worked best. We have all the information again. We should probably use it. 
Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: aarcange@redhat.com Cc: linux-mm@kvack.org Cc: mgorman@suse.de Link: http://lkml.kernel.org/r/20160908213053.07c992a9@annuminas.surriel.com Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 2 +- mm/memory.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2db2112aa31e..c8bde270f557 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1168,7 +1168,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) } /* See similar comment in do_numa_page for explanation */ - if (!(vma->vm_flags & VM_WRITE)) + if (!pmd_write(pmd)) flags |= TNF_NO_GROUP; /* diff --git a/mm/memory.c b/mm/memory.c index 83be99d9d8a1..558c85270ae2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3398,7 +3398,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ - if (!(vma->vm_flags & VM_WRITE)) + if (!pte_write(pte)) flags |= TNF_NO_GROUP; /* From 1ef0199a1a698d82ecd39d11d1daa3f4ab006c75 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 12 Sep 2016 15:05:50 -0700 Subject: [PATCH 306/538] selftests/x86/sigreturn: Use CX, not AX, as the scratch register RAX is handled specially in ESPFIX64. Use CX as our scratch register so that, if something goes wrong with RAX handling, we'll notice. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9ceeb24ea56925586c330dc46306f757ddea9fb5.1473717910.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/sigreturn.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index 8a577e7070c6..246145b84a12 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -106,7 +106,7 @@ asm (".pushsection .text\n\t" ".type int3, @function\n\t" ".align 4096\n\t" "int3:\n\t" - "mov %ss,%eax\n\t" + "mov %ss,%ecx\n\t" "int3\n\t" ".size int3, . - int3\n\t" ".align 4096, 0xcc\n\t" @@ -306,7 +306,7 @@ static volatile sig_atomic_t sig_corrupt_final_ss; #ifdef __x86_64__ # define REG_IP REG_RIP # define REG_SP REG_RSP -# define REG_AX REG_RAX +# define REG_CX REG_RCX struct selectors { unsigned short cs, gs, fs, ss; @@ -326,7 +326,7 @@ static unsigned short *csptr(ucontext_t *ctx) #else # define REG_IP REG_EIP # define REG_SP REG_ESP -# define REG_AX REG_EAX +# define REG_CX REG_ECX static greg_t *ssptr(ucontext_t *ctx) { @@ -457,10 +457,10 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) ctx->uc_mcontext.gregs[REG_IP] = sig_cs == code16_sel ? 0 : (unsigned long)&int3; ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL; - ctx->uc_mcontext.gregs[REG_AX] = 0; + ctx->uc_mcontext.gregs[REG_CX] = 0; memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); - requested_regs[REG_AX] = *ssptr(ctx); /* The asm code does this. */ + requested_regs[REG_CX] = *ssptr(ctx); /* The asm code does this. 
*/ return; } @@ -482,7 +482,7 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) unsigned short ss; asm ("mov %%ss,%0" : "=r" (ss)); - greg_t asm_ss = ctx->uc_mcontext.gregs[REG_AX]; + greg_t asm_ss = ctx->uc_mcontext.gregs[REG_CX]; if (asm_ss != sig_ss && sig == SIGTRAP) { /* Sanity check failure. */ printf("[FAIL]\tSIGTRAP: ss = %hx, frame ss = %hx, ax = %llx\n", @@ -654,8 +654,8 @@ static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) #endif /* Sanity check on the kernel */ - if (i == REG_AX && requested_regs[i] != resulting_regs[i]) { - printf("[FAIL]\tAX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n", + if (i == REG_CX && requested_regs[i] != resulting_regs[i]) { + printf("[FAIL]\tCX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n", (unsigned long long)requested_regs[i], (unsigned long long)resulting_regs[i]); nerrs++; From 85063fac1f72419eec4349621fe829b07f9acb1e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 12 Sep 2016 15:05:51 -0700 Subject: [PATCH 307/538] x86/entry/64: Clean up and document espfix64 stack setup The espfix64 setup code was a bit inscrutible and contained an unnecessary push of RAX. Remove that push, update all the stack offsets to match, and document the whole mess. Reported-By: Borislav Petkov Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e5459eb10cf1175c8b36b840bc425f210d045f35.1473717910.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 64 ++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c0373d667674..e7fba58f4d9c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -586,27 +586,69 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: - pushq %rax - pushq %rdi + /* + * We are running with user GSBASE. All GPRs contain their user + * values. We have a percpu ESPFIX stack that is eight slots + * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom + * of the ESPFIX stack. + * + * We clobber RAX and RDI in this code. We stash RDI on the + * normal stack and RAX on the ESPFIX stack. + * + * The ESPFIX stack layout we set up looks like this: + * + * --- top of ESPFIX stack --- + * SS + * RSP + * RFLAGS + * CS + * RIP <-- RSP points here when we're done + * RAX <-- espfix_waddr points here + * --- bottom of ESPFIX stack --- + */ + + pushq %rdi /* Stash user RDI */ SWAPGS movq PER_CPU_VAR(espfix_waddr), %rdi - movq %rax, (0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp), %rax /* RIP */ + movq %rax, (0*8)(%rdi) /* user RAX */ + movq (1*8)(%rsp), %rax /* user RIP */ movq %rax, (1*8)(%rdi) - movq (3*8)(%rsp), %rax /* CS */ + movq (2*8)(%rsp), %rax /* user CS */ movq %rax, (2*8)(%rdi) - movq (4*8)(%rsp), %rax /* RFLAGS */ + movq (3*8)(%rsp), %rax /* user RFLAGS */ movq %rax, (3*8)(%rdi) - movq (6*8)(%rsp), %rax /* SS */ + movq (5*8)(%rsp), %rax /* user SS */ movq %rax, (5*8)(%rdi) - movq (5*8)(%rsp), %rax /* RSP */ + movq (4*8)(%rsp), %rax /* user RSP */ movq %rax, (4*8)(%rdi) - andl $0xffff0000, %eax - popq %rdi + /* Now RAX == RSP. */ + + andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ + popq %rdi /* Restore user RDI */ + + /* + * espfix_stack[31:16] == 0. 
The page tables are set up such that + * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of + * espfix_waddr for any X. That is, there are 65536 RO aliases of + * the same page. Set up RSP so that RSP[31:16] contains the + * respective 16 bits of the /userspace/ RSP and RSP nonetheless + * still points to an RO alias of the ESPFIX stack. + */ orq PER_CPU_VAR(espfix_stack), %rax SWAPGS movq %rax, %rsp - popq %rax + + /* + * At this point, we cannot write to the stack any more, but we can + * still read. + */ + popq %rax /* Restore user RAX */ + + /* + * RSP now points to an ordinary IRET frame, except that the page + * is read-only and RSP[31:16] are preloaded with the userspace + * values. We can now IRET back to userspace. + */ jmp native_irq_return_iret #endif END(common_interrupt) From f148b41e8b2e114d0aba023adf326b03368f3246 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 11 Sep 2016 14:58:21 +0900 Subject: [PATCH 308/538] x86: Clean up various simple wrapper functions Remove unneeded variables and assignments. While we are here, let's fix the following as well: - Remove unnecessary parentheses - Remove unnecessary unsigned-suffix 'U' from constant values - Reword the comment in set_apic_id() (suggested by Thomas Gleixner) Signed-off-by: Masahiro Yamada Cc: Alex Thorlton Cc: Andrew Banman Cc: Borislav Petkov Cc: Daniel J Blueman Cc: Denys Vlasenko Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Travis Cc: Nathan Zimmer Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steffen Persvold Cc: Thomas Gleixner Cc: Toshi Kani Cc: Wei Jiangang Link: http://lkml.kernel.org/r/1473573502-27954-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 16 +++------------- arch/x86/kernel/apic/apic_numachip.c | 5 +---- arch/x86/kernel/apic/x2apic_uv_x.c | 7 ++----- arch/x86/mm/pat_rbtree.c | 4 +--- arch/x86/platform/uv/bios_uv.c | 7 ++----- arch/x86/platform/uv/tlb_uv.c | 6 +----- 6 files changed, 10 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 5b2ae106bd4a..70796f51b2ff 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -116,27 +116,17 @@ static void flat_send_IPI_all(int vector) static unsigned int flat_get_apic_id(unsigned long x) { - unsigned int id; - - id = (((x)>>24) & 0xFFu); - - return id; + return (x >> 24) & 0xFF; } static unsigned long set_apic_id(unsigned int id) { - unsigned long x; - - x = ((id & 0xFFu)<<24); - return x; + return (id & 0xFF) << 24; } static unsigned int read_xapic_id(void) { - unsigned int id; - - id = flat_get_apic_id(apic_read(APIC_ID)); - return id; + return flat_get_apic_id(apic_read(APIC_ID)); } static int flat_apic_id_registered(void) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 714d4fda0d52..e08fe2c8dd8c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -40,10 +40,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) static unsigned long numachip1_set_apic_id(unsigned int id) { - unsigned long x; - - x = ((id & 0xffU) << 24); - return x; + return (id & 0xff) << 24; } static unsigned int numachip2_get_apic_id(unsigned long x) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index cb0673c1e940..0f8cd928f368 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -533,11 +533,8 
@@ static unsigned int x2apic_get_apic_id(unsigned long x) static unsigned long set_apic_id(unsigned int id) { - unsigned long x; - - /* maskout x2apic_extra_bits ? */ - x = id; - return x; + /* CHECKME: Do we need to mask out the xapic extra bits? */ + return id; } static unsigned int uv_read_apic_id(void) diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index de391b7bc19a..159b52ccd600 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -254,9 +254,7 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end) struct memtype *rbt_memtype_lookup(u64 addr) { - struct memtype *data; - data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); - return data; + return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); } #if defined(CONFIG_DEBUG_FS) diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 23f2f3e41c7f..b4d5e95fe4df 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -149,11 +149,8 @@ EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); s64 uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len) { - s64 ret; - - ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, - (u64)addr, buf, (u64)len, 0); - return ret; + return uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, + (u64)addr, buf, (u64)len, 0); } EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index fdb4d42b4ce5..276e1b7bba3c 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -580,11 +580,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, */ static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc) { - unsigned long descriptor_status; - - descriptor_status = - ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1; - return descriptor_status; + return ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1; } /* From 0b97a484e52cb423662eb98904aad82dafcc1f10 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 09:41:34 +0200 Subject: [PATCH 309/538] mac80211: check skb_linearize() return value The A-MSDU TX code (within TXQs) didn't always check the return value of skb_linearize() properly, resulting in potentially passing a frag- list SKB down to the driver even when it said it can't handle it. Fix that. Fixes: 6e0456b545456 ("mac80211: add A-MSDU tx support") Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index cc8e95554b48..18b285e06bc8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1515,8 +1515,12 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, spin_unlock_bh(&fq->lock); if (skb && skb_has_frag_list(skb) && - !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) - skb_linearize(skb); + !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) { + if (skb_linearize(skb)) { + ieee80211_free_txskb(&local->hw, skb); + return NULL; + } + } return skb; } From ecb3f394c5dba897d215a5422f1b363e93e2ce4e Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Tue, 13 Sep 2016 12:14:51 -0400 Subject: [PATCH 310/538] genirq: Expose interrupt information through sysfs Information about interrupts is exposed via /proc/interrupts, but the format of that file has changed over kernel versions and differs across architectures. It also has varying column numbers depending on hardware. 
That all makes it hard for tools to parse. To solve this, expose the information through sysfs so each irq attribute is in a separate file in a consistent, machine parsable way. This feature is only available when both CONFIG_SPARSE_IRQ and CONFIG_SYSFS are enabled. Examples: /sys/kernel/irq/18/actions: i801_smbus,ehci_hcd:usb1,uhci_hcd:usb7 /sys/kernel/irq/18/chip_name: IR-IO-APIC /sys/kernel/irq/18/hwirq: 18 /sys/kernel/irq/18/name: fasteoi /sys/kernel/irq/18/per_cpu_count: 0,0 /sys/kernel/irq/18/type: level /sys/kernel/irq/25/actions: ahci0 /sys/kernel/irq/25/chip_name: IR-PCI-MSI /sys/kernel/irq/25/hwirq: 512000 /sys/kernel/irq/25/name: edge /sys/kernel/irq/25/per_cpu_count: 29036,0 /sys/kernel/irq/25/type: edge [ tglx: Moved kobject_del() under sparse_irq_lock, massaged code comments and changelog ] Signed-off-by: Craig Gallek Cc: David Decotigny Link: http://lkml.kernel.org/r/1473783291-122873-1-git-send-email-kraigatgoog@gmail.com Signed-off-by: Thomas Gleixner --- Documentation/ABI/testing/sysfs-kernel-irq | 53 ++++++ include/linux/irqdesc.h | 3 + kernel/irq/irqdesc.c | 193 ++++++++++++++++++++- 3 files changed, 247 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-irq diff --git a/Documentation/ABI/testing/sysfs-kernel-irq b/Documentation/ABI/testing/sysfs-kernel-irq new file mode 100644 index 000000000000..eb074b100986 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-irq @@ -0,0 +1,53 @@ +What: /sys/kernel/irq +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: Directory containing information about the system's IRQs. + Specifically, data from the associated struct irq_desc. + The information here is similar to that in /proc/interrupts + but in a more machine-friendly format. This directory contains + one subdirectory for each Linux IRQ number. + +What: /sys/kernel/irq//actions +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: The IRQ action chain. A comma-separated list of zero or more + device names associated with this interrupt. + +What: /sys/kernel/irq//chip_name +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: Human-readable chip name supplied by the associated device + driver. + +What: /sys/kernel/irq//hwirq +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: When interrupt translation domains are used, this file contains + the underlying hardware IRQ number used for this Linux IRQ. + +What: /sys/kernel/irq//name +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: Human-readable flow handler name as defined by the irq chip + driver. + +What: /sys/kernel/irq//per_cpu_count +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: The number of times the interrupt has fired since boot. This + is a comma-separated list of counters; one per CPU in CPU id + order. NOTE: This file consistently shows counters for all + CPU ids. This differs from the behavior of /proc/interrupts + which only shows counters for online CPUs. + +What: /sys/kernel/irq//type +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: The type of the interrupt. Either the string 'level' or 'edge'. 
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index b51beebf9804..c9be57931b58 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -2,6 +2,7 @@ #define _LINUX_IRQDESC_H #include +#include /* * Core internal functions to deal with irq descriptors @@ -43,6 +44,7 @@ struct pt_regs; * @force_resume_depth: number of irqactions on a irq descriptor with * IRQF_FORCE_RESUME set * @rcu: rcu head for delayed free + * @kobj: kobject used to represent this struct in sysfs * @dir: /proc/irq/ procfs entry * @name: flow handler name for /proc/interrupts output */ @@ -88,6 +90,7 @@ struct irq_desc { #endif #ifdef CONFIG_SPARSE_IRQ struct rcu_head rcu; + struct kobject kobj; #endif int parent_irq; struct module *owner; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a623b44f2d4b..93b51727abaa 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "internals.h" @@ -123,6 +124,181 @@ static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); #ifdef CONFIG_SPARSE_IRQ +static void irq_kobj_release(struct kobject *kobj); + +#ifdef CONFIG_SYSFS +static struct kobject *irq_kobj_base; + +#define IRQ_ATTR_RO(_name) \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +static ssize_t per_cpu_count_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + int cpu, irq = desc->irq_data.irq; + ssize_t ret = 0; + char *p = ""; + + for_each_possible_cpu(cpu) { + unsigned int c = kstat_irqs_cpu(irq, cpu); + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); + p = ","; + } + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; +} +IRQ_ATTR_RO(per_cpu_count); + +static ssize_t chip_name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->irq_data.chip && desc->irq_data.chip->name) { + ret = scnprintf(buf, PAGE_SIZE, "%s\n", + desc->irq_data.chip->name); + } + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(chip_name); + +static ssize_t hwirq_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->irq_data.domain) + ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq); + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(hwirq); + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + ret = sprintf(buf, "%s\n", + irqd_is_level_type(&desc->irq_data) ? 
"level" : "edge"); + raw_spin_unlock_irq(&desc->lock); + + return ret; + +} +IRQ_ATTR_RO(type); + +static ssize_t name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->name) + ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(name); + +static ssize_t actions_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + struct irqaction *action; + ssize_t ret = 0; + char *p = ""; + + raw_spin_lock_irq(&desc->lock); + for (action = desc->action; action != NULL; action = action->next) { + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + p, action->name); + p = ","; + } + raw_spin_unlock_irq(&desc->lock); + + if (ret) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + + return ret; +} +IRQ_ATTR_RO(actions); + +static struct attribute *irq_attrs[] = { + &per_cpu_count_attr.attr, + &chip_name_attr.attr, + &hwirq_attr.attr, + &type_attr.attr, + &name_attr.attr, + &actions_attr.attr, + NULL +}; + +static struct kobj_type irq_kobj_type = { + .release = irq_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = irq_attrs, +}; + +static void irq_sysfs_add(int irq, struct irq_desc *desc) +{ + if (irq_kobj_base) { + /* + * Continue even in case of failure as this is nothing + * crucial. + */ + if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) + pr_warn("Failed to add kobject for irq %d\n", irq); + } +} + +static int __init irq_sysfs_init(void) +{ + struct irq_desc *desc; + int irq; + + /* Prevent concurrent irq alloc/free */ + irq_lock_sparse(); + + irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); + if (!irq_kobj_base) { + irq_unlock_sparse(); + return -ENOMEM; + } + + /* Add the already allocated interrupts */ + for_each_irq_desc(irq, desc) + irq_sysfs_add(irq, desc); + irq_unlock_sparse(); + + return 0; +} +postcore_initcall(irq_sysfs_init); + +#else /* !CONFIG_SYSFS */ + +static struct kobj_type irq_kobj_type = { + .release = irq_kobj_release, +}; + +static void irq_sysfs_add(int irq, struct irq_desc *desc) {} + +#endif /* CONFIG_SYSFS */ + static RADIX_TREE(irq_desc_tree, GFP_KERNEL); static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) @@ -187,6 +363,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); + kobject_init(&desc->kobj, &irq_kobj_type); return desc; @@ -197,15 +374,22 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, return NULL; } -static void delayed_free_desc(struct rcu_head *rhp) +static void irq_kobj_release(struct kobject *kobj) { - struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); free_masks(desc); free_percpu(desc->kstat_irqs); kfree(desc); } +static void delayed_free_desc(struct rcu_head *rhp) +{ + struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + + kobject_put(&desc->kobj); +} + static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -217,8 +401,12 @@ static void free_desc(unsigned int irq) * kstat_irq_usr(). Once we deleted the descriptor from the * sparse tree we can free it. Access in proc will fail to * lookup the descriptor. 
+ * + * The sysfs entry must be serialized against a concurrent + * irq_sysfs_init() as well. */ mutex_lock(&sparse_irq_lock); + kobject_del(&desc->kobj); delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); @@ -261,6 +449,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, goto err; mutex_lock(&sparse_irq_lock); irq_insert_desc(start + i, desc); + irq_sysfs_add(start + i, desc); mutex_unlock(&sparse_irq_lock); } return start; From cfeeed279dc2fa83a00fbe4856ebd231d56201ab Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 8 Sep 2016 16:49:20 -0500 Subject: [PATCH 311/538] x86/dumpstack: Allow preemption in show_stack_log_lvl() and dump_trace() show_stack_log_lvl() and dump_trace() are already preemption safe: - If they're running in irq or exception context, preemption is already disabled and the percpu stack pointers can be trusted. - If they're running with preemption enabled, they must be running on the task stack anyway, so it doesn't matter if they're comparing the stack pointer against a percpu stack pointer from this CPU or another one: either way it won't match. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0ca0b1044eca97d4f0ec7c1619cf80b3b65560d.1473371307.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 14 ++++++-------- arch/x86/kernel/dumpstack_64.c | 26 +++++++++----------------- 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c533b8b5a373..da5cd62f93ab 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -24,16 +24,16 @@ static void *is_irq_stack(void *p, void *irq) } -static void *is_hardirq_stack(unsigned long *stack, int cpu) +static void *is_hardirq_stack(unsigned long *stack) { - void *irq = per_cpu(hardirq_stack, cpu); + void *irq = this_cpu_read(hardirq_stack); return is_irq_stack(stack, irq); } -static void *is_softirq_stack(unsigned long *stack, int cpu) +static void *is_softirq_stack(unsigned long *stack) { - void *irq = per_cpu(softirq_stack, cpu); + void *irq = this_cpu_read(softirq_stack); return is_irq_stack(stack, irq); } @@ -42,7 +42,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { - const unsigned cpu = get_cpu(); int graph = 0; u32 *prev_esp; @@ -53,9 +52,9 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, for (;;) { void *end_stack; - end_stack = is_hardirq_stack(stack, cpu); + end_stack = is_hardirq_stack(stack); if (!end_stack) - end_stack = is_softirq_stack(stack, cpu); + end_stack = is_softirq_stack(stack); bp = ops->walk_stack(task, stack, bp, ops, data, end_stack, &graph); @@ -74,7 +73,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, break; touch_nmi_watchdog(); } - put_cpu(); } EXPORT_SYMBOL(dump_trace); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index b243352c779e..07373bec76f1 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -31,8 +31,8 @@ static char x86_stack_ids[][8] = { #endif }; -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, 
char **idp) +static unsigned long *in_exception_stack(unsigned long stack, unsigned *usedp, + char **idp) { unsigned k; @@ -41,7 +41,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * 'stack' is in one of them: */ for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = per_cpu(orig_ist, cpu).ist[k]; + unsigned long end = raw_cpu_ptr(&orig_ist)->ist[k]; /* * Is 'stack' above this exception frame's end? * If yes then skip to the next frame. @@ -111,7 +111,7 @@ enum stack_type { }; static enum stack_type -analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, +analyze_stack(struct task_struct *task, unsigned long *stack, unsigned long **stack_end, unsigned long *irq_stack, unsigned *used, char **id) { @@ -121,8 +121,7 @@ analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, if ((unsigned long)task_stack_page(task) == addr) return STACK_IS_NORMAL; - *stack_end = in_exception_stack(cpu, (unsigned long)stack, - used, id); + *stack_end = in_exception_stack((unsigned long)stack, used, id); if (*stack_end) return STACK_IS_EXCEPTION; @@ -149,8 +148,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { - const unsigned cpu = get_cpu(); - unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); + unsigned long *irq_stack = (unsigned long *)this_cpu_read(irq_stack_ptr); unsigned used = 0; int graph = 0; int done = 0; @@ -169,8 +167,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, enum stack_type stype; char *id; - stype = analyze_stack(cpu, task, stack, &stack_end, - irq_stack, &used, &id); + stype = analyze_stack(task, stack, &stack_end, irq_stack, &used, + &id); /* Default finish unless specified to continue */ done = 1; @@ -225,7 +223,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * This handles the process stack: */ bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph); - put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -236,13 +233,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *irq_stack_end; unsigned long *irq_stack; unsigned long *stack; - int cpu; int i; - preempt_disable(); - cpu = smp_processor_id(); - - irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); + irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); sp = sp ? : get_stack_pointer(task, regs); @@ -274,7 +267,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, stack++; touch_nmi_watchdog(); } - preempt_enable(); pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); From e38447ee1f487eaccdbef4a61dc064f4ae94e2fa Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 5 Sep 2016 16:33:03 +0300 Subject: [PATCH 312/538] x86/vdso: Unmap vdso blob on vvar mapping failure If remapping of vDSO blob failed on vvar mapping, we need to unmap previously mapped vDSO blob. 
Signed-off-by: Dmitry Safonov Acked-by: Andy Lutomirski Cc: 0x7f454c46@gmail.com Cc: oleg@redhat.com Cc: linux-mm@kvack.org Cc: gorcunov@openvz.org Cc: xemul@virtuozzo.com Link: http://lkml.kernel.org/r/20160905133308.28234-2-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/entry/vdso/vma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index f840766659a8..3bab6ba3ffc5 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -238,12 +238,14 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); - goto up_fail; + do_munmap(mm, text_start, image->size); } up_fail: - if (ret) + if (ret) { current->mm->context.vdso = NULL; + current->mm->context.vdso_image = NULL; + } up_write(&mm->mmap_sem); return ret; From 576ebfefd37bd41e965787f60684c8e4b7f79457 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 5 Sep 2016 16:33:04 +0300 Subject: [PATCH 313/538] x86/vdso: Replace calculate_addr in map_vdso() with addr That will allow to specify address where to map vDSO blob. For the randomized vDSO mappings introduce map_vdso_randomized() which will simplify calls to map_vdso. Signed-off-by: Dmitry Safonov Acked-by: Andy Lutomirski Cc: 0x7f454c46@gmail.com Cc: oleg@redhat.com Cc: linux-mm@kvack.org Cc: gorcunov@openvz.org Cc: xemul@virtuozzo.com Link: http://lkml.kernel.org/r/20160905133308.28234-3-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/entry/vdso/vma.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 3bab6ba3ffc5..5bcb25a9e573 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -176,11 +176,16 @@ static int vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; } -static int map_vdso(const struct vdso_image *image, bool calculate_addr) +/* + * Add vdso and vvar mappings to current process. 
+ * @image - blob to map + * @addr - request a specific address (zero to map at free addr) + */ +static int map_vdso(const struct vdso_image *image, unsigned long addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long addr, text_start; + unsigned long text_start; int ret = 0; static const struct vm_special_mapping vdso_mapping = { @@ -193,13 +198,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) .fault = vvar_fault, }; - if (calculate_addr) { - addr = vdso_addr(current->mm->start_stack, - image->size - image->sym_vvar_start); - } else { - addr = 0; - } - if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -251,13 +249,20 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) return ret; } +static int map_vdso_randomized(const struct vdso_image *image) +{ + unsigned long addr = vdso_addr(current->mm->start_stack, + image->size - image->sym_vvar_start); + return map_vdso(image, addr); +} + #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static int load_vdso32(void) { if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; - return map_vdso(&vdso_image_32, false); + return map_vdso(&vdso_image_32, 0); } #endif @@ -267,7 +272,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (!vdso64_enabled) return 0; - return map_vdso(&vdso_image_64, true); + return map_vdso_randomized(&vdso_image_64); } #ifdef CONFIG_COMPAT @@ -278,8 +283,7 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm, if (test_thread_flag(TIF_X32)) { if (!vdso64_enabled) return 0; - - return map_vdso(&vdso_image_x32, true); + return map_vdso_randomized(&vdso_image_x32); } #endif #ifdef CONFIG_IA32_EMULATION From 2eefd8789698e89c4a5d610921dc3c1b66e3bd0d Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 5 Sep 2016 16:33:05 +0300 Subject: [PATCH 314/538] x86/arch_prctl/vdso: Add ARCH_MAP_VDSO_* Add API to change vdso blob type with arch_prctl. As this is usefull only by needs of CRIU, expose this interface under CONFIG_CHECKPOINT_RESTORE. Signed-off-by: Dmitry Safonov Acked-by: Andy Lutomirski Cc: 0x7f454c46@gmail.com Cc: oleg@redhat.com Cc: linux-mm@kvack.org Cc: gorcunov@openvz.org Cc: xemul@virtuozzo.com Link: http://lkml.kernel.org/r/20160905133308.28234-4-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/entry/vdso/vma.c | 45 ++++++++++++++++++++++++------- arch/x86/include/asm/vdso.h | 2 ++ arch/x86/include/uapi/asm/prctl.h | 6 +++++ arch/x86/kernel/process_64.c | 25 +++++++++++++++++ include/linux/mm.h | 2 ++ mm/mmap.c | 8 ++++++ 6 files changed, 78 insertions(+), 10 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 5bcb25a9e573..4459e73e234d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -176,6 +176,16 @@ static int vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; } +static const struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + .fault = vdso_fault, + .mremap = vdso_mremap, +}; +static const struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .fault = vvar_fault, +}; + /* * Add vdso and vvar mappings to current process. 
* @image - blob to map @@ -188,16 +198,6 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) unsigned long text_start; int ret = 0; - static const struct vm_special_mapping vdso_mapping = { - .name = "[vdso]", - .fault = vdso_fault, - .mremap = vdso_mremap, - }; - static const struct vm_special_mapping vvar_mapping = { - .name = "[vvar]", - .fault = vvar_fault, - }; - if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -256,6 +256,31 @@ static int map_vdso_randomized(const struct vdso_image *image) return map_vdso(image, addr); } +int map_vdso_once(const struct vdso_image *image, unsigned long addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + /* + * Check if we have already mapped vdso blob - fail to prevent + * abusing from userspace install_speciall_mapping, which may + * not do accounting and rlimit right. + * We could search vma near context.vdso, but it's a slowpath, + * so let's explicitely check all VMAs to be completely sure. + */ + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma_is_special_mapping(vma, &vdso_mapping) || + vma_is_special_mapping(vma, &vvar_mapping)) { + up_write(&mm->mmap_sem); + return -EEXIST; + } + } + up_write(&mm->mmap_sem); + + return map_vdso(image, addr); +} + #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static int load_vdso32(void) { diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 43dc55be524e..2444189cbe28 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -41,6 +41,8 @@ extern const struct vdso_image vdso_image_32; extern void __init init_vdso_image(const struct vdso_image *image); +extern int map_vdso_once(const struct vdso_image *image, unsigned long addr); + #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_VDSO_H */ diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 3ac5032fae09..ae135de547f5 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -6,4 +6,10 @@ #define ARCH_GET_FS 0x1003 #define ARCH_GET_GS 0x1004 +#ifdef CONFIG_CHECKPOINT_RESTORE +# define ARCH_MAP_VDSO_X32 0x2001 +# define ARCH_MAP_VDSO_32 0x2002 +# define ARCH_MAP_VDSO_64 0x2003 +#endif + #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 63236d8f84bf..f240a465920b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -49,6 +49,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -524,6 +525,17 @@ void set_personality_ia32(bool x32) } EXPORT_SYMBOL_GPL(set_personality_ia32); +static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) +{ + int ret; + + ret = map_vdso_once(image, addr); + if (ret) + return ret; + + return (long)image->size; +} + long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) { int ret = 0; @@ -577,6 +589,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) break; } +#ifdef CONFIG_CHECKPOINT_RESTORE +#ifdef CONFIG_X86_X32 + case ARCH_MAP_VDSO_X32: + return prctl_map_vdso(&vdso_image_x32, addr); +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + case ARCH_MAP_VDSO_32: + return prctl_map_vdso(&vdso_image_32, addr); +#endif + case ARCH_MAP_VDSO_64: + return prctl_map_vdso(&vdso_image_64, addr); +#endif + default: ret = -EINVAL; break; diff --git a/include/linux/mm.h b/include/linux/mm.h index ef815b9cd426..5f14534f0c90 
100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2019,6 +2019,8 @@ extern struct file *get_task_exe_file(struct task_struct *task); extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); +extern bool vma_is_special_mapping(const struct vm_area_struct *vma, + const struct vm_special_mapping *sm); extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, diff --git a/mm/mmap.c b/mm/mmap.c index ca9d91bca0d6..6373ebd358c0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3063,6 +3063,14 @@ static struct vm_area_struct *__install_special_mapping( return ERR_PTR(ret); } +bool vma_is_special_mapping(const struct vm_area_struct *vma, + const struct vm_special_mapping *sm) +{ + return vma->vm_private_data == sm && + (vma->vm_ops == &special_mapping_vmops || + vma->vm_ops == &legacy_special_mapping_vmops); +} + /* * Called with mm->mmap_sem held for writing. * Insert a new vma covering the given region, with the given flags. From 90954e7b940778478754452f1ec8b23ea9a9ad42 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 5 Sep 2016 16:33:06 +0300 Subject: [PATCH 315/538] x86/coredump: Use pr_reg size, rather that TIF_IA32 flag Killed PR_REG_SIZE and PR_REG_PTR macro as we can get regset size from regset view. I wish I could also kill PRSTATUS_SIZE nicely. Suggested-by: Oleg Nesterov Signed-off-by: Dmitry Safonov Cc: 0x7f454c46@gmail.com Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: gorcunov@openvz.org Cc: xemul@virtuozzo.com Link: http://lkml.kernel.org/r/20160905133308.28234-5-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/compat.h | 8 ++++---- fs/binfmt_elf.c | 23 ++++++++--------------- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index a18806165fe4..03d269bed941 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -275,10 +275,10 @@ struct compat_shmid64_ds { #ifdef CONFIG_X86_X32_ABI typedef struct user_regs_struct compat_elf_gregset_t; -#define PR_REG_SIZE(S) (test_thread_flag(TIF_IA32) ? 68 : 216) -#define PRSTATUS_SIZE(S) (test_thread_flag(TIF_IA32) ? 144 : 296) -#define SET_PR_FPVALID(S,V) \ - do { *(int *) (((void *) &((S)->pr_reg)) + PR_REG_SIZE(0)) = (V); } \ +/* Full regset -- prstatus on x32, otherwise on ia32 */ +#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 
144 : 296) +#define SET_PR_FPVALID(S, V, R) \ + do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \ while (0) #define COMPAT_USE_64BIT_TIME \ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e5495f37c6ed..2472af2798c7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1624,20 +1624,12 @@ static void do_thread_regset_writeback(struct task_struct *task, regset->writeback(task, regset, 1); } -#ifndef PR_REG_SIZE -#define PR_REG_SIZE(S) sizeof(S) -#endif - #ifndef PRSTATUS_SIZE -#define PRSTATUS_SIZE(S) sizeof(S) -#endif - -#ifndef PR_REG_PTR -#define PR_REG_PTR(S) (&((S)->pr_reg)) +#define PRSTATUS_SIZE(S, R) sizeof(S) #endif #ifndef SET_PR_FPVALID -#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V)) +#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V)) #endif static int fill_thread_core_info(struct elf_thread_core_info *t, @@ -1645,6 +1637,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; + unsigned int regset_size = view->regsets[0].n * view->regsets[0].size; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1653,12 +1646,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - (void) view->regsets[0].get(t->task, &view->regsets[0], - 0, PR_REG_SIZE(t->prstatus.pr_reg), - PR_REG_PTR(&t->prstatus), NULL); + (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size, + &t->prstatus.pr_reg, NULL); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - PRSTATUS_SIZE(t->prstatus), &t->prstatus); + PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1688,7 +1680,8 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset->core_note_type, size, data); else { - SET_PR_FPVALID(&t->prstatus, 1); + SET_PR_FPVALID(&t->prstatus, + 1, regset_size); fill_note(&t->notes[i], "CORE", NT_PRFPREG, size, data); } From cc87324b3dbb9bdf6916c7f479230db24c4aa309 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 5 Sep 2016 16:33:07 +0300 Subject: [PATCH 316/538] x86/ptrace: Down with test_thread_flag(TIF_IA32) As the task isn't executing at the moment of {GET,SET}REGS, return the regset that corresponds to the code selector, rather than the value of the TIF_IA32 flag. I.e. if we ptrace an i386 ELF binary that has just changed its code selector to __USER_CS, then GETREGS will return the full x86_64 register set. Note that this will work only if the application has changed its CS. If the application does a 32-bit syscall with __USER_CS, ptrace will still return the 64-bit register set, which might still be confusing for tools that expect TS_COMPAT to be exposed [1, 2]. So this change should make PTRACE_GETREGSET more reliable and is another step towards dropping the TIF_{IA32,X32} flags.
[1]: https://sourceforge.net/p/strace/mailman/message/30471411/ [2]: https://lkml.org/lkml/2012/1/18/320 Signed-off-by: Dmitry Safonov Cc: 0x7f454c46@gmail.com Cc: oleg@redhat.com Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: Pedro Alves Cc: gorcunov@openvz.org Cc: xemul@virtuozzo.com Link: http://lkml.kernel.org/r/20160905133308.28234-6-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f79576a541ff..ad0bab8fc594 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1358,7 +1358,7 @@ void update_regset_xstate_info(unsigned int size, u64 xstate_mask) const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) + if (!user_64bit_mode(task_pt_regs(task))) #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION return &user_x86_32_view; From 6846351052e685c2d1428e80ead2d7ca3d7ed913 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 5 Sep 2016 16:33:08 +0300 Subject: [PATCH 317/538] x86/signal: Add SA_{X32,IA32}_ABI sa_flags Introduce new flags that define which ABI to use when creating the sigframe. The kernel sets these flags according to the ABI of the sigaction syscall that installed the handler for the signal being delivered, which drops the dependency on the TIF_IA32/TIF_X32 flags at signal delivery. The flags are used only under CONFIG_COMPAT. ARM uses sa_flags in a similar way to pick the delivery mode for 26-bit applications (see SA_THIRTYTWO). Signed-off-by: Dmitry Safonov Reviewed-by: Andy Lutomirski Cc: 0x7f454c46@gmail.com Cc: oleg@redhat.com Cc: linux-mm@kvack.org Cc: gorcunov@openvz.org Cc: xemul@virtuozzo.com Link: http://lkml.kernel.org/r/20160905133308.28234-7-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu/signal.h | 6 ++++++ arch/x86/include/asm/signal.h | 4 ++++ arch/x86/kernel/signal.c | 20 ++++++++++-------- arch/x86/kernel/signal_compat.c | 34 ++++++++++++++++++++++++++++--- kernel/signal.c | 7 +++++++ 6 files changed, 60 insertions(+), 13 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 2f29f4e407c3..cb13c0564ea7 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -378,7 +378,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); } put_user_catch(err); - err |= copy_siginfo_to_user32(&frame->info, &ksig->info); + err |= __copy_siginfo_to_user32(&frame->info, &ksig->info, false); err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 0e970d00dfcd..20a1fbf7fe4e 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -19,6 +19,12 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, # define ia32_setup_rt_frame __setup_rt_frame #endif +#ifdef CONFIG_COMPAT +int __copy_siginfo_to_user32(compat_siginfo_t __user *to, + const siginfo_t *from, bool x32_ABI); +#endif + + extern void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk); extern void convert_to_fxsr(struct task_struct *tsk, diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index
dd1e7d6387ab..8af22be0fe61 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -23,6 +23,10 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +/* non-uapi in-kernel SA_FLAGS for those indicates ABI for a signal frame */ +#define SA_IA32_ABI 0x02000000u +#define SA_X32_ABI 0x01000000u + #ifndef CONFIG_COMPAT typedef sigset_t compat_sigset_t; #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 04cb3212db2d..b1a5d252d482 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -42,6 +42,7 @@ #include #include +#include #define COPY(x) do { \ get_user_ex(regs->x, &sc->x); \ @@ -547,7 +548,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, return -EFAULT; if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user32(&frame->info, &ksig->info)) + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) return -EFAULT; } @@ -660,20 +661,21 @@ asmlinkage long sys_rt_sigreturn(void) return 0; } -static inline int is_ia32_compat_frame(void) +static inline int is_ia32_compat_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_IA32_EMULATION) && - test_thread_flag(TIF_IA32); + ksig->ka.sa.sa_flags & SA_IA32_ABI; } -static inline int is_ia32_frame(void) +static inline int is_ia32_frame(struct ksignal *ksig) { - return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(); + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); } -static inline int is_x32_frame(void) +static inline int is_x32_frame(struct ksignal *ksig) { - return IS_ENABLED(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); + return IS_ENABLED(CONFIG_X86_X32_ABI) && + ksig->ka.sa.sa_flags & SA_X32_ABI; } static int @@ -684,12 +686,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) compat_sigset_t *cset = (compat_sigset_t *) set; /* Set up the stack frame */ - if (is_ia32_frame()) { + if (is_ia32_frame(ksig)) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) return ia32_setup_rt_frame(usig, ksig, cset, regs); else return ia32_setup_frame(usig, ksig, cset, regs); - } else if (is_x32_frame()) { + } else if (is_x32_frame(ksig)) { return x32_setup_rt_frame(ksig, cset, regs); } else { return __setup_rt_frame(ksig->sig, ksig, set, regs); diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index b44564bf86a8..40df33753bae 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -1,5 +1,6 @@ #include #include +#include /* * The compat_siginfo_t structure and handing code is very easy @@ -92,10 +93,31 @@ static inline void signal_compat_build_tests(void) /* any new si_fields should be added here */ } -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) +void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) +{ + /* Don't leak in-kernel non-uapi flags to user-space */ + if (oact) + oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); + + if (!act) + return; + + /* Don't let flags to be set from userspace */ + act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); + + if (user_64bit_mode(current_pt_regs())) + return; + + if (in_ia32_syscall()) + act->sa.sa_flags |= SA_IA32_ABI; + if (in_x32_syscall()) + act->sa.sa_flags |= SA_X32_ABI; +} + +int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, + bool x32_ABI) { int err = 0; - bool ia32 = test_thread_flag(TIF_IA32); signal_compat_build_tests(); @@ -146,7 +168,7 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) 
put_user_ex(from->si_arch, &to->si_arch); break; case __SI_CHLD >> 16: - if (ia32) { + if (!x32_ABI) { put_user_ex(from->si_utime, &to->si_utime); put_user_ex(from->si_stime, &to->si_stime); } else { @@ -180,6 +202,12 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) return err; } +/* from syscall's path, where we know the ABI */ +int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) +{ + return __copy_siginfo_to_user32(to, from, in_x32_syscall()); +} + int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) { int err = 0; diff --git a/kernel/signal.c b/kernel/signal.c index af21afc00d08..75761acc77cf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3044,6 +3044,11 @@ void kernel_sigaction(int sig, __sighandler_t action) } EXPORT_SYMBOL(kernel_sigaction); +void __weak sigaction_compat_abi(struct k_sigaction *act, + struct k_sigaction *oact) +{ +} + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *p = current, *t; @@ -3059,6 +3064,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (oact) *oact = *k; + sigaction_compat_abi(act, oact); + if (act) { sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); From 12adfd882c5f37548acaba4f043a158b3c54468b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 23 Jul 2016 19:27:50 +0100 Subject: [PATCH 318/538] list: Expand list_first_entry_or_null() Due to the use of READ_ONCE() in list_empty() the compiler cannot optimise !list_empty() ? list_first_entry() : NULL very well. By manually expanding list_first_entry_or_null() we can take advantage of the READ_ONCE() to avoid the list element changing under the test while the compiler can generate smaller code. Signed-off-by: Chris Wilson Cc: "Paul E. McKenney" Cc: Andrew Morton Cc: Dan Williams Cc: Jan Kara Cc: Josef Bacik Cc: linux-kernel@vger.kernel.org Signed-off-by: Paul E. McKenney --- include/linux/list.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 5183138aa932..5809e9a2de5b 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -381,8 +381,11 @@ static inline void list_splice_tail_init(struct list_head *list, * * Note that if the list is empty, it returns NULL. */ -#define list_first_entry_or_null(ptr, type, member) \ - (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) +#define list_first_entry_or_null(ptr, type, member) ({ \ + struct list_head *head__ = (ptr); \ + struct list_head *pos__ = READ_ONCE(head__->next); \ + pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ +}) /** * list_next_entry - get the next element in list From 28f4b04143c56135b1ca742fc64b664ed04de6a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:47 +0200 Subject: [PATCH 319/538] genirq/msi: Add cpumask allocation to alloc_msi_entry For irq spreading want to store affinity masks in the msi_entry. Add the infrastructure for it. We allocate an array of cpumasks with an array size of the number of used vectors in the entry, so we can hand in the information per linux interrupt later. As we hand in the number of used vectors, we assign them right away. Convert all the call sites. 
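For illustration only (not part of this patch): a minimal sketch of the new calling convention, modelled on the converted call sites in the diff below. The function name example_alloc_descs() is hypothetical.

#include <linux/msi.h>

/* Hypothetical caller -- mirrors the converted call sites in this patch. */
static int example_alloc_descs(struct device *dev, int nvec,
			       const struct cpumask *masks)
{
	struct msi_desc *desc;

	/*
	 * nvec_used is now filled in by alloc_msi_entry(); when @masks is
	 * non-NULL it must point to an array of @nvec cpumasks, which are
	 * copied into desc->affinity.
	 */
	desc = alloc_msi_entry(dev, nvec, masks);
	if (!desc)
		return -ENOMEM;

	list_add_tail(&desc->list, dev_to_msi_list(dev));
	return 0;
}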
Signed-off-by: Thomas Gleixner Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/1473862739-15032-2-git-send-email-hch@lst.de --- drivers/base/platform-msi.c | 3 +-- drivers/pci/msi.c | 6 ++---- drivers/staging/fsl-mc/bus/mc-msi.c | 3 +-- include/linux/msi.h | 5 +++-- kernel/irq/msi.c | 26 ++++++++++++++++++++++++-- 5 files changed, 31 insertions(+), 12 deletions(-) diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index 279e53989374..be6a599bc0c1 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -142,13 +142,12 @@ static int platform_msi_alloc_descs_with_irq(struct device *dev, int virq, } for (i = 0; i < nvec; i++) { - desc = alloc_msi_entry(dev); + desc = alloc_msi_entry(dev, 1, NULL); if (!desc) break; desc->platform.msi_priv_data = data; desc->platform.msi_index = base + i; - desc->nvec_used = 1; desc->irq = virq ? virq + i : 0; list_add_tail(&desc->list, dev_to_msi_list(dev)); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 98f12223c734..0db72ba24003 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -555,7 +555,7 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec) struct msi_desc *entry; /* MSI Entry Initialization */ - entry = alloc_msi_entry(&dev->dev); + entry = alloc_msi_entry(&dev->dev, nvec, NULL); if (!entry) return NULL; @@ -568,7 +568,6 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec) entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1; entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec)); - entry->nvec_used = nvec; entry->affinity = dev->irq_affinity; if (control & PCI_MSI_FLAGS_64BIT) @@ -693,7 +692,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, mask = cpumask_of(cpu); } - entry = alloc_msi_entry(&dev->dev); + entry = alloc_msi_entry(&dev->dev, 1, NULL); if (!entry) { if (!i) iounmap(base); @@ -711,7 +710,6 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.entry_nr = i; entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; - entry->nvec_used = 1; entry->affinity = mask; list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); diff --git a/drivers/staging/fsl-mc/bus/mc-msi.c b/drivers/staging/fsl-mc/bus/mc-msi.c index c7be156ae5e0..4fd8e41ef468 100644 --- a/drivers/staging/fsl-mc/bus/mc-msi.c +++ b/drivers/staging/fsl-mc/bus/mc-msi.c @@ -213,7 +213,7 @@ static int fsl_mc_msi_alloc_descs(struct device *dev, unsigned int irq_count) struct msi_desc *msi_desc; for (i = 0; i < irq_count; i++) { - msi_desc = alloc_msi_entry(dev); + msi_desc = alloc_msi_entry(dev, 1, NULL); if (!msi_desc) { dev_err(dev, "Failed to allocate msi entry\n"); error = -ENOMEM; @@ -221,7 +221,6 @@ static int fsl_mc_msi_alloc_descs(struct device *dev, unsigned int irq_count) } msi_desc->fsl_mc.msi_index = i; - msi_desc->nvec_used = 1; INIT_LIST_HEAD(&msi_desc->list); list_add_tail(&msi_desc->list, dev_to_msi_list(dev)); } diff --git a/include/linux/msi.h b/include/linux/msi.h index e8c81fbd5f9c..0db320b7bb15 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -68,7 +68,7 @@ struct msi_desc { unsigned int nvec_used; struct device *dev; struct msi_msg msg; - const struct cpumask *affinity; + struct cpumask *affinity; union { /* PCI MSI/X specific data */ @@ -123,7 +123,8 @@ static inline void 
*msi_desc_to_pci_sysdata(struct msi_desc *desc) } #endif /* CONFIG_PCI_MSI */ -struct msi_desc *alloc_msi_entry(struct device *dev); +struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, + const struct cpumask *affinity); void free_msi_entry(struct msi_desc *entry); void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 19e9dfbe97fa..8a3e872798f3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -18,20 +18,42 @@ /* Temparory solution for building, will be removed later */ #include -struct msi_desc *alloc_msi_entry(struct device *dev) +/** + * alloc_msi_entry - Allocate an initialize msi_entry + * @dev: Pointer to the device for which this is allocated + * @nvec: The number of vectors used in this entry + * @affinity: Optional pointer to an affinity mask array size of @nvec + * + * If @affinity is not NULL then a an affinity array[@nvec] is allocated + * and the affinity masks from @affinity are copied. + */ +struct msi_desc * +alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity) { - struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); + struct msi_desc *desc; + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) return NULL; INIT_LIST_HEAD(&desc->list); desc->dev = dev; + desc->nvec_used = nvec; + if (affinity) { + desc->affinity = kmemdup(affinity, + nvec * sizeof(*desc->affinity), GFP_KERNEL); + if (!desc->affinity) { + kfree(desc); + return NULL; + } + } return desc; } void free_msi_entry(struct msi_desc *entry) { + kfree(entry->affinity); kfree(entry); } From 34c3d9819fda464be4f1bec59b63353814f76c73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:48 +0200 Subject: [PATCH 320/538] genirq/affinity: Provide smarter irq spreading infrastructure The current irq spreading infrastructure is just looking at a cpumask and tries to spread the interrupts over the mask. Thats suboptimal as it does not take numa nodes into account. Change the logic so the interrupts are spread across numa nodes and inside the nodes. If there are more cpus than vectors per node, then we set the affinity to several cpus. If HT siblings are available we take that into account and try to set all siblings to a single vector. 
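As an aside (not part of the patch), a minimal sketch of how a caller might combine the two new helpers introduced below; example_spread() and the NULL affinity argument (meaning "use cpu_online_mask") are illustrative assumptions.

#include <linux/interrupt.h>
#include <linux/slab.h>

/* Hypothetical consumer of the new spreading helpers. */
static struct cpumask *example_spread(int want_vecs, int *got_vecs)
{
	struct cpumask *masks;

	/* Clamp the request to what the (online) CPUs can usefully absorb. */
	*got_vecs = irq_calc_affinity_vectors(NULL, want_vecs);

	/* One NUMA/sibling-aware mask per vector; the caller kfree()s the array. */
	masks = irq_create_affinity_masks(NULL, *got_vecs);
	return masks;
}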
Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Link: http://lkml.kernel.org/r/1473862739-15032-3-git-send-email-hch@lst.de --- include/linux/interrupt.h | 15 ++++ kernel/irq/affinity.c | 149 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index b6683f0ffc9f..4e59d122cad9 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -279,6 +279,8 @@ extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs); +struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, int nvec); +int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec); #else /* CONFIG_SMP */ @@ -316,6 +318,19 @@ static inline struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) *nr_vecs = 1; return NULL; } + +static inline struct cpumask * +irq_create_affinity_masks(const struct cpumask *affinity, int nvec) +{ + return NULL; +} + +static inline int +irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) +{ + return maxvec; +} + #endif /* CONFIG_SMP */ /* diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 32f6cfcff212..7812fecc6e2f 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -4,6 +4,155 @@ #include #include +static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, + int cpus_per_vec) +{ + const struct cpumask *siblmsk; + int cpu, sibl; + + for ( ; cpus_per_vec > 0; ) { + cpu = cpumask_first(nmsk); + + /* Should not happen, but I'm too lazy to think about it */ + if (cpu >= nr_cpu_ids) + return; + + cpumask_clear_cpu(cpu, nmsk); + cpumask_set_cpu(cpu, irqmsk); + cpus_per_vec--; + + /* If the cpu has siblings, use them first */ + siblmsk = topology_sibling_cpumask(cpu); + for (sibl = -1; cpus_per_vec > 0; ) { + sibl = cpumask_next(sibl, siblmsk); + if (sibl >= nr_cpu_ids) + break; + if (!cpumask_test_and_clear_cpu(sibl, nmsk)) + continue; + cpumask_set_cpu(sibl, irqmsk); + cpus_per_vec--; + } + } +} + +static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) +{ + int n, nodes; + + /* Calculate the number of nodes in the supplied affinity mask */ + for (n = 0, nodes = 0; n < num_online_nodes(); n++) { + if (cpumask_intersects(mask, cpumask_of_node(n))) { + node_set(n, *nodemsk); + nodes++; + } + } + return nodes; +} + +/** + * irq_create_affinity_masks - Create affinity masks for multiqueue spreading + * @affinity: The affinity mask to spread. If NULL cpu_online_mask + * is used + * @nvecs: The number of vectors + * + * Returns the masks pointer or NULL if allocation failed. 
+ */ +struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, + int nvec) +{ + int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0; + nodemask_t nodemsk = NODE_MASK_NONE; + struct cpumask *masks; + cpumask_var_t nmsk; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL); + if (!masks) + goto out; + + /* Stabilize the cpumasks */ + get_online_cpus(); + /* If the supplied affinity mask is NULL, use cpu online mask */ + if (!affinity) + affinity = cpu_online_mask; + + nodes = get_nodes_in_cpumask(affinity, &nodemsk); + + /* + * If the number of nodes in the mask is less than or equal the + * number of vectors we just spread the vectors across the nodes. + */ + if (nvec <= nodes) { + for_each_node_mask(n, nodemsk) { + cpumask_copy(masks + curvec, cpumask_of_node(n)); + if (++curvec == nvec) + break; + } + goto outonl; + } + + /* Spread the vectors per node */ + vecs_per_node = nvec / nodes; + /* Account for rounding errors */ + extra_vecs = nvec - (nodes * vecs_per_node); + + for_each_node_mask(n, nodemsk) { + int ncpus, v, vecs_to_assign = vecs_per_node; + + /* Get the cpus on this node which are in the mask */ + cpumask_and(nmsk, affinity, cpumask_of_node(n)); + + /* Calculate the number of cpus per vector */ + ncpus = cpumask_weight(nmsk); + + for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) { + cpus_per_vec = ncpus / vecs_to_assign; + + /* Account for extra vectors to compensate rounding errors */ + if (extra_vecs) { + cpus_per_vec++; + if (!--extra_vecs) + vecs_per_node++; + } + irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); + } + + if (curvec >= nvec) + break; + } + +outonl: + put_online_cpus(); +out: + free_cpumask_var(nmsk); + return masks; +} + +/** + * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask + * @affinity: The affinity mask to spread. If NULL cpu_online_mask + * is used + * @maxvec: The maximum number of vectors available + */ +int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) +{ + int cpus, ret; + + /* Stabilize the cpumasks */ + get_online_cpus(); + /* If the supplied affinity mask is NULL, use cpu online mask */ + if (!affinity) + affinity = cpu_online_mask; + + cpus = cpumask_weight(affinity); + ret = (cpus < maxvec) ? cpus : maxvec; + + put_online_cpus(); + return ret; +} + static int get_first_sibling(unsigned int cpu) { unsigned int ret; From e75eafb9b0395c338230b0eef2cc92ca8d20dee2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:49 +0200 Subject: [PATCH 321/538] genirq/msi: Switch to new irq spreading infrastructure Switch MSI over to the new spreading code. If a pci device contains a valid pointer to a cpumask, then this mask is used for spreading otherwise the online cpu mask is used. This allows a driver to restrict the spread to a subset of CPUs, e.g. cpus on a particular node. 
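Purely as an illustration (not from this series), a driver-side sketch of the behaviour described above. The function name, the vector counts, the direct irq_affinity assignment and the PCI_IRQ_MSIX flag are assumptions about the then-current pci_alloc_irq_vectors() API, not something added by this patch.

#include <linux/pci.h>
#include <linux/topology.h>

/* Hypothetical driver: restrict the spread to one NUMA node's CPUs. */
static int example_enable_irqs(struct pci_dev *pdev, int node)
{
	int nvec;

	/* If set, this mask seeds the spreading instead of cpu_online_mask. */
	pdev->irq_affinity = cpumask_of_node(node);

	nvec = pci_alloc_irq_vectors(pdev, 1, 32,
				     PCI_IRQ_MSIX | PCI_IRQ_AFFINITY);
	return nvec < 0 ? nvec : 0;
}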
Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Link: http://lkml.kernel.org/r/1473862739-15032-4-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- drivers/pci/msi.c | 128 ++++++++++++++++++++++++------------------- kernel/irq/irqdesc.c | 31 +++++------ 2 files changed, 87 insertions(+), 72 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 0db72ba24003..06100dde0e86 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -549,15 +549,23 @@ static int populate_msi_sysfs(struct pci_dev *pdev) return ret; } -static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec) +static struct msi_desc * +msi_setup_entry(struct pci_dev *dev, int nvec, bool affinity) { - u16 control; + struct cpumask *masks = NULL; struct msi_desc *entry; + u16 control; + + if (affinity) { + masks = irq_create_affinity_masks(dev->irq_affinity, nvec); + if (!masks) + pr_err("Unable to allocate affinity masks, ignoring\n"); + } /* MSI Entry Initialization */ - entry = alloc_msi_entry(&dev->dev, nvec, NULL); + entry = alloc_msi_entry(&dev->dev, nvec, masks); if (!entry) - return NULL; + goto out; pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control); @@ -568,7 +576,6 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec) entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1; entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec)); - entry->affinity = dev->irq_affinity; if (control & PCI_MSI_FLAGS_64BIT) entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64; @@ -579,6 +586,8 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec) if (entry->msi_attrib.maskbit) pci_read_config_dword(dev, entry->mask_pos, &entry->masked); +out: + kfree(masks); return entry; } @@ -607,7 +616,7 @@ static int msi_verify_entries(struct pci_dev *dev) * an error, and a positive return value indicates the number of interrupts * which could have been allocated. */ -static int msi_capability_init(struct pci_dev *dev, int nvec) +static int msi_capability_init(struct pci_dev *dev, int nvec, bool affinity) { struct msi_desc *entry; int ret; @@ -615,7 +624,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec) pci_msi_set_enable(dev, 0); /* Disable MSI during set up */ - entry = msi_setup_entry(dev, nvec); + entry = msi_setup_entry(dev, nvec, affinity); if (!entry) return -ENOMEM; @@ -678,28 +687,29 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) } static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, - struct msix_entry *entries, int nvec) + struct msix_entry *entries, int nvec, + bool affinity) { - const struct cpumask *mask = NULL; + struct cpumask *curmsk, *masks = NULL; struct msi_desc *entry; - int cpu = -1, i; - - for (i = 0; i < nvec; i++) { - if (dev->irq_affinity) { - cpu = cpumask_next(cpu, dev->irq_affinity); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(dev->irq_affinity); - mask = cpumask_of(cpu); - } + int ret, i; - entry = alloc_msi_entry(&dev->dev, 1, NULL); + if (affinity) { + masks = irq_create_affinity_masks(dev->irq_affinity, nvec); + if (!masks) + pr_err("Unable to allocate affinity masks, ignoring\n"); + } + + for (i = 0, curmsk = masks; i < nvec; i++) { + entry = alloc_msi_entry(&dev->dev, 1, curmsk); if (!entry) { if (!i) iounmap(base); else free_msi_irqs(dev); /* No enough memory. 
Don't try again */ - return -ENOMEM; + ret = -ENOMEM; + goto out; } entry->msi_attrib.is_msix = 1; @@ -710,11 +720,14 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.entry_nr = i; entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; - entry->affinity = mask; list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); + if (masks) + curmsk++; } - + ret = 0; +out: + kfree(masks); return 0; } @@ -743,8 +756,8 @@ static void msix_program_entries(struct pci_dev *dev, * single MSI-X irq. A return of zero indicates the successful setup of * requested MSI-X entries with allocated irqs or non-zero for otherwise. **/ -static int msix_capability_init(struct pci_dev *dev, - struct msix_entry *entries, int nvec) +static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, + int nvec, bool affinity) { int ret; u16 control; @@ -759,7 +772,7 @@ static int msix_capability_init(struct pci_dev *dev, if (!base) return -ENOMEM; - ret = msix_setup_entries(dev, base, entries, nvec); + ret = msix_setup_entries(dev, base, entries, nvec, affinity); if (ret) return ret; @@ -939,22 +952,8 @@ int pci_msix_vec_count(struct pci_dev *dev) } EXPORT_SYMBOL(pci_msix_vec_count); -/** - * pci_enable_msix - configure device's MSI-X capability structure - * @dev: pointer to the pci_dev data structure of MSI-X device function - * @entries: pointer to an array of MSI-X entries (optional) - * @nvec: number of MSI-X irqs requested for allocation by device driver - * - * Setup the MSI-X capability structure of device function with the number - * of requested irqs upon its software driver call to request for - * MSI-X mode enabled on its hardware device function. A return of zero - * indicates the successful configuration of MSI-X capability structure - * with new allocated MSI-X irqs. A return of < 0 indicates a failure. - * Or a return of > 0 indicates that driver request is exceeding the number - * of irqs or MSI-X vectors available. Driver should use the returned value to - * re-send its request. - **/ -int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) +static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, + int nvec, bool affinity) { int nr_entries; int i, j; @@ -986,7 +985,27 @@ int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) dev_info(&dev->dev, "can't enable MSI-X (MSI IRQ already assigned)\n"); return -EINVAL; } - return msix_capability_init(dev, entries, nvec); + return msix_capability_init(dev, entries, nvec, affinity); +} + +/** + * pci_enable_msix - configure device's MSI-X capability structure + * @dev: pointer to the pci_dev data structure of MSI-X device function + * @entries: pointer to an array of MSI-X entries (optional) + * @nvec: number of MSI-X irqs requested for allocation by device driver + * + * Setup the MSI-X capability structure of device function with the number + * of requested irqs upon its software driver call to request for + * MSI-X mode enabled on its hardware device function. A return of zero + * indicates the successful configuration of MSI-X capability structure + * with new allocated MSI-X irqs. A return of < 0 indicates a failure. + * Or a return of > 0 indicates that driver request is exceeding the number + * of irqs or MSI-X vectors available. Driver should use the returned value to + * re-send its request. 
+ **/ +int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) +{ + return __pci_enable_msix(dev, entries, nvec, false); } EXPORT_SYMBOL(pci_enable_msix); @@ -1039,6 +1058,7 @@ EXPORT_SYMBOL(pci_msi_enabled); static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, unsigned int flags) { + bool affinity = flags & PCI_IRQ_AFFINITY; int nvec; int rc; @@ -1067,19 +1087,17 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, nvec = maxvec; for (;;) { - if (flags & PCI_IRQ_AFFINITY) { - dev->irq_affinity = irq_create_affinity_mask(&nvec); + if (affinity) { + nvec = irq_calc_affinity_vectors(dev->irq_affinity, + nvec); if (nvec < minvec) return -ENOSPC; } - rc = msi_capability_init(dev, nvec); + rc = msi_capability_init(dev, nvec, affinity); if (rc == 0) return nvec; - kfree(dev->irq_affinity); - dev->irq_affinity = NULL; - if (rc < 0) return rc; if (rc < minvec) @@ -1111,26 +1129,24 @@ static int __pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, int maxvec, unsigned int flags) { - int nvec = maxvec; - int rc; + bool affinity = flags & PCI_IRQ_AFFINITY; + int rc, nvec = maxvec; if (maxvec < minvec) return -ERANGE; for (;;) { - if (flags & PCI_IRQ_AFFINITY) { - dev->irq_affinity = irq_create_affinity_mask(&nvec); + if (affinity) { + nvec = irq_calc_affinity_vectors(dev->irq_affinity, + nvec); if (nvec < minvec) return -ENOSPC; } - rc = pci_enable_msix(dev, entries, nvec); + rc = __pci_enable_msix(dev, entries, nvec, affinity); if (rc == 0) return nvec; - kfree(dev->irq_affinity); - dev->irq_affinity = NULL; - if (rc < 0) return rc; if (rc < minvec) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a623b44f2d4b..5a5a685aba33 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -236,25 +236,24 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct cpumask *mask = NULL; struct irq_desc *desc; unsigned int flags; - int i, cpu = -1; + int i; - if (affinity && cpumask_empty(affinity)) - return -EINVAL; + /* Validate affinity mask(s) */ + if (affinity) { + for (i = 0, mask = affinity; i < cnt; i++, mask++) { + if (cpumask_empty(mask)) + return -EINVAL; + } + } flags = affinity ? IRQD_AFFINITY_MANAGED : 0; + mask = NULL; for (i = 0; i < cnt; i++) { if (affinity) { - cpu = cpumask_next(cpu, affinity); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(affinity); - node = cpu_to_node(cpu); - - /* - * For single allocations we use the caller provided - * mask otherwise we use the mask of the target cpu - */ - mask = cnt == 1 ? affinity : cpumask_of(cpu); + node = cpu_to_node(cpumask_first(affinity)); + mask = affinity; + affinity++; } desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) @@ -481,9 +480,9 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * @cnt: Number of consecutive irqs to allocate. 
* @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) - * @affinity: Optional pointer to an affinity mask which hints where the - * irq descriptors should be allocated and which default - * affinities to use + * @affinity: Optional pointer to an affinity mask array of size @cnt which + * hints where the irq descriptors should be allocated and which + * default affinities to use * * Returns the first irq number or error code */ From 44082fd6702fb12020967fd375f8bf6dd7c111bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:50 +0200 Subject: [PATCH 322/538] genirq/affinity: Remove old irq spread infrastructure No more users. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Link: http://lkml.kernel.org/r/1473862739-15032-5-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 7 ----- kernel/irq/affinity.c | 58 --------------------------------------- 2 files changed, 65 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 4e59d122cad9..72f0721f75e7 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -278,7 +278,6 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); -struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs); struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, int nvec); int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec); @@ -313,12 +312,6 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) return 0; } -static inline struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) -{ - *nr_vecs = 1; - return NULL; -} - static inline struct cpumask * irq_create_affinity_masks(const struct cpumask *affinity, int nvec) { diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 7812fecc6e2f..17f51d63da56 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -152,61 +152,3 @@ int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) put_online_cpus(); return ret; } - -static int get_first_sibling(unsigned int cpu) -{ - unsigned int ret; - - ret = cpumask_first(topology_sibling_cpumask(cpu)); - if (ret < nr_cpu_ids) - return ret; - return cpu; -} - -/* - * Take a map of online CPUs and the number of available interrupt vectors - * and generate an output cpumask suitable for spreading MSI/MSI-X vectors - * so that they are distributed as good as possible around the CPUs. If - * more vectors than CPUs are available we'll map one to each CPU, - * otherwise we map one to the first sibling of each socket. - * - * If there are more vectors than CPUs we will still only have one bit - * set per CPU, but interrupt code will keep on assigning the vectors from - * the start of the bitmap until we run out of vectors. 
- */ -struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) -{ - struct cpumask *affinity_mask; - unsigned int max_vecs = *nr_vecs; - - if (max_vecs == 1) - return NULL; - - affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!affinity_mask) { - *nr_vecs = 1; - return NULL; - } - - get_online_cpus(); - if (max_vecs >= num_online_cpus()) { - cpumask_copy(affinity_mask, cpu_online_mask); - *nr_vecs = num_online_cpus(); - } else { - unsigned int vecs = 0, cpu; - - for_each_online_cpu(cpu) { - if (cpu == get_first_sibling(cpu)) { - cpumask_set_cpu(cpu, affinity_mask); - vecs++; - } - - if (--max_vecs == 0) - break; - } - *nr_vecs = vecs; - } - put_online_cpus(); - - return affinity_mask; -} From ee8d41e53efe14bfc5ea5866e1178b06d78a7c95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:51 +0200 Subject: [PATCH 323/538] pci/msi: Retrieve affinity for a vector Add a helper to get the affinity mask for a given PCI irq vector. For MSI or MSI-X vectors these are stored by the IRQ core, while for legacy interrupts we will always return cpu_possible_map. [hch: updated to follow the style of pci_irq_vector()] Signed-off-by: Thomas Gleixner Signed-off-by: Christoph Hellwig Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Link: http://lkml.kernel.org/r/1473862739-15032-6-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- drivers/pci/msi.c | 31 +++++++++++++++++++++++++++++++ include/linux/pci.h | 6 ++++++ 2 files changed, 37 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 06100dde0e86..9da5ecb41f0b 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1270,6 +1270,37 @@ int pci_irq_vector(struct pci_dev *dev, unsigned int nr) } EXPORT_SYMBOL(pci_irq_vector); +/** + * pci_irq_get_affinity - return the affinity of a particular msi vector + * @dev: PCI device to operate on + * @nr: device-relative interrupt vector index (0-based). 
+ */ +const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr) +{ + if (dev->msix_enabled) { + struct msi_desc *entry; + int i = 0; + + for_each_pci_msi_entry(entry, dev) { + if (i == nr) + return entry->affinity; + i++; + } + WARN_ON_ONCE(1); + return NULL; + } else if (dev->msi_enabled) { + struct msi_desc *entry = first_pci_msi_entry(dev); + + if (WARN_ON_ONCE(!entry || nr >= entry->nvec_used)) + return NULL; + + return &entry->affinity[nr]; + } else { + return cpu_possible_mask; + } +} +EXPORT_SYMBOL(pci_irq_get_affinity); + struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc) { return to_pci_dev(desc->dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 0ab835965669..3b0a8004f313 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1300,6 +1300,7 @@ int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, unsigned int max_vecs, unsigned int flags); void pci_free_irq_vectors(struct pci_dev *dev); int pci_irq_vector(struct pci_dev *dev, unsigned int nr); +const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec); #else static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } @@ -1342,6 +1343,11 @@ static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr) return -EINVAL; return dev->irq; } +static inline const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, + int vec) +{ + return cpu_possible_mask; +} #endif #ifdef CONFIG_PCIEPORTBUS From 9c00390757fd9f5851f7973b2f0e1e41550bb3b8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 14 Sep 2016 21:07:41 -0500 Subject: [PATCH 324/538] x86/dumpstack: Simplify in_exception_stack() in_exception_stack() does some bad, bad things just so the unwinder can print different values for different areas of the debug exception stack. There's no need to clarify where exactly on the stack it is. Just print "#DB" and be done with it. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e91cb410169dd576678dd427c35efb716fd0cee1.1473905218.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 89 ++++++++++------------------------ 1 file changed, 26 insertions(+), 63 deletions(-) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 07373bec76f1..904fb46d7d65 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,83 +16,46 @@ #include +static char *exception_stack_names[N_EXCEPTION_STACKS] = { + [ DOUBLEFAULT_STACK-1 ] = "#DF", + [ NMI_STACK-1 ] = "NMI", + [ DEBUG_STACK-1 ] = "#DB", + [ MCE_STACK-1 ] = "#MC", +}; -#define N_EXCEPTION_STACKS_END \ - (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) - -static char x86_stack_ids[][8] = { - [ DEBUG_STACK-1 ] = "#DB", - [ NMI_STACK-1 ] = "NMI", - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ MCE_STACK-1 ] = "#MC", -#if DEBUG_STKSZ > EXCEPTION_STKSZ - [ N_EXCEPTION_STACKS ... - N_EXCEPTION_STACKS_END ] = "#DB[?]" -#endif +static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ }; static unsigned long *in_exception_stack(unsigned long stack, unsigned *usedp, char **idp) { + unsigned long begin, end; unsigned k; - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = raw_cpu_ptr(&orig_ist)->ist[k]; - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= end) + end = raw_cpu_ptr(&orig_ist)->ist[k]; + begin = end - exception_stack_sizes[k]; + + if (stack < begin || stack >= end) continue; + /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= end - EXCEPTION_STKSZ) { - /* - * Make sure we only iterate through an exception - * stack once. If it comes up for the second time - * then there's something wrong going on - just - * break out and return NULL: - */ - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = x86_stack_ids[k]; - return (unsigned long *)end; - } - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: + * Make sure we only iterate through an exception stack once. + * If it comes up for the second time then there's something + * wrong going on - just break and return NULL: */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { - unsigned j = N_EXCEPTION_STACKS - 1; + if (*usedp & (1U << k)) + break; + *usedp |= 1U << k; - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - ++j; - end -= EXCEPTION_STKSZ; - x86_stack_ids[j][4] = '1' + - (j - N_EXCEPTION_STACKS); - } while (stack < end - EXCEPTION_STKSZ); - if (*usedp & (1U << j)) - break; - *usedp |= 1U << j; - *idp = x86_stack_ids[j]; - return (unsigned long *)end; - } -#endif + *idp = exception_stack_names[k]; + return (unsigned long *)end; } + return NULL; } From cb76c93982404273d746f3ccd5085b47689099a8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 14 Sep 2016 21:07:42 -0500 Subject: [PATCH 325/538] x86/dumpstack: Add get_stack_info() interface valid_stack_ptr() is buggy: it assumes that all stacks are of size THREAD_SIZE, which is not true for exception stacks. So the walk_stack() callbacks will need to know the location of the beginning of the stack as well as the end. Another issue is that in general the various features of a stack (type, size, next stack pointer, description string) are scattered around in various places throughout the stack dump code. Encapsulate all that information in a single place with a new stack_info struct and a get_stack_info() interface. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8164dd0db96b7e6a279fa17ae5e6dc375eecb4a9.1473905218.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- arch/x86/include/asm/stacktrace.h | 41 +++++++- arch/x86/kernel/dumpstack.c | 40 +++---- arch/x86/kernel/dumpstack_32.c | 106 ++++++++++++++----- arch/x86/kernel/dumpstack_64.c | 169 ++++++++++++++++-------------- arch/x86/kernel/stacktrace.c | 2 +- arch/x86/oprofile/backtrace.c | 2 +- 7 files changed, 234 insertions(+), 128 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index c1319ac19ebb..477dc38b62b1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2251,7 +2251,7 @@ void arch_perf_update_userpage(struct perf_event *event, * callchain support */ -static int backtrace_stack(void *data, char *name) +static int backtrace_stack(void *data, const char *name) { return 0; } diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 3552f5e7189e..780a83efcfd3 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -10,6 +10,39 @@ #include #include +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_SOFTIRQ, + STACK_TYPE_EXCEPTION, + STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, +}; + +struct stack_info { + enum stack_type type; + unsigned long *begin, *end, *next_sp; +}; + +bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info); + +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask); + +void stack_type_str(enum stack_type type, const char **begin, + const char **end); + +static inline bool on_stack(struct stack_info *info, void *addr, size_t len) +{ + void *begin = info->begin; + void *end = info->end; + + return (info->type != STACK_TYPE_UNKNOWN && + addr >= begin && addr < end && + addr + len > begin && addr + len <= end); +} + extern int kstack_depth_to_print; struct thread_info; @@ -20,27 +53,27 @@ typedef unsigned long (*walk_stack_t)(struct task_struct *task, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end, + struct stack_info *info, int *graph); extern unsigned long print_context_stack(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph); + struct stack_info *info, int *graph); extern unsigned long print_context_stack_bp(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph); + struct stack_info *info, int *graph); /* Generic stack tracer with callbacks */ struct stacktrace_ops { int (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ - int (*stack)(void *data, char *name); + int (*stack)(void *data, const char *name); walk_stack_t walk_stack; }; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c6c6c39c367f..aa208e565b03 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -25,6 +25,23 @@ unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; +bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info) +{ + unsigned long *begin 
= task_stack_page(task); + unsigned long *end = task_stack_page(task) + THREAD_SIZE; + + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_TASK; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { @@ -46,24 +63,11 @@ void printk_address(unsigned long address) * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct task_struct *task, - void *p, unsigned int size, void *end) -{ - void *t = task_stack_page(task); - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p >= t && p < t + THREAD_SIZE - size; -} - unsigned long print_context_stack(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph) + struct stack_info *info, int *graph) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -75,7 +79,7 @@ print_context_stack(struct task_struct *task, PAGE_SIZE) stack = (unsigned long *)task_stack_page(task); - while (valid_stack_ptr(task, stack, sizeof(*stack), end)) { + while (on_stack(info, stack, sizeof(*stack))) { unsigned long addr = *stack; if (__kernel_text_address(addr)) { @@ -114,12 +118,12 @@ unsigned long print_context_stack_bp(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph) + struct stack_info *info, int *graph) { struct stack_frame *frame = (struct stack_frame *)bp; unsigned long *retp = &frame->return_address; - while (valid_stack_ptr(task, retp, sizeof(*retp), end)) { + while (on_stack(info, stack, sizeof(*stack) * 2)) { unsigned long addr = *retp; unsigned long real_addr; @@ -138,7 +142,7 @@ print_context_stack_bp(struct task_struct *task, } EXPORT_SYMBOL_GPL(print_context_stack_bp); -static int print_trace_stack(void *data, char *name) +static int print_trace_stack(void *data, const char *name) { printk("%s <%s> ", (char *)data, name); return 0; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index da5cd62f93ab..c92da5a4d663 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,61 +16,117 @@ #include -static void *is_irq_stack(void *p, void *irq) +void stack_type_str(enum stack_type type, const char **begin, const char **end) { - if (p < irq || p >= (irq + THREAD_SIZE)) - return NULL; - return irq + THREAD_SIZE; + switch (type) { + case STACK_TYPE_IRQ: + case STACK_TYPE_SOFTIRQ: + *begin = "IRQ"; + *end = "EOI"; + break; + default: + *begin = NULL; + *end = NULL; + } } +static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) +{ + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); + + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; + + /* + * See irq_32.c -- the next stack pointer is stored at the beginning of + * the stack. 
+ */ + info->next_sp = (unsigned long *)*begin; + + return true; +} -static void *is_hardirq_stack(unsigned long *stack) +static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - void *irq = this_cpu_read(hardirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); + + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_SOFTIRQ; + info->begin = begin; + info->end = end; + + /* + * The next stack pointer is stored at the beginning of the stack. + * See irq_32.c. + */ + info->next_sp = (unsigned long *)*begin; - return is_irq_stack(stack, irq); + return true; } -static void *is_softirq_stack(unsigned long *stack) +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) { - void *irq = this_cpu_read(softirq_stack); + if (!stack) + goto unknown; - return is_irq_stack(stack, irq); + task = task ? : current; + + if (in_task_stack(stack, task, info)) + return 0; + + if (task != current) + goto unknown; + + if (in_hardirq_stack(stack, info)) + return 0; + + if (in_softirq_stack(stack, info)) + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { + unsigned long visit_mask = 0; int graph = 0; - u32 *prev_esp; task = task ? : current; stack = stack ? : get_stack_pointer(task, regs); bp = bp ? : (unsigned long)get_frame_pointer(task, regs); for (;;) { - void *end_stack; + const char *begin_str, *end_str; + struct stack_info info; - end_stack = is_hardirq_stack(stack); - if (!end_stack) - end_stack = is_softirq_stack(stack); + if (get_stack_info(stack, task, &info, &visit_mask)) + break; - bp = ops->walk_stack(task, stack, bp, ops, data, - end_stack, &graph); + stack_type_str(info.type, &begin_str, &end_str); - /* Stop if not on irq stack */ - if (!end_stack) + if (begin_str && ops->stack(data, begin_str) < 0) break; - /* The previous esp is saved on the bottom of the stack */ - prev_esp = (u32 *)(end_stack - THREAD_SIZE); - stack = (unsigned long *)*prev_esp; - if (!stack) - break; + bp = ops->walk_stack(task, stack, bp, ops, data, &info, &graph); - if (ops->stack(data, "IRQ") < 0) + if (end_str && ops->stack(data, end_str) < 0) break; + + stack = info.next_sp; + touch_nmi_watchdog(); } } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 904fb46d7d65..41813abc7380 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -28,76 +28,109 @@ static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static unsigned long *in_exception_stack(unsigned long stack, unsigned *usedp, - char **idp) +void stack_type_str(enum stack_type type, const char **begin, const char **end) { - unsigned long begin, end; + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + + switch (type) { + case STACK_TYPE_IRQ: + *begin = "IRQ"; + *end = "EOI"; + break; + case STACK_TYPE_EXCEPTION ... 
STACK_TYPE_EXCEPTION_LAST: + *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION]; + *end = "EOE"; + break; + default: + *begin = NULL; + *end = NULL; + } +} + +static bool in_exception_stack(unsigned long *stack, struct stack_info *info, + unsigned long *visit_mask) +{ + unsigned long *begin, *end; + struct pt_regs *regs; unsigned k; BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); for (k = 0; k < N_EXCEPTION_STACKS; k++) { - end = raw_cpu_ptr(&orig_ist)->ist[k]; - begin = end - exception_stack_sizes[k]; + end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; + begin = end - (exception_stack_sizes[k] / sizeof(long)); + regs = (struct pt_regs *)end - 1; if (stack < begin || stack >= end) continue; /* - * Make sure we only iterate through an exception stack once. - * If it comes up for the second time then there's something - * wrong going on - just break and return NULL: + * Make sure we don't iterate through an exception stack more + * than once. If it comes up a second time then there's + * something wrong going on - just break out and report an + * unknown stack type. */ - if (*usedp & (1U << k)) + if (*visit_mask & (1U << k)) break; - *usedp |= 1U << k; + *visit_mask |= 1U << k; - *idp = exception_stack_names[k]; - return (unsigned long *)end; + info->type = STACK_TYPE_EXCEPTION + k; + info->begin = begin; + info->end = end; + info->next_sp = (unsigned long *)regs->sp; + + return true; } - return NULL; + return false; } -static inline int -in_irq_stack(unsigned long *stack, unsigned long *irq_stack, - unsigned long *irq_stack_end) +static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - return (stack >= irq_stack && stack < irq_stack_end); -} + unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); -enum stack_type { - STACK_IS_UNKNOWN, - STACK_IS_NORMAL, - STACK_IS_EXCEPTION, - STACK_IS_IRQ, -}; + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; + + /* + * The next stack pointer is the first thing pushed by the entry code + * after switching to the irq stack. + */ + info->next_sp = (unsigned long *)*(end - 1); + + return true; +} -static enum stack_type -analyze_stack(struct task_struct *task, unsigned long *stack, - unsigned long **stack_end, unsigned long *irq_stack, - unsigned *used, char **id) +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) { - unsigned long addr; + if (!stack) + goto unknown; - addr = ((unsigned long)stack & (~(THREAD_SIZE - 1))); - if ((unsigned long)task_stack_page(task) == addr) - return STACK_IS_NORMAL; + task = task ? 
: current; + + if (in_task_stack(stack, task, info)) + return 0; - *stack_end = in_exception_stack((unsigned long)stack, used, id); - if (*stack_end) - return STACK_IS_EXCEPTION; + if (task != current) + goto unknown; - if (!irq_stack) - return STACK_IS_NORMAL; + if (in_exception_stack(stack, info, visit_mask)) + return 0; - *stack_end = irq_stack; - irq_stack -= (IRQ_STACK_SIZE / sizeof(long)); + if (in_irq_stack(stack, info)) + return 0; - if (in_irq_stack(stack, irq_stack, *stack_end)) - return STACK_IS_IRQ; + return 0; - return STACK_IS_UNKNOWN; +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } /* @@ -111,8 +144,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { - unsigned long *irq_stack = (unsigned long *)this_cpu_read(irq_stack_ptr); - unsigned used = 0; + unsigned long visit_mask = 0; + struct stack_info info; int graph = 0; int done = 0; @@ -126,57 +159,37 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * exceptions */ while (!done) { - unsigned long *stack_end; - enum stack_type stype; - char *id; + const char *begin_str, *end_str; - stype = analyze_stack(task, stack, &stack_end, irq_stack, &used, - &id); + get_stack_info(stack, task, &info, &visit_mask); /* Default finish unless specified to continue */ done = 1; - switch (stype) { + switch (info.type) { /* Break out early if we are on the thread stack */ - case STACK_IS_NORMAL: + case STACK_TYPE_TASK: break; - case STACK_IS_EXCEPTION: + case STACK_TYPE_IRQ: + case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST: + + stack_type_str(info.type, &begin_str, &end_str); - if (ops->stack(data, id) < 0) + if (ops->stack(data, begin_str) < 0) break; bp = ops->walk_stack(task, stack, bp, ops, - data, stack_end, &graph); - ops->stack(data, "EOE"); - /* - * We link to the next stack via the - * second-to-last pointer (index -2 to end) in the - * exception stack: - */ - stack = (unsigned long *) stack_end[-2]; - done = 0; - break; + data, &info, &graph); - case STACK_IS_IRQ: + ops->stack(data, end_str); - if (ops->stack(data, "IRQ") < 0) - break; - bp = ops->walk_stack(task, stack, bp, - ops, data, stack_end, &graph); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (stack_end[-1]); - irq_stack = NULL; - ops->stack(data, "EOI"); + stack = info.next_sp; done = 0; break; - case STACK_IS_UNKNOWN: + default: ops->stack(data, "UNK"); break; } @@ -185,7 +198,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(task, stack, bp, ops, data, &info, &graph); } EXPORT_SYMBOL(dump_trace); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4738f5e0f2ab..785aef1c7ef5 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -9,7 +9,7 @@ #include #include -static int save_stack_stack(void *data, char *name) +static int save_stack_stack(void *data, const char *name) { return 0; } diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index d950f9ea9a8c..75391488130b 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -17,7 +17,7 @@ #include #include -static int backtrace_stack(void *data, char *name) +static int backtrace_stack(void *data, const char 
*name) { /* Yes, we want all stacks */ return 0; From 5fe599e02e41550c59831613a11c8ae057897c29 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 14 Sep 2016 21:07:43 -0500 Subject: [PATCH 326/538] x86/dumpstack: Add support for unwinding empty IRQ stacks When an interrupt happens in entry code while running on a software IRQ stack, and the IRQ stack was empty, regs->sp will contain the stack end address (e.g., irq_stack_ptr). If the regs are passed to dump_trace(), get_stack_info() will report STACK_TYPE_UNKNOWN, causing dump_trace() to return prematurely without trying to go to the next stack. Update the bounds checking for software interrupt stacks so that the ending address is now considered part of the stack. This means that it's now possible for the 'walk_stack' callbacks -- print_context_stack() and print_context_stack_bp() -- to be called with an empty stack. But that's fine; they're already prepared to deal with that due to their on_stack() checks. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5a5e5de92dcf11e8dc6b6e8e50ad7639d067830b.1473905218.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 12 ++++++++++-- arch/x86/kernel/dumpstack_64.c | 6 +++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c92da5a4d663..50076d4366c4 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -35,7 +35,11 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); - if (stack < begin || stack >= end) + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -56,7 +60,11 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); - if (stack < begin || stack >= end) + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 41813abc7380..2e708afe146d 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -90,7 +90,11 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); - if (stack < begin || stack >= end) + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. 
+ */ + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; From fcd709ef20a9d83bdb7524d27cd6719dac8690a0 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 14 Sep 2016 21:07:44 -0500 Subject: [PATCH 327/538] x86/dumpstack: Add recursion checking for all stacks in_exception_stack() has some recursion checking which makes sure the stack trace code never traverses a given exception stack more than once. This prevents an infinite loop if corruption somehow causes a stack's "next stack" pointer to point to itself (directly or indirectly). The recursion checking can be useful for other stacks in addition to the exception stack, so extend it to work for all stacks. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/95de5db4cfe111754845a5cef04e20630d01423f.1473905218.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 22 ++++++++++++++++++--- arch/x86/kernel/dumpstack_64.c | 35 ++++++++++++++++++---------------- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 50076d4366c4..2d65cfa5e0b4 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -89,16 +89,32 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, task = task ? : current; if (in_task_stack(stack, task, info)) - return 0; + goto recursion_check; if (task != current) goto unknown; if (in_hardirq_stack(stack, info)) - return 0; + goto recursion_check; if (in_softirq_stack(stack, info)) - return 0; + goto recursion_check; + + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) + goto unknown; + *visit_mask |= 1UL << info->type; + } + + return 0; unknown: info->type = STACK_TYPE_UNKNOWN; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 2e708afe146d..8cb6004a4dfd 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -47,8 +47,7 @@ void stack_type_str(enum stack_type type, const char **begin, const char **end) } } -static bool in_exception_stack(unsigned long *stack, struct stack_info *info, - unsigned long *visit_mask) +static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { unsigned long *begin, *end; struct pt_regs *regs; @@ -64,16 +63,6 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info, if (stack < begin || stack >= end) continue; - /* - * Make sure we don't iterate through an exception stack more - * than once. If it comes up a second time then there's - * something wrong going on - just break out and report an - * unknown stack type. - */ - if (*visit_mask & (1U << k)) - break; - *visit_mask |= 1U << k; - info->type = STACK_TYPE_EXCEPTION + k; info->begin = begin; info->end = end; @@ -119,16 +108,30 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, task = task ? 
: current; if (in_task_stack(stack, task, info)) - return 0; + goto recursion_check; if (task != current) goto unknown; - if (in_exception_stack(stack, info, visit_mask)) - return 0; + if (in_exception_stack(stack, info)) + goto recursion_check; if (in_irq_stack(stack, info)) - return 0; + goto recursion_check; + + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) + goto unknown; + *visit_mask |= 1UL << info->type; + } return 0; From b9d989c7218ac922185d82ad46f3e58b27a4bea9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 13 Sep 2016 14:29:21 -0700 Subject: [PATCH 328/538] x86/asm: Move the thread_info::status field to thread_struct Because sched.h and thread_info.h are a tangled mess, I turned in_compat_syscall() into a macro. If we had current_thread_struct() or similar and we could use it from thread_info.h, then this would be a bit cleaner. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ccc8a1b2f41f9c264a41f771bb4a6539a642ad72.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 4 ++-- arch/x86/include/asm/processor.h | 12 ++++++++++++ arch/x86/include/asm/syscall.h | 20 +++++--------------- arch/x86/include/asm/thread_info.h | 23 ++++------------------- arch/x86/kernel/asm-offsets.c | 1 - arch/x86/kernel/fpu/init.c | 1 - arch/x86/kernel/process_64.c | 4 ++-- arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/signal.c | 2 +- 9 files changed, 27 insertions(+), 42 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1433f6b4607d..871bbf975d4c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -209,7 +209,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) * special case only applies after poking regs and before the * very next return to user mode. */ - ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); + current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED); #endif user_enter_irqoff(); @@ -307,7 +307,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) unsigned int nr = (unsigned int)regs->orig_ax; #ifdef CONFIG_IA32_EMULATION - ti->status |= TS_COMPAT; + current->thread.status |= TS_COMPAT; #endif if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b22fb5a4ff3c..984a7bf17f6a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -389,6 +389,9 @@ struct thread_struct { unsigned short fsindex; unsigned short gsindex; #endif + + u32 status; /* thread synchronous flags */ + #ifdef CONFIG_X86_64 unsigned long fsbase; unsigned long gsbase; @@ -434,6 +437,15 @@ struct thread_struct { */ }; +/* + * Thread-synchronous status. + * + * This is different from the flags in that nobody else + * ever touches our thread-synchronous status, so we don't + * have to worry about atomic accesses. 
+ */ +#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ + /* * Set IOPL bits in EFLAGS from given mask */ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 4e23dd15c661..e3c95e8e61c5 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task, * TS_COMPAT is set for 32-bit syscall entries and then * remains set until we return to user mode. */ - if (task_thread_info(task)->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) /* * Sign-extend the value so (int)-EFOO becomes (long)-EFOO * and will match correctly in comparisons. @@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned long *args) { # ifdef CONFIG_IA32_EMULATION - if (task_thread_info(task)->status & TS_COMPAT) + if (task->thread.status & TS_COMPAT) switch (i) { case 0: if (!n--) break; @@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task, const unsigned long *args) { # ifdef CONFIG_IA32_EMULATION - if (task_thread_info(task)->status & TS_COMPAT) + if (task->thread.status & TS_COMPAT) switch (i) { case 0: if (!n--) break; @@ -234,18 +234,8 @@ static inline void syscall_set_arguments(struct task_struct *task, static inline int syscall_get_arch(void) { -#ifdef CONFIG_IA32_EMULATION - /* - * TS_COMPAT is set for 32-bit syscall entry and then - * remains set until we return to user mode. - * - * x32 tasks should be considered AUDIT_ARCH_X86_64. - */ - if (task_thread_info(current)->status & TS_COMPAT) - return AUDIT_ARCH_I386; -#endif - /* Both x32 and x86_64 are considered "64-bit". */ - return AUDIT_ARCH_X86_64; + /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ + return in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 494c4b5ada34..c9dcfe7c7e4b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -55,7 +55,6 @@ struct task_struct; struct thread_info { struct task_struct *task; /* main task structure */ __u32 flags; /* low level flags */ - __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ }; @@ -253,31 +252,17 @@ static inline int arch_within_stack_frames(const void * const stack, #endif -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. 
- */ -#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ #ifdef CONFIG_COMPAT #define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ #endif - #ifndef __ASSEMBLY__ -static inline bool in_ia32_syscall(void) -{ #ifdef CONFIG_X86_32 - return true; -#endif -#ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & TS_COMPAT) - return true; +#define in_ia32_syscall() true +#else +#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \ + current->thread.status & TS_COMPAT) #endif - return false; -} /* * Force syscall return via IRET by making it look as if there was diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index db3a0af9b9ec..add5f90b93d4 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -36,7 +36,6 @@ void common(void) { BLANK(); OFFSET(TI_flags, thread_info, flags); - OFFSET(TI_status, thread_info, status); BLANK(); OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 93982aebb398..2f2b8c7ccb85 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -317,7 +317,6 @@ static void __init fpu__init_system_ctx_switch(void) on_boot_cpu = 0; WARN_ON_FPU(current->thread.fpu.fpstate_active); - current_thread_info()->status = 0; if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) eagerfpu = ENABLE; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b812cd0d7889..de9acaf2d371 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -510,7 +510,7 @@ void set_personality_ia32(bool x32) current->personality &= ~READ_IMPLIES_EXEC; /* in_compat_syscall() uses the presence of the x32 syscall bit flag to determine compat status */ - current_thread_info()->status &= ~TS_COMPAT; + current->thread.status &= ~TS_COMPAT; } else { set_thread_flag(TIF_IA32); clear_thread_flag(TIF_X32); @@ -518,7 +518,7 @@ void set_personality_ia32(bool x32) current->mm->context.ia32_compat = TIF_IA32; current->personality |= force_personality32; /* Prepare the first "return" to user space */ - current_thread_info()->status |= TS_COMPAT; + current->thread.status |= TS_COMPAT; } } EXPORT_SYMBOL_GPL(set_personality_ia32); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 5b88a1b26fc7..ce94c38cf4d6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -934,7 +934,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) */ regs->orig_ax = value; if (syscall_get_nr(child, regs) >= 0) - task_thread_info(child)->status |= TS_I386_REGS_POKED; + child->thread.status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 04cb3212db2d..da20ecb5397a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -783,7 +783,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * than the tracee. 
*/ #ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI From 97245d00585d82540f4538cf72d92a1e853c7b0e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 13 Sep 2016 14:29:22 -0700 Subject: [PATCH 329/538] x86/entry: Get rid of pt_regs_to_thread_info() It was a nice optimization while it lasted, but thread_info is moving and this optimization will no longer work. Quoting Linus: Oh Gods, Andy. That pt_regs_to_thread_info() thing made me want to do unspeakable acts on a poor innocent wax figure that looked _exactly_ like you. [ Changelog written by Andy. ] Signed-off-by: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6376aa81c68798cc81631673f52bd91a3e078944.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 871bbf975d4c..bdd9cc59d20f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -31,13 +31,6 @@ #define CREATE_TRACE_POINTS #include -static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) -{ - unsigned long top_of_stack = - (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; - return (struct thread_info *)(top_of_stack - THREAD_SIZE); -} - #ifdef CONFIG_CONTEXT_TRACKING /* Called on entry from user mode with IRQs off. */ __visible inline void enter_from_user_mode(void) @@ -71,7 +64,7 @@ static long syscall_trace_enter(struct pt_regs *regs) { u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned long ret = 0; bool emulated = false; u32 work; @@ -173,18 +166,17 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) /* Disable IRQs and retry */ local_irq_disable(); - cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags); + cached_flags = READ_ONCE(current_thread_info()->flags); if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) break; - } } /* Called with IRQs disabled. 
*/ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); u32 cached_flags; if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) @@ -247,7 +239,7 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) */ __visible inline void syscall_return_slowpath(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); u32 cached_flags = READ_ONCE(ti->flags); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); @@ -270,7 +262,7 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) #ifdef CONFIG_X86_64 __visible void do_syscall_64(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned long nr = regs->orig_ax; enter_from_user_mode(); @@ -303,7 +295,7 @@ __visible void do_syscall_64(struct pt_regs *regs) */ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned int nr = (unsigned int)regs->orig_ax; #ifdef CONFIG_IA32_EMULATION From d896fa20a70c9e596438728561e058a74ed3196b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 13 Sep 2016 14:29:23 -0700 Subject: [PATCH 330/538] um/Stop conflating task_struct::stack with thread_info thread_info may move in the future, so use the accessors. [ Andy Lutomirski wrote this changelog message and changed "task_thread_info(child)->cpu" to "task_cpu(child)". ] Signed-off-by: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3439705d9838940cc82733a7335fa8c654c37db8.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/um/ptrace_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index a7ef7b131e25..5766ead6fdb9 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -194,7 +194,7 @@ int peek_user(struct task_struct *child, long addr, long data) static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + int err, n, cpu = task_cpu(child); struct user_i387_struct fpregs; err = save_i387_registers(userspace_pid[cpu], @@ -211,7 +211,7 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int n, cpu = ((struct thread_info *) child->stack)->cpu; + int n, cpu = task_cpu(child); struct user_i387_struct fpregs; n = copy_from_user(&fpregs, buf, sizeof(fpregs)); @@ -224,7 +224,7 @@ static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *c static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) { - int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + int err, n, cpu = task_cpu(child); struct user_fxsr_struct fpregs; err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs); @@ -240,7 +240,7 @@ static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct * static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) { - int n, cpu = ((struct thread_info *) child->stack)->cpu; + int n, cpu = task_cpu(child); struct user_fxsr_struct fpregs; n = copy_from_user(&fpregs, buf, sizeof(fpregs)); From c65eacbe290b8141554c71b2c94489e73ade8c8d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 13 Sep 2016 14:29:24 -0700 Subject: [PATCH 331/538] sched/core: Allow putting thread_info into task_struct If an arch opts in by setting CONFIG_THREAD_INFO_IN_TASK_STRUCT, then thread_info is defined as a single 'u32 flags' and is the first entry of task_struct. thread_info::task is removed (it serves no purpose if thread_info is embedded in task_struct), and thread_info::cpu gets its own slot in task_struct. This is heavily based on a patch written by Linus. Originally-from: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0898196f0476195ca02713691a5037a14f2aac5.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 9 +++++++++ include/linux/sched.h | 36 ++++++++++++++++++++++++++++++++++-- include/linux/thread_info.h | 15 +++++++++++++++ init/Kconfig | 7 +++++++ init/init_task.c | 7 +++++-- kernel/sched/sched.h | 4 ++++ 6 files changed, 74 insertions(+), 4 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f8834f820ec2..9c04d44eeb3c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,6 +15,8 @@ #include #include +#include + #ifdef CONFIG_SMP # define INIT_PUSHABLE_TASKS(tsk) \ .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), @@ -183,12 +185,19 @@ extern struct task_group root_task_group; # define INIT_KASAN(tsk) #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK +# define INIT_TASK_TI(tsk) .thread_info = INIT_THREAD_INFO(tsk), +#else +# define INIT_TASK_TI(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) */ #define INIT_TASK(tsk) \ { \ + INIT_TASK_TI(tsk) \ .state = 0, \ .stack = init_stack, \ .usage = ATOMIC_INIT(2), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 20f9f47bcfd0..a287e8b13549 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1458,6 +1458,13 @@ struct tlbflush_unmap_batch { }; struct task_struct { +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* + * For reasons of header soup (see current_thread_info()), this + * must be the first element of task_struct. + */ + struct thread_info thread_info; +#endif volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; atomic_t usage; @@ -1467,6 +1474,9 @@ struct task_struct { #ifdef CONFIG_SMP struct llist_node wake_entry; int on_cpu; +#ifdef CONFIG_THREAD_INFO_IN_TASK + unsigned int cpu; /* current CPU */ +#endif unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; @@ -2588,7 +2598,9 @@ extern void set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { +#ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; +#endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; @@ -3076,10 +3088,26 @@ static inline void threadgroup_change_end(struct task_struct *tsk) cgroup_threadgroup_change_end(tsk); } -#ifndef __HAVE_THREAD_FUNCTIONS +#ifdef CONFIG_THREAD_INFO_IN_TASK + +static inline struct thread_info *task_thread_info(struct task_struct *task) +{ + return &task->thread_info; +} +static inline void *task_stack_page(const struct task_struct *task) +{ + return task->stack; +} +#define setup_thread_stack(new,old) do { } while(0) +static inline unsigned long *end_of_stack(const struct task_struct *task) +{ + return task->stack; +} + +#elif !defined(__HAVE_THREAD_FUNCTIONS) #define task_thread_info(task) ((struct thread_info *)(task)->stack) -#define task_stack_page(task) ((task)->stack) +#define task_stack_page(task) ((void *)(task)->stack) static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) { @@ -3379,7 +3407,11 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) static inline unsigned int task_cpu(const struct task_struct *p) { +#ifdef CONFIG_THREAD_INFO_IN_TASK + return p->cpu; +#else return task_thread_info(p)->cpu; +#endif } static inline int task_node(const 
struct task_struct *p) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 2b5b10eed74f..e2d0fd81b1ba 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -13,6 +13,21 @@ struct timespec; struct compat_timespec; +#ifdef CONFIG_THREAD_INFO_IN_TASK +struct thread_info { + u32 flags; /* low level flags */ +}; + +#define INIT_THREAD_INFO(tsk) \ +{ \ + .flags = 0, \ +} +#endif + +#ifdef CONFIG_THREAD_INFO_IN_TASK +#define current_thread_info() ((struct thread_info *)current) +#endif + /* * System call restart block. */ diff --git a/init/Kconfig b/init/Kconfig index cac3f096050d..ec8d43894b02 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -26,6 +26,13 @@ config IRQ_WORK config BUILDTIME_EXTABLE_SORT bool +config THREAD_INFO_IN_TASK + bool + help + Select this to move thread_info off the stack into task_struct. To + make this work, an arch will need to remove all thread_info fields + except flags and fix any runtime bugs. + menu "General setup" config BROKEN diff --git a/init/init_task.c b/init/init_task.c index ba0a7f362d9e..11f83be1fa79 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -22,5 +22,8 @@ EXPORT_SYMBOL(init_task); * Initial thread structure. Alignment of this is handled by a special * linker map entry. */ -union thread_union init_thread_union __init_task_data = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = { +#ifndef CONFIG_THREAD_INFO_IN_TASK + INIT_THREAD_INFO(init_task) +#endif +}; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc5114004..3655c9625e5b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1000,7 +1000,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. */ smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + p->cpu = cpu; +#else task_thread_info(p)->cpu = cpu; +#endif p->wake_cpu = cpu; #endif } From 15f4eae70d365bba26854c90b6002aaabb18c8aa Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 13 Sep 2016 14:29:25 -0700 Subject: [PATCH 332/538] x86: Move thread_info into task_struct Now that most of the thread_info users have been cleaned up, this is straightforward. Most of this code was written by Linus. Originally-from: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a50eab40abeaec9cb9a9e3cbdeafd32190206654.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/entry/entry_64.S | 7 +++-- arch/x86/include/asm/thread_info.h | 46 ------------------------------ arch/x86/kernel/asm-offsets.c | 4 +-- arch/x86/kernel/irq_64.c | 3 +- arch/x86/kernel/process.c | 6 ++-- 6 files changed, 10 insertions(+), 57 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4c3972847c2a..2a83bc8b24c6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -157,6 +157,7 @@ config X86 select SPARSE_IRQ select SRCU select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS select X86_DEV_DMA_OPS if X86_64 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e7fba58f4d9c..2b46384b4a4f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -179,7 +179,8 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) * If we need to do entry work or if we guess we'll need to do * exit work, go straight to the slow path. */ - testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + movq PER_CPU_VAR(current_task), %r11 + testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) jnz entry_SYSCALL64_slow_path entry_SYSCALL_64_fastpath: @@ -217,7 +218,8 @@ entry_SYSCALL_64_fastpath: */ DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + movq PER_CPU_VAR(current_task), %r11 + testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) jnz 1f LOCKDEP_SYS_EXIT @@ -370,6 +372,7 @@ END(ptregs_\func) /* * %rdi: prev task * %rsi: next task + * rsi: task we're switching to */ ENTRY(__switch_to_asm) /* diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c9dcfe7c7e4b..2aaca53c0974 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -52,20 +52,6 @@ struct task_struct; #include #include -struct thread_info { - struct task_struct *task; /* main task structure */ - __u32 flags; /* low level flags */ - __u32 cpu; /* current CPU */ -}; - -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .flags = 0, \ - .cpu = 0, \ -} - -#define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) #else /* !__ASSEMBLY__ */ @@ -157,11 +143,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -static inline struct thread_info *current_thread_info(void) -{ - return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); -} - static inline unsigned long current_stack_pointer(void) { unsigned long sp; @@ -223,33 +204,6 @@ static inline int arch_within_stack_frames(const void * const stack, # define cpu_current_top_of_stack (cpu_tss + TSS_sp0) #endif -/* - * ASM operand which evaluates to a 'thread_info' address of - * the current task, if it is known that "reg" is exactly "off" - * bytes below the top of the stack currently. - * - * ( The kernel stack's size is known at build time, it is usually - * 2 or 4 pages, and the bottom of the kernel stack contains - * the thread_info structure. So to access the thread_info very - * quickly from assembly code we can calculate down from the - * top of the kernel stack to the bottom, using constant, - * build-time calculations only. 
) - * - * For example, to fetch the current thread_info->flags value into %eax - * on x86-64 defconfig kernels, in syscall entry code where RSP is - * currently at exactly SIZEOF_PTREGS bytes away from the top of the - * stack: - * - * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax - * - * will translate to: - * - * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax - * - * which is below the current RSP by almost 16K. - */ -#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) - #endif #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index add5f90b93d4..c62e015b126c 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -35,9 +35,7 @@ void common(void) { #endif BLANK(); - OFFSET(TI_flags, thread_info, flags); - - BLANK(); + OFFSET(TASK_TI_flags, task_struct, thread_info.flags); OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); BLANK(); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 4a7903714065..9ebd0b0e73d9 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -40,8 +40,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (user_mode(regs)) return; - if (regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + STACK_TOP_MARGIN && + if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && regs->sp <= curbase + THREAD_SIZE) return; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c1fa790c81cd..0b9ed8ec5226 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -549,9 +549,7 @@ unsigned long get_wchan(struct task_struct *p) * PADDING * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING * stack - * ----------- bottom = start + sizeof(thread_info) - * thread_info - * ----------- start + * ----------- bottom = start * * The tasks stack pointer points at the location where the * framepointer is stored. The data on the stack is: @@ -562,7 +560,7 @@ unsigned long get_wchan(struct task_struct *p) */ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; top -= 2 * sizeof(unsigned long); - bottom = start + sizeof(struct thread_info); + bottom = start; sp = READ_ONCE(p->thread.sp); if (sp < bottom || sp > top) From 91b7bd39e62e190700aa1398e451a6dfa6d24465 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 Sep 2016 08:42:51 +0200 Subject: [PATCH 333/538] x86/vdso: Only define prctl_map_vdso() if CONFIG_CHECKPOINT_RESTORE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... otherwise the compiler complains: arch/x86/kernel/process_64.c:528:13: warning: ‘prctl_map_vdso’ defined but not used [-Wunused-function] Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Safonov Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: gorcunov@openvz.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: oleg@redhat.com Cc: xemul@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f240a465920b..b26a0092a01d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -525,6 +525,7 @@ void set_personality_ia32(bool x32) } EXPORT_SYMBOL_GPL(set_personality_ia32); +#ifdef CONFIG_CHECKPOINT_RESTORE static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) { int ret; @@ -535,6 +536,7 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) return (long)image->size; } +#endif long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) { @@ -590,14 +592,14 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } #ifdef CONFIG_CHECKPOINT_RESTORE -#ifdef CONFIG_X86_X32 +# ifdef CONFIG_X86_X32 case ARCH_MAP_VDSO_X32: return prctl_map_vdso(&vdso_image_x32, addr); -#endif -#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +# endif +# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: return prctl_map_vdso(&vdso_image_32, addr); -#endif +# endif case ARCH_MAP_VDSO_64: return prctl_map_vdso(&vdso_image_64, addr); #endif From 3947f49302e4d1576ee58addd8d20b477faef5ea Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 Sep 2016 08:56:21 +0200 Subject: [PATCH 334/538] x86/vdso: Only define map_vdso_randomized() if CONFIG_X86_64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... otherwise the compiler complains: arch/x86/entry/vdso/vma.c:252:12: warning: ‘map_vdso_randomized’ defined but not used [-Wunused-function] But the #ifdeffery here is getting pretty ugly, so move around vdso_addr() as well to cluster the dependencies a bit more. It's still not particulary pretty though ... Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Safonov Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: gorcunov@openvz.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: oleg@redhat.com Cc: xemul@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 98 +++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 50 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 4459e73e234d..23c881caabd1 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -37,54 +37,6 @@ void __init init_vdso_image(const struct vdso_image *image) struct linux_binprm; -/* - * Put the vdso above the (randomized) stack with another randomized - * offset. This way there is no hole in the middle of address space. - * To save memory make sure it is still in the same PTE as the stack - * top. This doesn't give that many random bits. - * - * Note that this algorithm is imperfect: the distribution of the vdso - * start address within a PMD is biased toward the end. - * - * Only used for the 64-bit and x32 vdsos. - */ -static unsigned long vdso_addr(unsigned long start, unsigned len) -{ -#ifdef CONFIG_X86_32 - return 0; -#else - unsigned long addr, end; - unsigned offset; - - /* - * Round up the start address. 
It can start out unaligned as a result - * of stack start randomization. - */ - start = PAGE_ALIGN(start); - - /* Round the lowest possible end address up to a PMD boundary. */ - end = (start + len + PMD_SIZE - 1) & PMD_MASK; - if (end >= TASK_SIZE_MAX) - end = TASK_SIZE_MAX; - end -= len; - - if (end > start) { - offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); - addr = start + (offset << PAGE_SHIFT); - } else { - addr = start; - } - - /* - * Forcibly align the final address in case we have a hardware - * issue that requires alignment for performance reasons. - */ - addr = align_vdso_addr(addr); - - return addr; -#endif -} - static int vdso_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -249,12 +201,58 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) return ret; } +#ifdef CONFIG_X86_64 +/* + * Put the vdso above the (randomized) stack with another randomized + * offset. This way there is no hole in the middle of address space. + * To save memory make sure it is still in the same PTE as the stack + * top. This doesn't give that many random bits. + * + * Note that this algorithm is imperfect: the distribution of the vdso + * start address within a PMD is biased toward the end. + * + * Only used for the 64-bit and x32 vdsos. + */ +static unsigned long vdso_addr(unsigned long start, unsigned len) +{ + unsigned long addr, end; + unsigned offset; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; + if (end >= TASK_SIZE_MAX) + end = TASK_SIZE_MAX; + end -= len; + + if (end > start) { + offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + + /* + * Forcibly align the final address in case we have a hardware + * issue that requires alignment for performance reasons. + */ + addr = align_vdso_addr(addr); + + return addr; +} + static int map_vdso_randomized(const struct vdso_image *image) { - unsigned long addr = vdso_addr(current->mm->start_stack, - image->size - image->sym_vvar_start); + unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); + return map_vdso(image, addr); } +#endif int map_vdso_once(const struct vdso_image *image, unsigned long addr) { From 85d5313ed717ad60769491c7c072d23bc0a68e7a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 11:38:31 +0200 Subject: [PATCH 335/538] mac80211: reject TSPEC TIDs (TSIDs) for aggregation Since mac80211 doesn't currently support TSIDs 8-15 which can only be used after QoS TSPEC negotiation (and not even after WMM negotiation), reject attempts to set up aggregation sessions for them, which might confuse drivers. In mac80211 we do correctly handle that, but the TSIDs should never get used anyway, and drivers might not be able to handle it. 
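For clarity, the check being added amounts to the following predicate. This is an illustrative sketch only, not code from the patch; the helper name tid_usable_for_ampdu() is made up here, and it assumes IEEE80211_FIRST_TSPEC_TSID carries its usual value of 8 (802.11 uses TIDs 0-7 for ordinary QoS traffic and reserves 8-15 for TSPEC TSIDs):

	/* Illustration only: BA sessions are allowed for plain QoS TIDs,
	 * never for TSPEC TSIDs, which mac80211 does not negotiate.
	 */
	static bool tid_usable_for_ampdu(u8 tid)
	{
		return tid < IEEE80211_FIRST_TSPEC_TSID;	/* TIDs 0-7 only */
	}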
Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 8 +++++++- net/mac80211/agg-tx.c | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index a9aff6079c42..afa94687d5e1 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -261,10 +261,16 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, .timeout = timeout, .ssn = start_seq_num, }; - int i, ret = -EOPNOTSUPP; u16 status = WLAN_STATUS_REQUEST_DECLINED; + if (tid >= IEEE80211_FIRST_TSPEC_TSID) { + ht_dbg(sta->sdata, + "STA %pM requests BA session on unsupported tid %d\n", + sta->sta.addr, tid); + goto end_no_lock; + } + if (!sta->sta.ht_cap.ht_supported) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 5650c46bf91a..45319cc01121 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -584,6 +584,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) return -EINVAL; + if (WARN_ON(tid >= IEEE80211_FIRST_TSPEC_TSID)) + return -EINVAL; + ht_dbg(sdata, "Open BA session requested for %pM tid %u\n", pubsta->addr, tid); From 4bf5beef578e46393f11eb69dda7d17a065e05ff Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 14 Sep 2016 11:41:59 +0200 Subject: [PATCH 336/538] iommu/amd: Don't put completion-wait semaphore on stack The semaphore used by the AMD IOMMU to signal command completion lived on the stack until now, which was safe as the driver busy-waited on the semaphore with IRQs disabled, so the stack can't go away under the driver. But the recently introduced vmap-based stacks break this as the physical address of the semaphore can't be determinded easily anymore. The driver used the __pa() macro, but that only works in the direct-mapping. The result were Completion-Wait timeout errors seen by the IOMMU driver, breaking system boot. Since putting the semaphore on the stack is bad design anyway, move the semaphore into 'struct amd_iommu'. It is protected by the per-iommu lock and now in the direct mapping again. This fixes the Completion-Wait timeout errors and makes AMD IOMMU systems boot again with vmap-based stacks enabled. Reported-by: Borislav Petkov Signed-off-by: Joerg Roedel Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- drivers/iommu/amd_iommu.c | 51 ++++++++++++++++++++++----------- drivers/iommu/amd_iommu_types.h | 2 ++ 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 96de97a46079..4025291ea0ae 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -940,15 +940,13 @@ static void build_inv_irt(struct iommu_cmd *cmd, u16 devid) * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. 
*/ -static int iommu_queue_command_sync(struct amd_iommu *iommu, - struct iommu_cmd *cmd, - bool sync) +static int __iommu_queue_command_sync(struct amd_iommu *iommu, + struct iommu_cmd *cmd, + bool sync) { u32 left, tail, head, next_tail; - unsigned long flags; again: - spin_lock_irqsave(&iommu->lock, flags); head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); @@ -957,15 +955,14 @@ static int iommu_queue_command_sync(struct amd_iommu *iommu, if (left <= 2) { struct iommu_cmd sync_cmd; - volatile u64 sem = 0; int ret; - build_completion_wait(&sync_cmd, (u64)&sem); - copy_cmd_to_buffer(iommu, &sync_cmd, tail); + iommu->cmd_sem = 0; - spin_unlock_irqrestore(&iommu->lock, flags); + build_completion_wait(&sync_cmd, (u64)&iommu->cmd_sem); + copy_cmd_to_buffer(iommu, &sync_cmd, tail); - if ((ret = wait_on_sem(&sem)) != 0) + if ((ret = wait_on_sem(&iommu->cmd_sem)) != 0) return ret; goto again; @@ -976,9 +973,21 @@ static int iommu_queue_command_sync(struct amd_iommu *iommu, /* We need to sync now to make sure all commands are processed */ iommu->need_sync = sync; + return 0; +} + +static int iommu_queue_command_sync(struct amd_iommu *iommu, + struct iommu_cmd *cmd, + bool sync) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&iommu->lock, flags); + ret = __iommu_queue_command_sync(iommu, cmd, sync); spin_unlock_irqrestore(&iommu->lock, flags); - return 0; + return ret; } static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) @@ -993,19 +1002,29 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) static int iommu_completion_wait(struct amd_iommu *iommu) { struct iommu_cmd cmd; - volatile u64 sem = 0; + unsigned long flags; int ret; if (!iommu->need_sync) return 0; - build_completion_wait(&cmd, (u64)&sem); - ret = iommu_queue_command_sync(iommu, &cmd, false); + build_completion_wait(&cmd, (u64)&iommu->cmd_sem); + + spin_lock_irqsave(&iommu->lock, flags); + + iommu->cmd_sem = 0; + + ret = __iommu_queue_command_sync(iommu, &cmd, false); if (ret) - return ret; + goto out_unlock; + + ret = wait_on_sem(&iommu->cmd_sem); - return wait_on_sem(&sem); +out_unlock: + spin_unlock_irqrestore(&iommu->lock, flags); + + return ret; } static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index caf5e3822715..9652848e3155 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -524,6 +524,8 @@ struct amd_iommu { struct irq_domain *ir_domain; struct irq_domain *msi_domain; #endif + + volatile u64 __aligned(8) cmd_sem; }; #define ACPIHID_UID_LEN 256 From 7892a1f64a447b6f65fe2888688883b7c26d81d3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 9 Aug 2016 12:36:41 -0300 Subject: [PATCH 337/538] [media] rcar-fcp: Make sure rcar_fcp_enable() returns 0 on success When resuming from suspend-to-RAM on r8a7795/salvator-x: dpm_run_callback(): pm_genpd_resume_noirq+0x0/0x90 returns 1 PM: Device fe940000.fdp1 failed to resume noirq: error 1 dpm_run_callback(): pm_genpd_resume_noirq+0x0/0x90 returns 1 PM: Device fe944000.fdp1 failed to resume noirq: error 1 dpm_run_callback(): pm_genpd_resume_noirq+0x0/0x90 returns 1 PM: Device fe948000.fdp1 failed to resume noirq: error 1 According to its documentation, rcar_fcp_enable() returns 0 on success or a negative error code if an error occurs. 
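In other words, a caller is entitled to treat any non-zero return as a failure. A minimal sketch of that caller pattern (illustrative only; not taken verbatim from the driver, and the variable names are made up):

	int ret = rcar_fcp_enable(fcp);	/* can currently be 1, not 0, on success */
	if (ret)
		return ret;		/* a benign 1 is then reported as "error 1" above */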
Hence fdp1_pm_runtime_resume() and vsp1_pm_runtime_resume() forward its return value to their callers. However, rcar_fcp_enable() forwards the return value of pm_runtime_get_sync(), which can actually be 1 on success, leading to the resume failure above. To fix this, consider only negative values returned by pm_runtime_get_sync() to be failures. Fixes: 7b49235e83b2347c ("[media] v4l: Add Renesas R-Car FCP driver") Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/rcar-fcp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/rcar-fcp.c b/drivers/media/platform/rcar-fcp.c index 6a7bcc3028b1..bc50c69ee0c5 100644 --- a/drivers/media/platform/rcar-fcp.c +++ b/drivers/media/platform/rcar-fcp.c @@ -99,10 +99,16 @@ EXPORT_SYMBOL_GPL(rcar_fcp_put); */ int rcar_fcp_enable(struct rcar_fcp_device *fcp) { + int error; + if (!fcp) return 0; - return pm_runtime_get_sync(fcp->dev); + error = pm_runtime_get_sync(fcp->dev); + if (error < 0) + return error; + + return 0; } EXPORT_SYMBOL_GPL(rcar_fcp_enable); From 54c5ef2e93ea002dc5dd63349298b2778fe59edb Mon Sep 17 00:00:00 2001 From: Beni Lev Date: Wed, 10 Aug 2016 17:03:43 +0300 Subject: [PATCH 338/538] iwlwifi: mvm: update TX queue before making a copy of the skb Off-channel action frames (such as ANQP frames) must be sent either on the AUX queue or on the offchannel queue, otherwise the firmware will cause a SYSASSERT. In the current implementation, the queue to be used is correctly set in the original skb, but this is done after it is copied. Thus the copy remains with the original, incorrect queue. Fix this by setting the queue in the original skb before copying it. Fixes: commit 5c08b0f5026f ("iwlwifi: mvm: don't override the rate with the AMSDU len") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Beni Lev Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index c6585ab48df3..b3a87a31de30 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -513,6 +513,15 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) int hdrlen = ieee80211_hdrlen(hdr->frame_control); int queue; + /* IWL_MVM_OFFCHANNEL_QUEUE is used for ROC packets that can be used + * in 2 different types of vifs, P2P & STATION. P2P uses the offchannel + * queue. STATION (HS2.0) uses the auxiliary context of the FW, + * and hence needs to be sent on the aux queue + */ + if (IEEE80211_SKB_CB(skb)->hw_queue == IWL_MVM_OFFCHANNEL_QUEUE && + skb_info->control.vif->type == NL80211_IFTYPE_STATION) + IEEE80211_SKB_CB(skb)->hw_queue = mvm->aux_queue; + memcpy(&info, skb->cb, sizeof(info)); if (WARN_ON_ONCE(info.flags & IEEE80211_TX_CTL_AMPDU)) @@ -526,16 +535,6 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) /* This holds the amsdu headers length */ skb_info->driver_data[0] = (void *)(uintptr_t)0; - /* - * IWL_MVM_OFFCHANNEL_QUEUE is used for ROC packets that can be used - * in 2 different types of vifs, P2P & STATION. P2P uses the offchannel - * queue. 
STATION (HS2.0) uses the auxiliary context of the FW, - * and hence needs to be sent on the aux queue - */ - if (IEEE80211_SKB_CB(skb)->hw_queue == IWL_MVM_OFFCHANNEL_QUEUE && - info.control.vif->type == NL80211_IFTYPE_STATION) - IEEE80211_SKB_CB(skb)->hw_queue = mvm->aux_queue; - queue = info.hw_queue; /* From ff0071c03684485495e06f3936399eb9c93141a6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:42 -0700 Subject: [PATCH 339/538] x86/entry/64: Fix a minor comment rebase error When I rebased my thread_info changes onto Brian's switch_to() changes, I carefully checked that I fixed up all the code correctly, but I missed a comment :( Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 15f4eae70d36 ("x86: Move thread_info into task_struct") Link: http://lkml.kernel.org/r/089fe1e1cbe8b258b064fccbb1a5a5fd23861031.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2b46384b4a4f..80ab68a42621 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -372,7 +372,6 @@ END(ptregs_\func) /* * %rdi: prev task * %rsi: next task - * rsi: task we're switching to */ ENTRY(__switch_to_asm) /* From c6c314a613cd7d03fb97713e0d642b493de42e69 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:43 -0700 Subject: [PATCH 340/538] sched/core: Add try_get_task_stack() and put_task_stack() There are a few places in the kernel that access stack memory belonging to a different task. Before we can start freeing task stacks before the task_struct is freed, we need a way for those code paths to pin the stack. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/17a434f50ad3d77000104f21666575e10a9c1fbd.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 16 ++++++++++++++++ init/Kconfig | 3 +++ 2 files changed, 19 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index a287e8b13549..a95867267e9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3094,11 +3094,19 @@ static inline struct thread_info *task_thread_info(struct task_struct *task) { return &task->thread_info; } + +/* + * When accessing the stack of a non-current task that might exit, use + * try_get_task_stack() instead. task_stack_page will return a pointer + * that could get freed out from under you. 
+ */ static inline void *task_stack_page(const struct task_struct *task) { return task->stack; } + #define setup_thread_stack(new,old) do { } while(0) + static inline unsigned long *end_of_stack(const struct task_struct *task) { return task->stack; @@ -3134,6 +3142,14 @@ static inline unsigned long *end_of_stack(struct task_struct *p) } #endif + +static inline void *try_get_task_stack(struct task_struct *tsk) +{ + return task_stack_page(tsk); +} + +static inline void put_task_stack(struct task_struct *tsk) {} + #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) diff --git a/init/Kconfig b/init/Kconfig index ec8d43894b02..3b9a47fe843b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -33,6 +33,9 @@ config THREAD_INFO_IN_TASK make this work, an arch will need to remove all thread_info fields except flags and fix any runtime bugs. + One subtle change that will be needed is to use try_get_task_stack() + and put_task_stack() in save_thread_stack_tsk() and get_wchan(). + menu "General setup" config BROKEN From 23196f2e5f5d810578a772785807dcdc2b9fdce9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 15 Sep 2016 22:45:44 -0700 Subject: [PATCH 341/538] kthread: Pin the stack via try_get_task_stack()/put_task_stack() in to_live_kthread() function get_task_struct(tsk) no longer pins tsk->stack so all users of to_live_kthread() should do try_get_task_stack/put_task_stack to protect "struct kthread" which lives on kthread's stack. TODO: Kill to_live_kthread(), perhaps we can even kill "struct kthread" too, and rework kthread_stop(), it can use task_work_add() to sync with the exiting kernel thread. Message-Id: <20160629180357.GA7178@redhat.com> Signed-off-by: Oleg Nesterov Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cb9b16bbc19d4aea4507ab0552e4644c1211d130.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/kthread.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index 9ff173dca1ae..4ab4c3766a80 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -64,7 +64,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) static struct kthread *to_live_kthread(struct task_struct *k) { struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork)) + if (likely(vfork) && try_get_task_stack(k)) return __to_kthread(vfork); return NULL; } @@ -425,8 +425,10 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_live_kthread(k); - if (kthread) + if (kthread) { __kthread_unpark(k, kthread); + put_task_stack(k); + } } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -455,6 +457,7 @@ int kthread_park(struct task_struct *k) wait_for_completion(&kthread->parked); } } + put_task_stack(k); ret = 0; } return ret; @@ -490,6 +493,7 @@ int kthread_stop(struct task_struct *k) __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); + put_task_stack(k); } ret = k->exit_code; put_task_struct(k); From 1959a60182f48879635812a03a99c02231ea8677 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:45 -0700 Subject: [PATCH 342/538] x86/dumpstack: Pin the target stack when dumping it Specifically, pin the stack in save_stack_trace_tsk() and show_trace_log_lvl(). 
This will prevent a crash if the target task dies before or while dumping its stack once we start freeing task stacks early. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cf0082cde65d1941a996d026f2b2cdbfaca17bfa.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 5 +++++ arch/x86/kernel/dumpstack_64.c | 5 +++++ arch/x86/kernel/stacktrace.c | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 2d65cfa5e0b4..122f37d7bb7e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -163,6 +163,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack; int i; + if (!try_get_task_stack(task)) + return; + sp = sp ? : get_stack_pointer(task, regs); stack = sp; @@ -179,6 +182,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, } pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); + + put_task_stack(task); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 8cb6004a4dfd..16c0d5f89b5e 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -218,6 +218,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack; int i; + if (!try_get_task_stack(task)) + return; + irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); @@ -253,6 +256,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); + + put_task_stack(task); } void show_regs(struct pt_regs *regs) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 785aef1c7ef5..23fa81e24c8a 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -79,9 +79,14 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { + if (!try_get_task_stack(tsk)) + return; + dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; + + put_task_stack(tsk); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); From 74327a3e884a0ff895ba7b51d3488e6a177407b2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:46 -0700 Subject: [PATCH 343/538] x86/process: Pin the target stack in get_wchan() This will prevent a crash if get_wchan() runs after the task stack is freed. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/337aeca8614024aa4d8d9c81053bbf8fcffbe4ad.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0b9ed8ec5226..4002b475171c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -532,15 +532,18 @@ unsigned long thread_saved_pc(struct task_struct *tsk) */ unsigned long get_wchan(struct task_struct *p) { - unsigned long start, bottom, top, sp, fp, ip; + unsigned long start, bottom, top, sp, fp, ip, ret = 0; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; + if (!try_get_task_stack(p)) + return 0; + start = (unsigned long)task_stack_page(p); if (!start) - return 0; + goto out; /* * Layout of the stack page: @@ -564,16 +567,21 @@ unsigned long get_wchan(struct task_struct *p) sp = READ_ONCE(p->thread.sp); if (sp < bottom || sp > top) - return 0; + goto out; fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); do { if (fp < bottom || fp > top) - return 0; + goto out; ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) - return ip; + if (!in_sched_functions(ip)) { + ret = ip; + goto out; + } fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); } while (count++ < 16 && p->state != TASK_RUNNING); - return 0; + +out: + put_task_stack(p); + return ret; } From aa1f1a639621672b68f654dc815a7d8298ff396f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:47 -0700 Subject: [PATCH 344/538] lib/syscall: Pin the task stack in collect_syscall() This will avoid a potential read-after-free if collect_syscall() (e.g. /proc/PID/syscall) is called on an exiting task. Reported-by: Jann Horn Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/0bfd8e6d4729c97745d3781a29610a33d0a8091d.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar --- lib/syscall.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/syscall.c b/lib/syscall.c index e30e03932480..63239e097b13 100644 --- a/lib/syscall.c +++ b/lib/syscall.c @@ -7,9 +7,19 @@ static int collect_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc) { - struct pt_regs *regs = task_pt_regs(target); - if (unlikely(!regs)) + struct pt_regs *regs; + + if (!try_get_task_stack(target)) { + /* Task has no stack, so the task isn't in a syscall. */ + *callno = -1; + return 0; + } + + regs = task_pt_regs(target); + if (unlikely(!regs)) { + put_task_stack(target); return -EAGAIN; + } *sp = user_stack_pointer(regs); *pc = instruction_pointer(regs); @@ -18,6 +28,7 @@ static int collect_syscall(struct task_struct *target, long *callno, if (*callno != -1L && maxargs > 0) syscall_get_arguments(target, regs, 0, maxargs, args); + put_task_stack(target); return 0; } From 68f24b08ee892d47bdef925d676e1ae1ccc316f8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:48 -0700 Subject: [PATCH 345/538] sched/core: Free the stack early if CONFIG_THREAD_INFO_IN_TASK We currently keep every task's stack around until the task_struct itself is freed. 
This means that we keep the stack allocation alive for longer than necessary and that, under load, we free stacks in big batches whenever RCU drops the last task reference. Neither of these is good for reuse of cache-hot memory, and freeing in batches prevents us from usefully caching small numbers of vmalloced stacks. On architectures that have thread_info on the stack, we can't easily change this, but on architectures that set THREAD_INFO_IN_TASK, we can free it as soon as the task is dead. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/08ca06cde00ebed0046c5d26cbbf3fbb7ef5b812.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 4 +++- include/linux/sched.h | 14 ++++++++++++++ kernel/fork.c | 35 ++++++++++++++++++++++++++++++++++- kernel/sched/core.c | 4 ++++ 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9c04d44eeb3c..325f649d77ff 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -186,7 +186,9 @@ extern struct task_group root_task_group; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK -# define INIT_TASK_TI(tsk) .thread_info = INIT_THREAD_INFO(tsk), +# define INIT_TASK_TI(tsk) \ + .thread_info = INIT_THREAD_INFO(tsk), \ + .stack_refcount = ATOMIC_INIT(1), #else # define INIT_TASK_TI(tsk) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index a95867267e9f..abb795afc823 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,6 +1936,10 @@ struct task_struct { #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* A live task holds one reference. */ + atomic_t stack_refcount; +#endif /* CPU-specific state of this task */ struct thread_struct thread; /* @@ -3143,12 +3147,22 @@ static inline unsigned long *end_of_stack(struct task_struct *p) #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK +static inline void *try_get_task_stack(struct task_struct *tsk) +{ + return atomic_inc_not_zero(&tsk->stack_refcount) ? + task_stack_page(tsk) : NULL; +} + +extern void put_task_stack(struct task_struct *tsk); +#else static inline void *try_get_task_stack(struct task_struct *tsk) { return task_stack_page(tsk); } static inline void put_task_stack(struct task_struct *tsk) {} +#endif #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) diff --git a/kernel/fork.c b/kernel/fork.c index 0c240fd5beba..5dd0a516626d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -269,11 +269,40 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } } -void free_task(struct task_struct *tsk) +static void release_task_stack(struct task_struct *tsk) { account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); + tsk->stack = NULL; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = NULL; +#endif +} + +#ifdef CONFIG_THREAD_INFO_IN_TASK +void put_task_stack(struct task_struct *tsk) +{ + if (atomic_dec_and_test(&tsk->stack_refcount)) + release_task_stack(tsk); +} +#endif + +void free_task(struct task_struct *tsk) +{ +#ifndef CONFIG_THREAD_INFO_IN_TASK + /* + * The task is finally done with both the stack and thread_info, + * so free both. 
+ */ + release_task_stack(tsk); +#else + /* + * If the task had a separate stack allocation, it should be gone + * by now. + */ + WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); +#endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -411,6 +440,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_VMAP_STACK tsk->stack_vm_area = stack_vm_area; #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + atomic_set(&tsk->stack_refcount, 1); +#endif if (err) goto free_stack; @@ -1771,6 +1803,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: + put_task_stack(p); free_task(p); fork_out: return ERR_PTR(retval); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0b6238f18da2..23c6037e2d89 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2772,6 +2772,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) * task and put them back on the free list. */ kprobe_flush_task(prev); + + /* Task is done with its stack. */ + put_task_stack(prev); + put_task_struct(prev); } From ac496bf48d97f2503eaa353996a4dd5e4383eaf0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:49 -0700 Subject: [PATCH 346/538] fork: Optimize task creation by caching two thread stacks per CPU if CONFIG_VMAP_STACK=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vmalloc() is a bit slow, and pounding vmalloc()/vfree() will eventually force a global TLB flush. To reduce pressure on them, if CONFIG_VMAP_STACK=y, cache two thread stacks per CPU. This will let us quickly allocate a hopefully cache-hot, TLB-hot stack under heavy forking workloads (shell script style). On my silly pthread_create() benchmark, it saves about 2 µs per pthread_create()+join() with CONFIG_VMAP_STACK=y. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/94811d8e3994b2e962f88866290017d498eb069c.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 62 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 5dd0a516626d..c060c7e7c247 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -159,15 +159,41 @@ void __weak arch_release_thread_stack(unsigned long *stack) * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) + +#ifdef CONFIG_VMAP_STACK +/* + * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB + * flush. Try to minimize the number of calls by caching stacks. 
+ */ +#define NR_CACHED_STACKS 2 +static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +#endif + static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { #ifdef CONFIG_VMAP_STACK - void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP | __GFP_HIGHMEM, - PAGE_KERNEL, - 0, node, - __builtin_return_address(0)); + void *stack; + int i; + + local_irq_disable(); + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *s = this_cpu_read(cached_stacks[i]); + + if (!s) + continue; + this_cpu_write(cached_stacks[i], NULL); + + tsk->stack_vm_area = s; + local_irq_enable(); + return s->addr; + } + local_irq_enable(); + + stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP | __GFP_HIGHMEM, + PAGE_KERNEL, + 0, node, __builtin_return_address(0)); /* * We can't call find_vm_area() in interrupt context, and @@ -187,10 +213,28 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) static inline void free_thread_stack(struct task_struct *tsk) { - if (task_stack_vm_area(tsk)) +#ifdef CONFIG_VMAP_STACK + if (task_stack_vm_area(tsk)) { + unsigned long flags; + int i; + + local_irq_save(flags); + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_read(cached_stacks[i])) + continue; + + this_cpu_write(cached_stacks[i], tsk->stack_vm_area); + local_irq_restore(flags); + return; + } + local_irq_restore(flags); + vfree(tsk->stack); - else - __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); + return; + } +#endif + + __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; From 2a292822f00f7409fc0bd6b2d09efc5b8e6c9c5d Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 14 Sep 2016 13:09:24 +0200 Subject: [PATCH 347/538] net/mlx4_en: fix off by one in error handling If an error occurs in mlx4_init_eq_table the index used in the err_out_unmap label is one too big which results in a panic in mlx4_free_eq. This patch fixes the index in the error path. Signed-off-by: Sebastian Ott Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/eq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index f613977455e0..cf8f8a72a801 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -1305,8 +1305,8 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) return 0; err_out_unmap: - while (i >= 0) - mlx4_free_eq(dev, &priv->eq_table.eq[i--]); + while (i > 0) + mlx4_free_eq(dev, &priv->eq_table.eq[--i]); #ifdef CONFIG_RFS_ACCEL for (i = 1; i <= dev->caps.num_ports; i++) { if (mlx4_priv(dev)->port[i].rmap) { From 7077dc415b113ac17a6696c432bad2d66574e4fb Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Wed, 14 Sep 2016 21:29:34 +0800 Subject: [PATCH 348/538] net: ethernet: mediatek: fix module loading automatically based on MODULE_DEVICE_TABLE The device table is required to load modules based on modaliases. After adding MODULE_DEVICE_TABLE, below entries for example will be added to modules.alias: alias of:N*T*Cmediatek,mt7623-ethC* mtk_eth_soc Signed-off-by: Sean Wang Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index d9199151a83e..3743af8f1ded 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -1923,6 +1923,7 @@ const struct of_device_id of_mtk_match[] = { { .compatible = "mediatek,mt7623-eth" }, {}, }; +MODULE_DEVICE_TABLE(of, of_mtk_match); static struct platform_driver mtk_driver = { .probe = mtk_probe, From 01afd972a737879c1466a12f696601a2ce91ea84 Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Wed, 14 Sep 2016 19:06:44 +0300 Subject: [PATCH 349/538] net/ibm/emac: add set mac addr callback add realization for mac address set and remove dummy callback. Signed-off-by: Ivan Mikhaylov Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/emac/core.c | 31 ++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 4c9771d57d6e..2dfc60308648 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -980,6 +980,33 @@ static void emac_set_multicast_list(struct net_device *ndev) __emac_set_multicast_list(dev); } +static int emac_set_mac_address(struct net_device *ndev, void *sa) +{ + struct emac_instance *dev = netdev_priv(ndev); + struct sockaddr *addr = sa; + struct emac_regs __iomem *p = dev->emacp; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + mutex_lock(&dev->link_lock); + + memcpy(ndev->dev_addr, addr->sa_data, ndev->addr_len); + + emac_rx_disable(dev); + emac_tx_disable(dev); + out_be32(&p->iahr, (ndev->dev_addr[0] << 8) | ndev->dev_addr[1]); + out_be32(&p->ialr, (ndev->dev_addr[2] << 24) | + (ndev->dev_addr[3] << 16) | (ndev->dev_addr[4] << 8) | + ndev->dev_addr[5]); + emac_tx_enable(dev); + emac_rx_enable(dev); + + mutex_unlock(&dev->link_lock); + + return 0; +} + static int emac_resize_rx_ring(struct emac_instance *dev, int new_mtu) { int rx_sync_size = emac_rx_sync_size(new_mtu); @@ -2686,7 +2713,7 @@ static const struct net_device_ops emac_netdev_ops = { .ndo_do_ioctl = emac_ioctl, .ndo_tx_timeout = emac_tx_timeout, .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, + .ndo_set_mac_address = emac_set_mac_address, .ndo_start_xmit = emac_start_xmit, .ndo_change_mtu = eth_change_mtu, }; @@ -2699,7 +2726,7 @@ static const struct net_device_ops emac_gige_netdev_ops = { .ndo_do_ioctl = emac_ioctl, .ndo_tx_timeout = emac_tx_timeout, .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, + .ndo_set_mac_address = emac_set_mac_address, .ndo_start_xmit = emac_start_xmit_sg, .ndo_change_mtu = emac_change_mtu, }; From 7106a069f45b15e63d14484e72969e64798e641c Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Wed, 14 Sep 2016 19:06:45 +0300 Subject: [PATCH 350/538] net/ibm/emac: add mutex to 'set multicast list' for preventing race conditions within ioctl calls. Signed-off-by: Ivan Mikhaylov Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ibm/emac/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 2dfc60308648..7af09cbc53f0 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -977,7 +977,10 @@ static void emac_set_multicast_list(struct net_device *ndev) dev->mcast_pending = 1; return; } + + mutex_lock(&dev->link_lock); __emac_set_multicast_list(dev); + mutex_unlock(&dev->link_lock); } static int emac_set_mac_address(struct net_device *ndev, void *sa) From d6f64d725bac20df66b2eacd847fc41d7a1905e0 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Thu, 15 Sep 2016 11:40:05 +1200 Subject: [PATCH 351/538] net: VRF: Pass original iif to ip_route_input() The function ip_rcv_finish() calls l3mdev_ip_rcv(). On any VRF except the global VRF, this replaces skb->dev with the VRF master interface. When calling ip_route_input_noref() from here, the checks for forwarding look at this master device instead of the initial ingress interface. This will allow packets to be routed which normally would be dropped. For example, an interface that is not assigned an IP address should drop packets, but because the checking is against the master device, the packet will be forwarded. The fix here is to still call l3mdev_ip_rcv(), but remember the initial net_device. This is passed to the other functions within ip_rcv_finish, so they still see the original interface. Signed-off-by: Mark Tomlinson Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 4b351af3e67b..d6feabb03516 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -312,6 +312,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; + struct net_device *dev = skb->dev; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -341,7 +342,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (!skb_valid_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); + iph->tos, dev); if (unlikely(err)) { if (err == -EXDEV) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); @@ -370,7 +371,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * From bc6c03fa3cacd31b873e36ca16ef9678269deae6 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 15 Sep 2016 03:45:07 +0000 Subject: [PATCH 352/538] nfp: fix error return code in nfp_net_netdev_open() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: 73725d9dfd99 ("nfp: allocate ring SW structs dynamically") Signed-off-by: Wei Yongjun Acked-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 252e4924de0f..39dadfca84ef 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2044,12 +2044,16 @@ static int nfp_net_netdev_open(struct net_device *netdev) nn->rx_rings = kcalloc(nn->num_rx_rings, sizeof(*nn->rx_rings), GFP_KERNEL); - if (!nn->rx_rings) + if (!nn->rx_rings) { + err = -ENOMEM; goto err_free_lsc; + } nn->tx_rings = kcalloc(nn->num_tx_rings, sizeof(*nn->tx_rings), GFP_KERNEL); - if (!nn->tx_rings) + if (!nn->tx_rings) { + err = -ENOMEM; goto err_free_rx_rings; + } for (r = 0; r < nn->num_r_vecs; r++) { err = nfp_net_prepare_vector(nn, &nn->r_vecs[r], r); From e830baa9c3f0023769ba9aab19eb44c892769d87 Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 15 Sep 2016 14:39:21 +0200 Subject: [PATCH 353/538] qeth: restore device features after recovery After device recovery, only a basic set of network device features is enabled on the device. If features like checksum offloading or TSO were enabled by the user before the recovery, this results in a mismatch between the network device features, that the kernel assumes to be enabled on the device, and the features actually enabled on the device. This patch tries to restore previously set features, that require changes on the device, after the recovery of a device. In case of an error, the network device's features are changed to contain only the features that are actually turned on. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core.h | 1 + drivers/s390/net/qeth_core_main.c | 29 +++++++++++++++++++++++++++++ drivers/s390/net/qeth_l2_main.c | 3 +++ drivers/s390/net/qeth_l3_main.c | 1 + 4 files changed, 34 insertions(+) diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index bf40063de202..6d4b68c483f3 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -999,6 +999,7 @@ struct qeth_cmd_buffer *qeth_get_setassparms_cmd(struct qeth_card *, __u16, __u16, enum qeth_prot_versions); int qeth_set_features(struct net_device *, netdev_features_t); +int qeth_recover_features(struct net_device *); netdev_features_t qeth_fix_features(struct net_device *, netdev_features_t); /* exports for OSN */ diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 7dba6c8537a1..6ad5a14669e7 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -6131,6 +6131,35 @@ static int qeth_set_ipa_tso(struct qeth_card *card, int on) return rc; } +/* try to restore device features on a device after recovery */ +int qeth_recover_features(struct net_device *dev) +{ + struct qeth_card *card = dev->ml_priv; + netdev_features_t recover = dev->features; + + if (recover & NETIF_F_IP_CSUM) { + if (qeth_set_ipa_csum(card, 1, IPA_OUTBOUND_CHECKSUM)) + recover ^= NETIF_F_IP_CSUM; + } + if (recover & NETIF_F_RXCSUM) { + if (qeth_set_ipa_csum(card, 1, IPA_INBOUND_CHECKSUM)) + recover ^= NETIF_F_RXCSUM; + } + if (recover & NETIF_F_TSO) { + if (qeth_set_ipa_tso(card, 1)) + recover ^= NETIF_F_TSO; + } + + if (recover == dev->features) + return 0; + + dev_warn(&card->gdev->dev, + "Device recovery failed to restore all offload features\n"); + dev->features = recover; + return -EIO; +} 
+EXPORT_SYMBOL_GPL(qeth_recover_features); + int qeth_set_features(struct net_device *dev, netdev_features_t features) { struct qeth_card *card = dev->ml_priv; diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 7bc20c5188bc..54fd89110ac7 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1246,6 +1246,9 @@ static int __qeth_l2_set_online(struct ccwgroup_device *gdev, int recovery_mode) } /* this also sets saved unicast addresses */ qeth_l2_set_rx_mode(card->dev); + rtnl_lock(); + qeth_recover_features(card->dev); + rtnl_unlock(); } /* let user_space know that device is online */ kobject_uevent(&gdev->dev.kobj, KOBJ_CHANGE); diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 72934666fedf..2f512715403b 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3269,6 +3269,7 @@ static int __qeth_l3_set_online(struct ccwgroup_device *gdev, int recovery_mode) else dev_open(card->dev); qeth_l3_set_multicast_list(card->dev); + qeth_recover_features(card->dev); rtnl_unlock(); } qeth_trace_features(card); From 016930b88a1d6eb6e6b3287d593e13ca06986acc Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 15 Sep 2016 14:39:22 +0200 Subject: [PATCH 354/538] s390/qeth: use ip_lock for hsuid configuration qeth_l3_dev_hsuid_store() changes the ip hash table, which requires the ip_lock. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l3_sys.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c index 65645b11fc19..0e00a5ce0f00 100644 --- a/drivers/s390/net/qeth_l3_sys.c +++ b/drivers/s390/net/qeth_l3_sys.c @@ -297,7 +297,9 @@ static ssize_t qeth_l3_dev_hsuid_store(struct device *dev, addr->u.a6.pfxlen = 0; addr->type = QETH_IP_TYPE_NORMAL; + spin_lock_bh(&card->ip_lock); qeth_l3_delete_ip(card, addr); + spin_unlock_bh(&card->ip_lock); kfree(addr); } @@ -329,7 +331,10 @@ static ssize_t qeth_l3_dev_hsuid_store(struct device *dev, addr->type = QETH_IP_TYPE_NORMAL; } else return -ENOMEM; + + spin_lock_bh(&card->ip_lock); qeth_l3_add_ip(card, addr); + spin_unlock_bh(&card->ip_lock); kfree(addr); return count; From a7531c1cc09855df5e33ceefe4fdfc2d74ccab19 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 15 Sep 2016 14:39:23 +0200 Subject: [PATCH 355/538] s390/qeth: allow hsuid configuration in DOWN state The qeth IP address mapping logic has been reworked recently. It now causes problems when specifying the qeth sysfs attribute "hsuid" in DOWN state, which is allowed. Postpone registering or deregistering of IP addresses in this case. Signed-off-by: Ursula Braun Reviewed-by: Thomas Richter Signed-off-by: David S.
Miller --- drivers/s390/net/qeth_l3_main.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 2f512715403b..4ba82e12d6f9 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -257,6 +257,11 @@ int qeth_l3_delete_ip(struct qeth_card *card, struct qeth_ipaddr *tmp_addr) if (addr->in_progress) return -EINPROGRESS; + if (!qeth_card_hw_is_reachable(card)) { + addr->disp_flag = QETH_DISP_ADDR_DELETE; + return 0; + } + rc = qeth_l3_deregister_addr_entry(card, addr); hash_del(&addr->hnode); @@ -296,6 +301,11 @@ int qeth_l3_add_ip(struct qeth_card *card, struct qeth_ipaddr *tmp_addr) hash_add(card->ip_htable, &addr->hnode, qeth_l3_ipaddr_hash(addr)); + if (!qeth_card_hw_is_reachable(card)) { + addr->disp_flag = QETH_DISP_ADDR_ADD; + return 0; + } + /* qeth_l3_register_addr_entry can go to sleep * if we add a IPV4 addr. It is caused by the reason * that SETIP ipa cmd starts ARP staff for IPV4 addr. @@ -390,12 +400,16 @@ static void qeth_l3_recover_ip(struct qeth_card *card) int i; int rc; - QETH_CARD_TEXT(card, 4, "recoverip"); + QETH_CARD_TEXT(card, 4, "recovrip"); spin_lock_bh(&card->ip_lock); hash_for_each_safe(card->ip_htable, i, tmp, addr, hnode) { - if (addr->disp_flag == QETH_DISP_ADDR_ADD) { + if (addr->disp_flag == QETH_DISP_ADDR_DELETE) { + qeth_l3_deregister_addr_entry(card, addr); + hash_del(&addr->hnode); + kfree(addr); + } else if (addr->disp_flag == QETH_DISP_ADDR_ADD) { if (addr->proto == QETH_PROT_IPV4) { addr->in_progress = 1; spin_unlock_bh(&card->ip_lock); @@ -407,10 +421,8 @@ static void qeth_l3_recover_ip(struct qeth_card *card) if (!rc) { addr->disp_flag = QETH_DISP_ADDR_DO_NOTHING; - if (addr->ref_counter < 1) { + if (addr->ref_counter < 1) qeth_l3_delete_ip(card, addr); - kfree(addr); - } } else { hash_del(&addr->hnode); kfree(addr); From 903e48531e8b5d414c8f1960eacac24c31f60344 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 15 Sep 2016 14:39:24 +0200 Subject: [PATCH 356/538] qeth: check not more than 16 SBALEs on the completion queue af_iucv socket programs with HiperSockets as transport make use of the qdio completion queue. 
Running such an af_iucv socket program may result in a crash: [90341.677709] Oops: 0038 ilc:2 [#1] SMP [90341.677743] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.6.0-20160720.0.0e86ec7.5e62689.fc23.s390xperformance #1 [90341.677744] Hardware name: IBM 2964 N96 703 (LPAR) [90341.677746] task: 00000000edb79f00 ti: 00000000edb84000 task.ti: 00000000edb84000 [90341.677748] Krnl PSW : 0704d00180000000 000000000075bc50 (qeth_qdio_input_handler+0x258/0x4e0) [90341.677756] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3 Krnl GPRS: 000003d10391e900 0000000000000001 00000000e61e6000 0000000000000005 [90341.677759] 0000000000a9e6ec 5420040001a77400 0000000000000001 000000000000006f [90341.677761] 00000000e0d83f00 0000000000000003 0000000000000010 5420040001a77400 [90341.677784] 000000007ba8b000 0000000000943fd0 000000000075bc4e 00000000ed3b3c10 [90341.677793] Krnl Code: 000000000075bc42: e320cc180004 lg %r2,3096(%r12) 000000000075bc48: c0e5ffffc5cc brasl %r14,7547e0 #000000000075bc4e: 1816 lr %r1,%r6 >000000000075bc50: ba19b008 cs %r1,%r9,8(%r11) 000000000075bc54: ec180041017e cij %r1,1,8,75bcd6 000000000075bc5a: 5810b008 l %r1,8(%r11) 000000000075bc5e: ec16005c027e cij %r1,2,6,75bd16 000000000075bc64: 5090b008 st %r9,8(%r11) [90341.677807] Call Trace: [90341.677810] ([<000000000075bbc0>] qeth_qdio_input_handler+0x1c8/0x4e0) [90341.677812] ([<000000000070efbc>] qdio_kick_handler+0x124/0x2a8) [90341.677814] ([<0000000000713570>] __tiqdio_inbound_processing+0xf0/0xcd0) [90341.677818] ([<0000000000143312>] tasklet_action+0x92/0x120) [90341.677823] ([<00000000008b6e72>] __do_softirq+0x112/0x308) [90341.677824] ([<0000000000142bce>] irq_exit+0xd6/0xf8) [90341.677829] ([<000000000010b1d2>] do_IRQ+0x6a/0x88) [90341.677830] ([<00000000008b6322>] io_int_handler+0x112/0x220) [90341.677832] ([<0000000000102b2e>] enabled_wait+0x56/0xa8) [90341.677833] ([<0000000000000000>] (null)) [90341.677835] ([<0000000000102e32>] arch_cpu_idle+0x32/0x48) [90341.677838] ([<000000000018a126>] cpu_startup_entry+0x266/0x2b0) [90341.677841] ([<0000000000113b38>] smp_start_secondary+0x100/0x110) [90341.677843] ([<00000000008b68a6>] restart_int_handler+0x62/0x78) [90341.677845] ([<00000000008b6588>] psw_idle+0x3c/0x40) [90341.677846] Last Breaking-Event-Address: [90341.677848] [<00000000007547ec>] qeth_dbf_longtext+0xc/0xc0 [90341.677849] [90341.677850] Kernel panic - not syncing: Fatal exception in interrupt qeth_qdio_cq_handler() analyzes SBALs on this completion queue, but does not observe the limit of 16 SBAL elements per SBAL. This patch adds the additional check to process not more than 16 SBAL elements. Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- drivers/s390/net/qeth_core_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 6ad5a14669e7..20cf29613043 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -3619,7 +3619,8 @@ static void qeth_qdio_cq_handler(struct qeth_card *card, int e; e = 0; - while (buffer->element[e].addr) { + while ((e < QDIO_MAX_ELEMENTS_PER_BUFFER) && + buffer->element[e].addr) { unsigned long phys_aob_addr; phys_aob_addr = (unsigned long) buffer->element[e].addr; From 243f750fc6f5d8e4dec984a9a785941c67452b8f Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 15 Sep 2016 14:39:25 +0200 Subject: [PATCH 357/538] qeth: do not limit number of gso segments To reduce the need of skb_linearize() calls, gso_max_segs of qeth net_devices had been limited according to the maximum number of qdio SBAL elements. But a gso segment cannot be larger than the mtu-size, while an SBAL element can contain up to 4096 bytes. The gso_max_segs limitation limits the maximum packet size given to the qeth driver. Performance measurements with tso-enabled qeth network interfaces and mtu-size 1500 showed, that the disadvantage of smaller packets is much more severe than the advantage of fewer skb_linearize() calls. This patch gets rid of the gso_max_segs limitations in the qeth driver. Signed-off-by: Ursula Braun Reviewed-by: Thomas Richter Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l2_main.c | 1 - drivers/s390/net/qeth_l3_main.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 54fd89110ac7..2081c1895638 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1131,7 +1131,6 @@ static int qeth_l2_setup_netdev(struct qeth_card *card) qeth_l2_request_initial_mac(card); card->dev->gso_max_size = (QETH_MAX_BUFFER_ELEMENTS(card) - 1) * PAGE_SIZE; - card->dev->gso_max_segs = (QETH_MAX_BUFFER_ELEMENTS(card) - 1); SET_NETDEV_DEV(card->dev, &card->gdev->dev); netif_napi_add(card->dev, &card->napi, qeth_l2_poll, QETH_NAPI_WEIGHT); netif_carrier_off(card->dev); diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 4ba82e12d6f9..0cbbc803310f 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3148,7 +3148,6 @@ static int qeth_l3_setup_netdev(struct qeth_card *card) netif_keep_dst(card->dev); card->dev->gso_max_size = (QETH_MAX_BUFFER_ELEMENTS(card) - 1) * PAGE_SIZE; - card->dev->gso_max_segs = (QETH_MAX_BUFFER_ELEMENTS(card) - 1); SET_NETDEV_DEV(card->dev, &card->gdev->dev); netif_napi_add(card->dev, &card->napi, qeth_l3_poll, QETH_NAPI_WEIGHT); From 5722963a8e83309dad831cf6968c4c805aa342c0 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 15 Sep 2016 14:39:26 +0200 Subject: [PATCH 358/538] qeth: do not turn on SG per default According to recent performance measurements, turning on net_device feature NETIF_F_SG only behaves well, but turning on feature NETIF_F_GSO shows bad results. Since the kernel activates NETIF_F_GSO automatically as soon as the driver configures feature NETIF_F_SG, qeth should not activate feature NETIF_F_SG per default, until the qeth problems with NETIF_F_GSO are solved. Signed-off-by: Ursula Braun Reviewed-by: Thomas Richter Signed-off-by: David S. 
Miller --- drivers/s390/net/qeth_l2_main.c | 2 -- drivers/s390/net/qeth_l3_main.c | 1 - 2 files changed, 3 deletions(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 2081c1895638..bb27058fa9f0 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1124,8 +1124,6 @@ static int qeth_l2_setup_netdev(struct qeth_card *card) card->dev->hw_features |= NETIF_F_RXCSUM; card->dev->vlan_features |= NETIF_F_RXCSUM; } - /* Turn on SG per default */ - card->dev->features |= NETIF_F_SG; } card->info.broadcast_capable = 1; qeth_l2_request_initial_mac(card); diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 0cbbc803310f..c00f6db812ac 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3120,7 +3120,6 @@ static int qeth_l3_setup_netdev(struct qeth_card *card) card->dev->vlan_features = NETIF_F_SG | NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_TSO; - card->dev->features = NETIF_F_SG; } } } else if (card->info.type == QETH_CARD_TYPE_IQD) { From 732a59cb6e7faed7a40da6665a517945c95fc895 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 15 Sep 2016 14:39:27 +0200 Subject: [PATCH 359/538] s390/qeth: fix setting VIPA address commit 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback") restructured the internal address handling. This work broke setting a virtual IP address. The command echo 10.1.1.1 > /sys/bus/ccwgroup/devices//vipa/add4 fails with file exist error even if the IP address has not been set before. It turned out that the search result for the IP address search is handled incorrectly in the VIPA case. This patch fixes the setting of an virtual IP address. Signed-off-by: Thomas Richter Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l3_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index c00f6db812ac..272d9e7419be 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -701,7 +701,7 @@ int qeth_l3_add_vipa(struct qeth_card *card, enum qeth_prot_versions proto, spin_lock_bh(&card->ip_lock); - if (!qeth_l3_ip_from_hash(card, ipaddr)) + if (qeth_l3_ip_from_hash(card, ipaddr)) rc = -EEXIST; else qeth_l3_add_ip(card, ipaddr); @@ -769,7 +769,7 @@ int qeth_l3_add_rxip(struct qeth_card *card, enum qeth_prot_versions proto, spin_lock_bh(&card->ip_lock); - if (!qeth_l3_ip_from_hash(card, ipaddr)) + if (qeth_l3_ip_from_hash(card, ipaddr)) rc = -EEXIST; else qeth_l3_add_ip(card, ipaddr); From 42857cf512cb34c2c8cb50f1e766689d979d64e0 Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Thu, 15 Sep 2016 12:20:12 -0400 Subject: [PATCH 360/538] configfs: Return -EFBIG from configfs_write_bin_file. The check for writing more than cb_max_size bytes does not 'goto out' so it is a no-op which allows users to vmalloc an arbitrary amount. 
Fixes: 03607ace807b ("configfs: implement binary attributes") Cc: stable@kernel.org Signed-off-by: Phil Turnbull Signed-off-by: Christoph Hellwig --- fs/configfs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index c30cf49b69d2..2c6312db8516 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -333,6 +333,7 @@ configfs_write_bin_file(struct file *file, const char __user *buf, if (bin_attr->cb_max_size && *ppos + count > bin_attr->cb_max_size) { len = -EFBIG; + goto out; } tbuf = vmalloc(*ppos + count); From 81539169f283329fd8bc58457cc15754f683ba69 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 08:05:20 -0500 Subject: [PATCH 361/538] x86/dumpstack: Remove NULL task pointer convention show_stack_log_lvl() and friends allow a NULL pointer for the task_struct to indicate the current task. This creates confusion and can cause sneaky bugs. Instead require the caller to pass 'current' directly. This only changes the internal workings of the dumpstack code. The dump_trace() and show_stack() interfaces still allow a NULL task pointer. Those interfaces should also probably be fixed as well. Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 4 ++-- arch/x86/kernel/dumpstack.c | 4 +++- arch/x86/kernel/dumpstack_32.c | 2 +- arch/x86/kernel/dumpstack_64.c | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 780a83efcfd3..ed2be1b5ada8 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -94,7 +94,7 @@ get_frame_pointer(struct task_struct *task, struct pt_regs *regs) if (regs) return (unsigned long *)regs->bp; - if (!task || task == current) + if (task == current) return __builtin_frame_address(0); return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp; @@ -113,7 +113,7 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) if (regs) return (unsigned long *)kernel_stack_pointer(regs); - if (!task || task == current) + if (task == current) return __builtin_frame_address(0); return (unsigned long *)task->thread.sp; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index aa208e565b03..e0648f755158 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -175,11 +175,13 @@ void show_stack(struct task_struct *task, unsigned long *sp) { unsigned long bp = 0; + task = task ? : current; + /* * Stack frames below this one aren't interesting. Don't show them * if we're printing for %current. 
*/ - if (!sp && (!task || task == current)) { + if (!sp && task == current) { sp = get_stack_pointer(current, NULL); bp = (unsigned long)get_frame_pointer(current, NULL); } diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 122f37d7bb7e..4ff000811e03 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -205,7 +205,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; pr_emerg("Stack:\n"); - show_stack_log_lvl(NULL, regs, NULL, 0, KERN_EMERG); + show_stack_log_lvl(current, regs, NULL, 0, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 16c0d5f89b5e..008a29837cab 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -278,7 +278,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(NULL, regs, NULL, 0, KERN_DEFAULT); + show_stack_log_lvl(current, regs, NULL, 0, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); From cce94483e47e8e3d74cf4475dea33f9fd4b6ad9f Mon Sep 17 00:00:00 2001 From: Filipe Manco Date: Thu, 15 Sep 2016 17:10:46 +0200 Subject: [PATCH 362/538] xen-netback: fix error handling on netback_probe() In case of error during netback_probe() (e.g. an entry missing on the xenstore) netback_remove() is called on the new device, which will set the device backend state to XenbusStateClosed by calling set_backend_state(). However, the backend state wasn't initialized by netback_probe() at this point, which will cause and invalid transaction and set_backend_state() to BUG(). Initialize the backend state at the beginning of netback_probe() to XenbusStateInitialising, and create two new valid state transitions on set_backend_state(), from XenbusStateInitialising to XenbusStateClosed, and from XenbusStateInitialising to XenbusStateInitWait. Signed-off-by: Filipe Manco Acked-by: Wei Liu Signed-off-by: David S. Miller --- drivers/net/xen-netback/xenbus.c | 46 +++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 6a31f2610c23..daf4c7867102 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -271,6 +271,11 @@ static int netback_probe(struct xenbus_device *dev, be->dev = dev; dev_set_drvdata(&dev->dev, be); + be->state = XenbusStateInitialising; + err = xenbus_switch_state(dev, XenbusStateInitialising); + if (err) + goto fail; + sg = 1; do { @@ -383,11 +388,6 @@ static int netback_probe(struct xenbus_device *dev, be->hotplug_script = script; - err = xenbus_switch_state(dev, XenbusStateInitWait); - if (err) - goto fail; - - be->state = XenbusStateInitWait; /* This kicks hotplug scripts, so do it immediately. */ err = backend_create_xenvif(be); @@ -492,20 +492,20 @@ static inline void backend_switch_state(struct backend_info *be, /* Handle backend state transitions: * - * The backend state starts in InitWait and the following transitions are + * The backend state starts in Initialising and the following transitions are * allowed. * - * InitWait -> Connected - * - * ^ \ | - * | \ | - * | \ | - * | \ | - * | \ | - * | \ | - * | V V + * Initialising -> InitWait -> Connected + * \ + * \ ^ \ | + * \ | \ | + * \ | \ | + * \ | \ | + * \ | \ | + * \ | \ | + * V | V V * - * Closed <-> Closing + * Closed <-> Closing * * The state argument specifies the eventual state of the backend and the * function transitions to that state via the shortest path. 
@@ -515,6 +515,20 @@ static void set_backend_state(struct backend_info *be, { while (be->state != state) { switch (be->state) { + case XenbusStateInitialising: + switch (state) { + case XenbusStateInitWait: + case XenbusStateConnected: + case XenbusStateClosing: + backend_switch_state(be, XenbusStateInitWait); + break; + case XenbusStateClosed: + backend_switch_state(be, XenbusStateClosed); + break; + default: + BUG(); + } + break; case XenbusStateClosed: switch (state) { case XenbusStateInitWait: From ffb4d6c8508657824bcef68a36b2a0f9d8c09d10 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Sep 2016 08:12:33 -0700 Subject: [PATCH 363/538] tcp: fix overflow in __tcp_retransmit_skb() If a TCP socket gets a large write queue, an overflow can happen in a test in __tcp_retransmit_skb() preventing all retransmits. The flow then stalls and resets after timeouts. Tested: sysctl -w net.core.wmem_max=1000000000 netperf -H dest -- -s 1000000000 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bdaef7fd6e47..f53d0cca5fa4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2605,7 +2605,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) * copying overhead: fragmentation, tunneling, mangling etc. */ if (atomic_read(&sk->sk_wmem_alloc) > - min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) + min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), + sk->sk_sndbuf)) return -EAGAIN; if (skb_still_in_host_queue(sk, skb)) From 20c64d5cd5a2bdcdc8982a06cb05e5e1bd851a3d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Sep 2016 08:48:46 -0700 Subject: [PATCH 364/538] net: avoid sk_forward_alloc overflows A malicious TCP receiver, sending SACK, can force the sender to split skbs in write queue and increase its memory usage. Then, when socket is closed and its write queue purged, we might overflow sk_forward_alloc (It becomes negative) sk_mem_reclaim() does nothing in this case, and more than 2GB are leaked from TCP perspective (tcp_memory_allocated is not changed) Then warnings trigger from inet_sock_destruct() and sk_stream_kill_queues() seeing a not zero sk_forward_alloc All TCP stack can be stuck because TCP is under memory pressure. A simple fix is to preemptively reclaim from sk_mem_uncharge(). This makes sure a socket wont have more than 2 MB forward allocated, after burst and idle period. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index ff5be7e8ddea..8741988e6880 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1332,6 +1332,16 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) if (!sk_has_account(sk)) return; sk->sk_forward_alloc += size; + + /* Avoid a possible overflow. + * TCP send queues can make this happen, if sk_mem_reclaim() + * is not called and more than 2 GBytes are released at once. + * + * If we reach 2 MBytes, reclaim 1 MBytes right now, there is + * no need to hold that much forward allocation anyway. 
+ */ + if (unlikely(sk->sk_forward_alloc >= 1 << 21)) + __sk_mem_reclaim(sk, 1 << 20); } static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) From 8ab86c00e349cef9fb14719093a7f198bcc72629 Mon Sep 17 00:00:00 2001 From: "phil.turnbull@oracle.com" Date: Thu, 15 Sep 2016 12:41:44 -0400 Subject: [PATCH 365/538] irda: Free skb on irda_accept error path. skb is not freed if newsk is NULL. Rework the error path so free_skb is unconditionally called on function exit. Fixes: c3ea9fa27413 ("[IrDA] af_irda: IRDA_ASSERT cleanups") Signed-off-by: Phil Turnbull Signed-off-by: David S. Miller --- net/irda/af_irda.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 8d2f7c9b491d..ccc244406fb9 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -832,7 +832,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) struct sock *sk = sock->sk; struct irda_sock *new, *self = irda_sk(sk); struct sock *newsk; - struct sk_buff *skb; + struct sk_buff *skb = NULL; int err; err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); @@ -900,7 +900,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) err = -EPERM; /* value does not seem to make sense. -arnd */ if (!new->tsap) { pr_debug("%s(), dup failed!\n", __func__); - kfree_skb(skb); goto out; } @@ -919,7 +918,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) /* Clean up the original one to keep it in listen state */ irttp_listen(self->tsap); - kfree_skb(skb); sk->sk_ack_backlog--; newsock->state = SS_CONNECTED; @@ -927,6 +925,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) irda_connect_response(new); err = 0; out: + kfree_skb(skb); release_sock(sk); return err; } From 4496195ddd75c4ad57b783739414e69b7d79843e Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 15 Sep 2016 15:02:38 -0300 Subject: [PATCH 366/538] sctp: fix SSN comparison This function actually operates on u32 yet its parameters were declared as u16, causing integer truncation upon calling. Note in patch context that ADDIP_SERIAL_SIGN_BIT is already 32 bits. Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index efc01743b9d6..bafe2a0ab908 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -382,7 +382,7 @@ enum { ADDIP_SERIAL_SIGN_BIT = (1<<31) }; -static inline int ADDIP_SERIAL_gte(__u16 s, __u16 t) +static inline int ADDIP_SERIAL_gte(__u32 s, __u32 t) { return ((s) == (t)) || (((t) - (s)) & ADDIP_SERIAL_SIGN_BIT); } From 2835d2d9e366a2985b24051d228333bfba82f3a7 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 15 Sep 2016 22:47:51 +0200 Subject: [PATCH 367/538] bna: add missing per queue ethtool stat Commit ba5ca784 "bna: check for dma mapping errors" added, besides other things, a statistic that counts the number of DMA buffer mapping failures per Rx queue. This counter is not included in the ethtool stats output. Fixes: ba5ca784 "bna: check for dma mapping errors" Signed-off-by: Ivan Vecera Signed-off-by: David S.
Miller --- drivers/net/ethernet/brocade/bna/bnad_ethtool.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c index 0e4fdc3dd729..5671353fc7bc 100644 --- a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c +++ b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c @@ -31,7 +31,7 @@ #define BNAD_NUM_TXF_COUNTERS 12 #define BNAD_NUM_RXF_COUNTERS 10 #define BNAD_NUM_CQ_COUNTERS (3 + 5) -#define BNAD_NUM_RXQ_COUNTERS 6 +#define BNAD_NUM_RXQ_COUNTERS 7 #define BNAD_NUM_TXQ_COUNTERS 5 #define BNAD_ETHTOOL_STATS_NUM \ @@ -658,6 +658,8 @@ bnad_get_strings(struct net_device *netdev, u32 stringset, u8 *string) string += ETH_GSTRING_LEN; sprintf(string, "rxq%d_allocbuf_failed", q_num); string += ETH_GSTRING_LEN; + sprintf(string, "rxq%d_mapbuf_failed", q_num); + string += ETH_GSTRING_LEN; sprintf(string, "rxq%d_producer_index", q_num); string += ETH_GSTRING_LEN; sprintf(string, "rxq%d_consumer_index", q_num); @@ -678,6 +680,9 @@ bnad_get_strings(struct net_device *netdev, u32 stringset, u8 *string) sprintf(string, "rxq%d_allocbuf_failed", q_num); string += ETH_GSTRING_LEN; + sprintf(string, "rxq%d_mapbuf_failed", + q_num); + string += ETH_GSTRING_LEN; sprintf(string, "rxq%d_producer_index", q_num); string += ETH_GSTRING_LEN; From 37dd348270c1a48f0234354a06c0ce052b6c85b1 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 15 Sep 2016 22:47:52 +0200 Subject: [PATCH 368/538] bna: fix crash in bnad_get_strings() Commit 6e7333d "net: add rx_nohandler stat counter" added the new entry rx_nohandler into struct rtnl_link_stats64. Unfortunately the bna driver foolishly depends on the structure. It uses part of it for ethtool statistics, which in itself is not a problem, but the driver assumes its size is constant, as it defines a string for each existing entry. The problem occurs when the structure is extended, because the bna driver then needs to be modified as well. If it is not, any attempt to retrieve ethtool statistics results in a crash in bnad_get_strings(). The patch changes BNAD_ETHTOOL_STATS_NUM so it counts the real number of strings in the array and also removes rtnl_link_stats64 entries that are not used in the output and are always zero. Fixes: 6e7333d "net: add rx_nohandler stat counter" Signed-off-by: Ivan Vecera Signed-off-by: David S.
Miller --- .../net/ethernet/brocade/bna/bnad_ethtool.c | 50 +++++++++---------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c index 5671353fc7bc..31f61a744d66 100644 --- a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c +++ b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c @@ -34,12 +34,7 @@ #define BNAD_NUM_RXQ_COUNTERS 7 #define BNAD_NUM_TXQ_COUNTERS 5 -#define BNAD_ETHTOOL_STATS_NUM \ - (sizeof(struct rtnl_link_stats64) / sizeof(u64) + \ - sizeof(struct bnad_drv_stats) / sizeof(u64) + \ - offsetof(struct bfi_enet_stats, rxf_stats[0]) / sizeof(u64)) - -static const char *bnad_net_stats_strings[BNAD_ETHTOOL_STATS_NUM] = { +static const char *bnad_net_stats_strings[] = { "rx_packets", "tx_packets", "rx_bytes", @@ -50,22 +45,10 @@ static const char *bnad_net_stats_strings[BNAD_ETHTOOL_STATS_NUM] = { "tx_dropped", "multicast", "collisions", - "rx_length_errors", - "rx_over_errors", "rx_crc_errors", "rx_frame_errors", - "rx_fifo_errors", - "rx_missed_errors", - - "tx_aborted_errors", - "tx_carrier_errors", "tx_fifo_errors", - "tx_heartbeat_errors", - "tx_window_errors", - - "rx_compressed", - "tx_compressed", "netif_queue_stop", "netif_queue_wakeup", @@ -254,6 +237,8 @@ static const char *bnad_net_stats_strings[BNAD_ETHTOOL_STATS_NUM] = { "fc_tx_fid_parity_errors", }; +#define BNAD_ETHTOOL_STATS_NUM ARRAY_SIZE(bnad_net_stats_strings) + static int bnad_get_settings(struct net_device *netdev, struct ethtool_cmd *cmd) { @@ -859,9 +844,9 @@ bnad_get_ethtool_stats(struct net_device *netdev, struct ethtool_stats *stats, u64 *buf) { struct bnad *bnad = netdev_priv(netdev); - int i, j, bi; + int i, j, bi = 0; unsigned long flags; - struct rtnl_link_stats64 *net_stats64; + struct rtnl_link_stats64 net_stats64; u64 *stats64; u32 bmap; @@ -876,14 +861,25 @@ bnad_get_ethtool_stats(struct net_device *netdev, struct ethtool_stats *stats, * under the same lock */ spin_lock_irqsave(&bnad->bna_lock, flags); - bi = 0; - memset(buf, 0, stats->n_stats * sizeof(u64)); - - net_stats64 = (struct rtnl_link_stats64 *)buf; - bnad_netdev_qstats_fill(bnad, net_stats64); - bnad_netdev_hwstats_fill(bnad, net_stats64); - bi = sizeof(*net_stats64) / sizeof(u64); + memset(&net_stats64, 0, sizeof(net_stats64)); + bnad_netdev_qstats_fill(bnad, &net_stats64); + bnad_netdev_hwstats_fill(bnad, &net_stats64); + + buf[bi++] = net_stats64.rx_packets; + buf[bi++] = net_stats64.tx_packets; + buf[bi++] = net_stats64.rx_bytes; + buf[bi++] = net_stats64.tx_bytes; + buf[bi++] = net_stats64.rx_errors; + buf[bi++] = net_stats64.tx_errors; + buf[bi++] = net_stats64.rx_dropped; + buf[bi++] = net_stats64.tx_dropped; + buf[bi++] = net_stats64.multicast; + buf[bi++] = net_stats64.collisions; + buf[bi++] = net_stats64.rx_length_errors; + buf[bi++] = net_stats64.rx_crc_errors; + buf[bi++] = net_stats64.rx_frame_errors; + buf[bi++] = net_stats64.tx_fifo_errors; /* Get netif_queue_stopped from stack */ bnad->stats.drv_stats.netif_queue_stopped = netif_queue_stopped(netdev); From 6244bd651236d86f59387d43c531b5f942a92b38 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 8 Aug 2016 17:48:20 -0600 Subject: [PATCH 369/538] exynos-drm: Fix unsupported GEM memory type error message to be clear Fix unsupported GEM memory type error message to include the memory type information. 
Signed-off-by: Shuah Khan Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_fb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fb.c b/drivers/gpu/drm/exynos/exynos_drm_fb.c index e0166403b4bd..40ce841eb952 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fb.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fb.c @@ -55,11 +55,11 @@ static int check_fb_gem_memory_type(struct drm_device *drm_dev, flags = exynos_gem->flags; /* - * without iommu support, not support physically non-continuous memory - * for framebuffer. + * Physically non-contiguous memory type for framebuffer is not + * supported without IOMMU. */ if (IS_NONCONTIG_BUFFER(flags)) { - DRM_ERROR("cannot use this gem memory type for fb.\n"); + DRM_ERROR("Non-contiguous GEM memory is not supported.\n"); return -EINVAL; } From 479f12545460809cfc9093d90d6ed82d76388e97 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 31 Aug 2016 14:55:54 +0200 Subject: [PATCH 370/538] drm/exynos: fimc: fix system and runtime pm integration Use generic helpers instead of open-coding usage of runtime pm for system sleep pm, which was potentially broken for some corner cases. Signed-off-by: Marek Szyprowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_fimc.c | 29 ++---------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimc.c b/drivers/gpu/drm/exynos/exynos_drm_fimc.c index 0525c56145db..147ef0d298cb 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimc.c @@ -1753,32 +1753,6 @@ static int fimc_clk_ctrl(struct fimc_context *ctx, bool enable) return 0; } -#ifdef CONFIG_PM_SLEEP -static int fimc_suspend(struct device *dev) -{ - struct fimc_context *ctx = get_fimc_context(dev); - - DRM_DEBUG_KMS("id[%d]\n", ctx->id); - - if (pm_runtime_suspended(dev)) - return 0; - - return fimc_clk_ctrl(ctx, false); -} - -static int fimc_resume(struct device *dev) -{ - struct fimc_context *ctx = get_fimc_context(dev); - - DRM_DEBUG_KMS("id[%d]\n", ctx->id); - - if (!pm_runtime_suspended(dev)) - return fimc_clk_ctrl(ctx, true); - - return 0; -} -#endif - static int fimc_runtime_suspend(struct device *dev) { struct fimc_context *ctx = get_fimc_context(dev); @@ -1799,7 +1773,8 @@ static int fimc_runtime_resume(struct device *dev) #endif static const struct dev_pm_ops fimc_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(fimc_suspend, fimc_resume) + SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, + pm_runtime_force_resume) SET_RUNTIME_PM_OPS(fimc_runtime_suspend, fimc_runtime_resume, NULL) }; From 83bd7b20aaf499030bf857ef64de3c19309b107d Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 31 Aug 2016 14:55:55 +0200 Subject: [PATCH 371/538] drm/exynos: gsc: fix system and runtime pm integration Use generic helpers instead of open-coding usage of runtime pm for system sleep pm, which was potentially broken for some corner cases. 
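For context, the fimc, gsc, rotator and g2d patches in this group all converge on the same dev_pm_ops shape. The snippet below is only an illustrative sketch with placeholder foo_* names (it is not taken from any of these drivers): pm_runtime_force_suspend() and pm_runtime_force_resume() run the driver's runtime PM callbacks as needed during system sleep, which is what the removed hand-written suspend/resume wrappers were approximating with pm_runtime_suspended() checks.

/* Sketch only -- foo_* are placeholder names, not from these drivers. */
static int foo_runtime_suspend(struct device *dev)
{
        /* gate clocks, put the block into a low power state */
        return 0;
}

static int foo_runtime_resume(struct device *dev)
{
        /* ungate clocks, restore the block's state */
        return 0;
}

static const struct dev_pm_ops foo_pm_ops = {
        SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
                                pm_runtime_force_resume)
        SET_RUNTIME_PM_OPS(foo_runtime_suspend, foo_runtime_resume, NULL)
};

System sleep then simply drives the device through its runtime PM path, so no separate CONFIG_PM_SLEEP callbacks are needed.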
Signed-off-by: Marek Szyprowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_gsc.c | 29 ++----------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_gsc.c b/drivers/gpu/drm/exynos/exynos_drm_gsc.c index 5d20da8f957e..b1894aa9286e 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gsc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gsc.c @@ -1760,32 +1760,6 @@ static int gsc_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM_SLEEP -static int gsc_suspend(struct device *dev) -{ - struct gsc_context *ctx = get_gsc_context(dev); - - DRM_DEBUG_KMS("id[%d]\n", ctx->id); - - if (pm_runtime_suspended(dev)) - return 0; - - return gsc_clk_ctrl(ctx, false); -} - -static int gsc_resume(struct device *dev) -{ - struct gsc_context *ctx = get_gsc_context(dev); - - DRM_DEBUG_KMS("id[%d]\n", ctx->id); - - if (!pm_runtime_suspended(dev)) - return gsc_clk_ctrl(ctx, true); - - return 0; -} -#endif - #ifdef CONFIG_PM static int gsc_runtime_suspend(struct device *dev) { @@ -1807,7 +1781,8 @@ static int gsc_runtime_resume(struct device *dev) #endif static const struct dev_pm_ops gsc_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(gsc_suspend, gsc_resume) + SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, + pm_runtime_force_resume) SET_RUNTIME_PM_OPS(gsc_runtime_suspend, gsc_runtime_resume, NULL) }; From 5b67723e6096f5470f361656cd108430d3b12c67 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 31 Aug 2016 14:55:56 +0200 Subject: [PATCH 372/538] drm/exynos: rotator: fix system and runtime pm integration Use generic helpers instead of open-coding usage of runtime pm for system sleep pm, which was potentially broken for some corner cases. Signed-off-by: Marek Szyprowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_rotator.c | 26 ++------------------- 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_rotator.c b/drivers/gpu/drm/exynos/exynos_drm_rotator.c index 404367a430b5..6591e406084c 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_rotator.c +++ b/drivers/gpu/drm/exynos/exynos_drm_rotator.c @@ -794,29 +794,6 @@ static int rotator_clk_crtl(struct rot_context *rot, bool enable) return 0; } - -#ifdef CONFIG_PM_SLEEP -static int rotator_suspend(struct device *dev) -{ - struct rot_context *rot = dev_get_drvdata(dev); - - if (pm_runtime_suspended(dev)) - return 0; - - return rotator_clk_crtl(rot, false); -} - -static int rotator_resume(struct device *dev) -{ - struct rot_context *rot = dev_get_drvdata(dev); - - if (!pm_runtime_suspended(dev)) - return rotator_clk_crtl(rot, true); - - return 0; -} -#endif - static int rotator_runtime_suspend(struct device *dev) { struct rot_context *rot = dev_get_drvdata(dev); @@ -833,7 +810,8 @@ static int rotator_runtime_resume(struct device *dev) #endif static const struct dev_pm_ops rotator_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(rotator_suspend, rotator_resume) + SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, + pm_runtime_force_resume) SET_RUNTIME_PM_OPS(rotator_runtime_suspend, rotator_runtime_resume, NULL) }; From b05984e21a7e000bf5074ace00d7a574944b2c16 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 31 Aug 2016 14:55:57 +0200 Subject: [PATCH 373/538] drm/exynos: g2d: fix system and runtime pm integration Move code from system sleep pm to runtime pm callbacks to ensure proper driver state preservation when device is under power domain. Then, use generic helpers for using runtime pm for system sleep pm. 
Signed-off-by: Marek Szyprowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_g2d.c | 29 ++++++------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 4bf00f57ffe8..6eca8bb88648 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -1475,8 +1475,8 @@ static int g2d_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM_SLEEP -static int g2d_suspend(struct device *dev) +#ifdef CONFIG_PM +static int g2d_runtime_suspend(struct device *dev) { struct g2d_data *g2d = dev_get_drvdata(dev); @@ -1490,25 +1490,6 @@ static int g2d_suspend(struct device *dev) flush_work(&g2d->runqueue_work); - return 0; -} - -static int g2d_resume(struct device *dev) -{ - struct g2d_data *g2d = dev_get_drvdata(dev); - - g2d->suspended = false; - g2d_exec_runqueue(g2d); - - return 0; -} -#endif - -#ifdef CONFIG_PM -static int g2d_runtime_suspend(struct device *dev) -{ - struct g2d_data *g2d = dev_get_drvdata(dev); - clk_disable_unprepare(g2d->gate_clk); return 0; @@ -1523,12 +1504,16 @@ static int g2d_runtime_resume(struct device *dev) if (ret < 0) dev_warn(dev, "failed to enable clock.\n"); + g2d->suspended = false; + g2d_exec_runqueue(g2d); + return ret; } #endif static const struct dev_pm_ops g2d_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(g2d_suspend, g2d_resume) + SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, + pm_runtime_force_resume) SET_RUNTIME_PM_OPS(g2d_runtime_suspend, g2d_runtime_resume, NULL) }; From 4158dbe1be9b420e1fdd9ec5c033647a605ca485 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 18 Sep 2016 22:51:38 +0900 Subject: [PATCH 374/538] Subject: [PATCH, RESEND] drm: exynos: avoid unused function warning When CONFIG_PM is not set, we get a warning about an unused function: drivers/gpu/drm/exynos/exynos_drm_gsc.c:1219:12: error: 'gsc_clk_ctrl' defined but not used [-Werror=unused-function] static int gsc_clk_ctrl(struct gsc_context *ctx, bool enable) ^~~~~~~~~~~~ This removes the two #ifdef checks in this file and instead marks the functions as __maybe_unused, which is a more reliable way of doing the same, allowing better build coverage and avoiding the warning above. 
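To illustrate the two styles the patch is choosing between, here is a minimal sketch; bar_runtime_suspend() is a made-up name, not the exynos function.

/*
 * Old style: the function disappears from the build whenever the #ifdef
 * condition is false, so the guard must exactly match every reference to
 * the function or the compiler warns about an unused function (or the
 * build fails on a missing symbol).
 */
#ifdef CONFIG_PM
static int bar_runtime_suspend(struct device *dev)
{
        return 0;
}
#endif

/*
 * Preferred style: the function is always compiled (keeping build
 * coverage) and __maybe_unused only tells the compiler not to warn when
 * no configuration ends up referencing it; the linker discards it if it
 * is truly unused.
 */
static int __maybe_unused bar_runtime_suspend(struct device *dev)
{
        return 0;
}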
Signed-off-by: Arnd Bergmann Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_gsc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_gsc.c b/drivers/gpu/drm/exynos/exynos_drm_gsc.c index b1894aa9286e..52a9d269484e 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gsc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gsc.c @@ -1760,8 +1760,7 @@ static int gsc_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM -static int gsc_runtime_suspend(struct device *dev) +static int __maybe_unused gsc_runtime_suspend(struct device *dev) { struct gsc_context *ctx = get_gsc_context(dev); @@ -1770,7 +1769,7 @@ static int gsc_runtime_suspend(struct device *dev) return gsc_clk_ctrl(ctx, false); } -static int gsc_runtime_resume(struct device *dev) +static int __maybe_unused gsc_runtime_resume(struct device *dev) { struct gsc_context *ctx = get_gsc_context(dev); @@ -1778,7 +1777,6 @@ static int gsc_runtime_resume(struct device *dev) return gsc_clk_ctrl(ctx, true); } -#endif static const struct dev_pm_ops gsc_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, From 19cd120319ef5390404a5d9c829c3a7962f184a8 Mon Sep 17 00:00:00 2001 From: Giuseppe CAVALLARO Date: Fri, 16 Sep 2016 10:50:13 +0200 Subject: [PATCH 375/538] stmmac: fix PWRDWN into the PMT register for global unicast. MAC devices use the RWKPKTEN and MGKPKTEN bits of the PMT Control/Status register to generate power management events. So this patch is to properly set the RWKPKTEN [BIT(2)] inside the PMT register (needed in case of global unicast). Reported-by: Aditi SHARMA Signed-off-by: Giuseppe Cavallaro Cc: Alexandre TORGUE Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index cbefe9e2207c..885a5e64519d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -261,7 +261,7 @@ static void dwmac1000_pmt(struct mac_device_info *hw, unsigned long mode) } if (mode & WAKE_UCAST) { pr_debug("GMAC: WOL on global unicast\n"); - pmt |= global_unicast; + pmt |= power_down | global_unicast | wake_up_frame_en; } writel(pmt, ioaddr + GMAC_PMT); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index df5580dcdfed..51019b794be5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -102,7 +102,7 @@ static void dwmac4_pmt(struct mac_device_info *hw, unsigned long mode) } if (mode & WAKE_UCAST) { pr_debug("GMAC: WOL on global unicast\n"); - pmt |= global_unicast; + pmt |= power_down | global_unicast | wake_up_frame_en; } writel(pmt, ioaddr + GMAC_PMT); From 47a66e45d7a7613322549c2475ea9d809baaf514 Mon Sep 17 00:00:00 2001 From: "Kristian H. Kristensen" Date: Tue, 13 Sep 2016 14:20:45 -0700 Subject: [PATCH 376/538] drm: Only use compat ioctl for addfb2 on X86/IA64 Similar to struct drm_update_draw, struct drm_mode_fb_cmd2 has an unaligned 64 bit field (modifier). This get packed differently between 32 bit and 64 bit modes on architectures that can handle unaligned 64 bit access (X86 and IA64). Other architectures pack the structs the same and don't need the compat wrapper. 
Use the same condition for drm_mode_fb_cmd2 as we use for drm_update_draw. Note that only the modifier will be packed differently between compat and non-compat versions. Reviewed-by: Rob Clark Signed-off-by: Kristian H. Kristensen [seanpaul added not at bottom of commit msg re: modifier] Signed-off-by: Sean Paul Link: http://patchwork.freedesktop.org/patch/msgid/1473801645-116011-1-git-send-email-hoegsberg@chromium.org Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_ioc32.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/drm_ioc32.c b/drivers/gpu/drm/drm_ioc32.c index 57676f8d7ecf..a6289752be16 100644 --- a/drivers/gpu/drm/drm_ioc32.c +++ b/drivers/gpu/drm/drm_ioc32.c @@ -1015,6 +1015,7 @@ static int compat_drm_wait_vblank(struct file *file, unsigned int cmd, return 0; } +#if defined(CONFIG_X86) || defined(CONFIG_IA64) typedef struct drm_mode_fb_cmd232 { u32 fb_id; u32 width; @@ -1071,6 +1072,7 @@ static int compat_drm_mode_addfb2(struct file *file, unsigned int cmd, return 0; } +#endif static drm_ioctl_compat_t *drm_compat_ioctls[] = { [DRM_IOCTL_NR(DRM_IOCTL_VERSION32)] = compat_drm_version, @@ -1104,7 +1106,9 @@ static drm_ioctl_compat_t *drm_compat_ioctls[] = { [DRM_IOCTL_NR(DRM_IOCTL_UPDATE_DRAW32)] = compat_drm_update_draw, #endif [DRM_IOCTL_NR(DRM_IOCTL_WAIT_VBLANK32)] = compat_drm_wait_vblank, +#if defined(CONFIG_X86) || defined(CONFIG_IA64) [DRM_IOCTL_NR(DRM_IOCTL_MODE_ADDFB232)] = compat_drm_mode_addfb2, +#endif }; /** From 1984e075915cbae65336a99b1879865080d8e55e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Sep 2016 09:49:27 +0100 Subject: [PATCH 377/538] genirq: Skip chained interrupt trigger setup if type is IRQ_TYPE_NONE There is no point in trying to configure the trigger of a chained interrupt if no trigger information has been configured. At best this is ignored, and at the worse this confuses the underlying irqchip (which is likely not to handle such a thing), and unnecessarily alarms the user. Only apply the configuration if type is not IRQ_TYPE_NONE. Fixes: 1e12c4a9393b ("genirq: Correctly configure the trigger on chained interrupts") Reported-and-tested-by: Geert Uytterhoeven Signed-off-by: Marc Zyngier Link: https://lkml.kernel.org/r/CAMuHMdVW1eTn20=EtYcJ8hkVwohaSuH_yQXrY2MGBEvZ8fpFOg@mail.gmail.com Link: http://lkml.kernel.org/r/1474274967-15984-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 637389088b3f..26ba5654d9d5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -820,6 +820,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, desc->name = name; if (handle != handle_bad_irq && is_chained) { + unsigned int type = irqd_get_trigger_type(&desc->irq_data); + /* * We're about to start this interrupt immediately, * hence the need to set the trigger configuration. @@ -828,8 +830,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, * chained interrupt. Reset it immediately because we * do know better. 
*/ - __irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data)); - desc->handle_irq = handle; + if (type != IRQ_TYPE_NONE) { + __irq_set_trigger(desc, type); + desc->handle_irq = handle; + } irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); From 7a353289925f01cb188ebc6fc4f4a33456b7de44 Mon Sep 17 00:00:00 2001 From: RogerCC Lin Date: Mon, 19 Sep 2016 10:53:25 +0800 Subject: [PATCH 378/538] mtd: nand: fix generating over-boundary ECC data when writing When mtk_ecc_encode() is writing the ECC parity data to the OOB region,because each register is 4 bytes in length,but the len's unit is in bytes,the operation in the for loop will cross the ECC's boundary. Signed-off-by: RogerCC Lin Fixes: 1d6b1e464950 ("mtd: mediatek: driver for MTK Smart Device") Signed-off-by: Boris Brezillon --- drivers/mtd/nand/mtk_ecc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/nand/mtk_ecc.c b/drivers/mtd/nand/mtk_ecc.c index 25a4fbd4d24a..d54f666417e1 100644 --- a/drivers/mtd/nand/mtk_ecc.c +++ b/drivers/mtd/nand/mtk_ecc.c @@ -366,7 +366,8 @@ int mtk_ecc_encode(struct mtk_ecc *ecc, struct mtk_ecc_config *config, u8 *data, u32 bytes) { dma_addr_t addr; - u32 *p, len, i; + u8 *p; + u32 len, i, val; int ret = 0; addr = dma_map_single(ecc->dev, data, bytes, DMA_TO_DEVICE); @@ -392,11 +393,14 @@ int mtk_ecc_encode(struct mtk_ecc *ecc, struct mtk_ecc_config *config, /* Program ECC bytes to OOB: per sector oob = FDM + ECC + SPARE */ len = (config->strength * ECC_PARITY_BITS + 7) >> 3; - p = (u32 *)(data + bytes); + p = data + bytes; /* write the parity bytes generated by the ECC back to the OOB region */ - for (i = 0; i < len; i++) - p[i] = readl(ecc->regs + ECC_ENCPAR(i)); + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = readl(ecc->regs + ECC_ENCPAR(i / 4)); + p[i] = (val >> ((i % 4) * 8)) & 0xff; + } timeout: dma_unmap_single(ecc->dev, addr, bytes, DMA_TO_DEVICE); From 559e58e7ed2dadc310f174e609ead8a3e8acfc4e Mon Sep 17 00:00:00 2001 From: RogerCC Lin Date: Mon, 19 Sep 2016 10:53:26 +0800 Subject: [PATCH 379/538] mtd: nand: fix chances to create incomplete ECC data when writing When mtk_nfc_do_write_page() comparing the sector number,because the sector number field is at the 12th-bit position of NFI_BYTELEN register,the masked register should be shifted 12 bits before being compared.The result of this bug may cause the second subpage has incomplete ECC parity bytes. 
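A small worked example makes the failure mode concrete. The numbers below are illustrative only and assume chip->ecc.steps == 2 with the controller having completed just one sector; CNTR_MASK is the kernel's GENMASK(16, 12), i.e. 0x1f000.

/* Illustration only -- not part of the patch. */
static bool write_done_old(u32 reg, u32 steps)
{
        /* count of 1 reads back as 0x1000, so 4096 >= 2 is true too early */
        return (reg & GENMASK(16, 12)) >= steps;
}

static bool write_done_fixed(u32 reg, u32 steps)
{
        /* shifted value is 1, so 1 >= 2 keeps the poll waiting for sector 2 */
        return ((reg & GENMASK(16, 12)) >> 12) >= steps;
}

So the old poll condition becomes true as soon as the first sector completes, matching the incomplete-second-subpage symptom described above.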
Signed-off-by: RogerCC Lin Fixes: 1d6b1e464950 ("mtd: mediatek: driver for MTK Smart Device") Signed-off-by: Boris Brezillon --- drivers/mtd/nand/mtk_nand.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/mtk_nand.c b/drivers/mtd/nand/mtk_nand.c index ddaa2acb9dd7..5223a2182ee4 100644 --- a/drivers/mtd/nand/mtk_nand.c +++ b/drivers/mtd/nand/mtk_nand.c @@ -93,6 +93,9 @@ #define NFI_FSM_MASK (0xf << 16) #define NFI_ADDRCNTR (0x70) #define CNTR_MASK GENMASK(16, 12) +#define ADDRCNTR_SEC_SHIFT (12) +#define ADDRCNTR_SEC(val) \ + (((val) & CNTR_MASK) >> ADDRCNTR_SEC_SHIFT) #define NFI_STRADDR (0x80) #define NFI_BYTELEN (0x84) #define NFI_CSEL (0x90) @@ -699,7 +702,7 @@ static int mtk_nfc_do_write_page(struct mtd_info *mtd, struct nand_chip *chip, } ret = readl_poll_timeout_atomic(nfc->regs + NFI_ADDRCNTR, reg, - (reg & CNTR_MASK) >= chip->ecc.steps, + ADDRCNTR_SEC(reg) >= chip->ecc.steps, 10, MTK_TIMEOUT); if (ret) dev_err(dev, "hwecc write timeout\n"); @@ -902,7 +905,7 @@ static int mtk_nfc_read_subpage(struct mtd_info *mtd, struct nand_chip *chip, dev_warn(nfc->dev, "read ahb/dma done timeout\n"); rc = readl_poll_timeout_atomic(nfc->regs + NFI_BYTELEN, reg, - (reg & CNTR_MASK) >= sectors, 10, + ADDRCNTR_SEC(reg) >= sectors, 10, MTK_TIMEOUT); if (rc < 0) { dev_err(nfc->dev, "subpage done timeout\n"); From 38178e7b88dcbe1ab384f27a7370074e774dda81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lothar=20Wa=C3=9Fmann?= Date: Mon, 19 Sep 2016 11:09:40 +0200 Subject: [PATCH 380/538] mtd: nand: mxc: fix obiwan error in mxc_nand_v[12]_ooblayout_free() functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a894cf6c5a82 ("mtd: nand: mxc: switch to mtd_ooblayout_ops") introduced a regression accessing the OOB area from the mxc_nand driver due to an Obiwan error in the mxc_nand_v[12]_ooblayout_free() functions. They report a bogus oobregion { 64, 7 } which leads to errors accessing bogus data when reading the oob area. Prior to the commit the mtd-oobtest module could be run without any errors. With the offending commit, this test fails with results like: |Running mtd-oobtest | |================================================= |mtd_oobtest: MTD device: 5 |mtd_oobtest: MTD device size 524288, eraseblock size 131072, page size 2048, count of eraseblocks 4, pages per eraseblock 64, OOB size 64 |mtd_test: scanning for bad eraseblocks |mtd_test: scanned 4 eraseblocks, 0 are bad |mtd_oobtest: test 1 of 5 |mtd_oobtest: writing OOBs of whole device |mtd_oobtest: written up to eraseblock 0 |mtd_oobtest: written 4 eraseblocks |mtd_oobtest: verifying all eraseblocks |mtd_oobtest: error @addr[0x0:0x19] 0x9a -> 0x78 diff 0xe2 |mtd_oobtest: error @addr[0x0:0x1a] 0xcc -> 0x0 diff 0xcc |mtd_oobtest: error @addr[0x0:0x1b] 0xe0 -> 0x85 diff 0x65 |mtd_oobtest: error @addr[0x0:0x1c] 0x60 -> 0x62 diff 0x2 |mtd_oobtest: error @addr[0x0:0x1d] 0x69 -> 0x45 diff 0x2c |mtd_oobtest: error @addr[0x0:0x1e] 0xcd -> 0xa0 diff 0x6d |mtd_oobtest: error @addr[0x0:0x1f] 0xf2 -> 0x60 diff 0x92 |mtd_oobtest: error: verify failed at 0x0 [...] 
Signed-off-by: Lothar Waßmann Fixes: a894cf6c5a82 ("mtd: nand: mxc: switch to mtd_ooblayout_ops") Cc: Signed-off-by: Boris Brezillon --- drivers/mtd/nand/mxc_nand.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c index 5173fadc9a4e..57cbe2b83849 100644 --- a/drivers/mtd/nand/mxc_nand.c +++ b/drivers/mtd/nand/mxc_nand.c @@ -943,7 +943,7 @@ static int mxc_v2_ooblayout_free(struct mtd_info *mtd, int section, struct nand_chip *nand_chip = mtd_to_nand(mtd); int stepsize = nand_chip->ecc.bytes == 9 ? 16 : 26; - if (section > nand_chip->ecc.steps) + if (section >= nand_chip->ecc.steps) return -ERANGE; if (!section) { From b588479358ce26f32138e0f0a7ab0678f8e3e601 Mon Sep 17 00:00:00 2001 From: Ilan Tayari Date: Sun, 18 Sep 2016 07:42:53 +0000 Subject: [PATCH 381/538] xfrm: Fix memory leak of aead algorithm name commit 1a6509d99122 ("[IPSEC]: Add support for combined mode algorithms") introduced aead. The function attach_aead kmemdup()s the algorithm name during xfrm_state_construct(). However this memory is never freed. Implementation has since been slightly modified in commit ee5c23176fcc ("xfrm: Clone states properly on migration") without resolving this leak. This patch adds a kfree() call for the aead algorithm name. Fixes: 1a6509d99122 ("[IPSEC]: Add support for combined mode algorithms") Signed-off-by: Ilan Tayari Acked-by: Rami Rosen Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9895a8c56d8c..a30f898dc1c5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -332,6 +332,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) { tasklet_hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); + kfree(x->aead); kfree(x->aalg); kfree(x->ealg); kfree(x->calg); From 4de349e786a3a2d51bd02d56f3de151bbc3c3df9 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 17 Aug 2016 12:41:08 -0300 Subject: [PATCH 382/538] can: flexcan: fix resume function On a imx6ul-pico board the following error is seen during system suspend: dpm_run_callback(): platform_pm_resume+0x0/0x54 returns -110 PM: Device 2090000.flexcan failed to resume: error -110 The reason for this suspend error is because when the CAN interface is not active the clocks are disabled and then flexcan_chip_enable() will always fail due to a timeout error. In order to fix this issue, only call flexcan_chip_enable/disable() when the CAN interface is active. Based on a patch from Dong Aisheng in the NXP kernel. 
Signed-off-by: Fabio Estevam Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 41c0fc9f3b14..16f7cadda5c3 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -1268,11 +1268,10 @@ static int __maybe_unused flexcan_suspend(struct device *device) struct flexcan_priv *priv = netdev_priv(dev); int err; - err = flexcan_chip_disable(priv); - if (err) - return err; - if (netif_running(dev)) { + err = flexcan_chip_disable(priv); + if (err) + return err; netif_stop_queue(dev); netif_device_detach(dev); } @@ -1285,13 +1284,17 @@ static int __maybe_unused flexcan_resume(struct device *device) { struct net_device *dev = dev_get_drvdata(device); struct flexcan_priv *priv = netdev_priv(dev); + int err; priv->can.state = CAN_STATE_ERROR_ACTIVE; if (netif_running(dev)) { netif_device_attach(dev); netif_start_queue(dev); + err = flexcan_chip_enable(priv); + if (err) + return err; } - return flexcan_chip_enable(priv); + return 0; } static SIMPLE_DEV_PM_OPS(flexcan_pm_ops, flexcan_suspend, flexcan_resume); From d8feef9bd447381952a33e6284241006f394c080 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sun, 18 Sep 2016 11:24:50 -0300 Subject: [PATCH 383/538] [media] cx23885/saa7134: assign q->dev to the PCI device Fix a regression caused by commit 2bc46b3ad3c1 ("[media] media/pci: convert drivers to use the new vb2_queue dev field"). Three places where q->dev should be set were missed, causing a WARN. Fixes: 2bc46b3ad3c1 ("[media] media/pci: convert drivers to use the new vb2_queue dev field"). Signed-off-by: Hans Verkuil Reported-by: Marton Balint Signed-off-by: Mauro Carvalho Chehab --- drivers/media/pci/cx23885/cx23885-417.c | 1 + drivers/media/pci/saa7134/saa7134-dvb.c | 1 + drivers/media/pci/saa7134/saa7134-empress.c | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/media/pci/cx23885/cx23885-417.c b/drivers/media/pci/cx23885/cx23885-417.c index efec2d1a7afd..4d080da7afaf 100644 --- a/drivers/media/pci/cx23885/cx23885-417.c +++ b/drivers/media/pci/cx23885/cx23885-417.c @@ -1552,6 +1552,7 @@ int cx23885_417_register(struct cx23885_dev *dev) q->mem_ops = &vb2_dma_sg_memops; q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; q->lock = &dev->lock; + q->dev = &dev->pci->dev; err = vb2_queue_init(q); if (err < 0) diff --git a/drivers/media/pci/saa7134/saa7134-dvb.c b/drivers/media/pci/saa7134/saa7134-dvb.c index db987e5b93eb..59a4b5f7724e 100644 --- a/drivers/media/pci/saa7134/saa7134-dvb.c +++ b/drivers/media/pci/saa7134/saa7134-dvb.c @@ -1238,6 +1238,7 @@ static int dvb_init(struct saa7134_dev *dev) q->buf_struct_size = sizeof(struct saa7134_buf); q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; q->lock = &dev->lock; + q->dev = &dev->pci->dev; ret = vb2_queue_init(q); if (ret) { vb2_dvb_dealloc_frontends(&dev->frontends); diff --git a/drivers/media/pci/saa7134/saa7134-empress.c b/drivers/media/pci/saa7134/saa7134-empress.c index ca417a454d67..791a5161809b 100644 --- a/drivers/media/pci/saa7134/saa7134-empress.c +++ b/drivers/media/pci/saa7134/saa7134-empress.c @@ -295,6 +295,7 @@ static int empress_init(struct saa7134_dev *dev) q->buf_struct_size = sizeof(struct saa7134_buf); q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; q->lock = &dev->lock; + q->dev = &dev->pci->dev; err = vb2_queue_init(q); if (err) return err; From 67326666e2d45ebea7db3ed8e3e735f15e60dd91 Mon Sep 17 00:00:00 2001 From: 
Josh Poimboeuf Date: Mon, 19 Sep 2016 10:52:14 -0500 Subject: [PATCH 384/538] scripts: add script for translating stack dump function offsets addr2line doesn't work with KASLR addresses. Add a basic addr2line wrapper script which takes the 'func+offset/size' format as input. Signed-off-by: Josh Poimboeuf Signed-off-by: Linus Torvalds --- scripts/faddr2line | 177 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100755 scripts/faddr2line diff --git a/scripts/faddr2line b/scripts/faddr2line new file mode 100755 index 000000000000..4fbfe8305fe3 --- /dev/null +++ b/scripts/faddr2line @@ -0,0 +1,177 @@ +#!/bin/bash +# +# Translate stack dump function offsets. +# +# addr2line doesn't work with KASLR addresses. This works similarly to +# addr2line, but instead takes the 'func+0x123' format as input: +# +# $ ./scripts/faddr2line ~/k/vmlinux meminfo_proc_show+0x5/0x568 +# meminfo_proc_show+0x5/0x568: +# meminfo_proc_show at fs/proc/meminfo.c:27 +# +# If the address is part of an inlined function, the full inline call chain is +# printed: +# +# $ ./scripts/faddr2line ~/k/vmlinux native_write_msr+0x6/0x27 +# native_write_msr+0x6/0x27: +# arch_static_branch at arch/x86/include/asm/msr.h:121 +# (inlined by) static_key_false at include/linux/jump_label.h:125 +# (inlined by) native_write_msr at arch/x86/include/asm/msr.h:125 +# +# The function size after the '/' in the input is optional, but recommended. +# It's used to help disambiguate any duplicate symbol names, which can occur +# rarely. If the size is omitted for a duplicate symbol then it's possible for +# multiple code sites to be printed: +# +# $ ./scripts/faddr2line ~/k/vmlinux raw_ioctl+0x5 +# raw_ioctl+0x5/0x20: +# raw_ioctl at drivers/char/raw.c:122 +# +# raw_ioctl+0x5/0xb1: +# raw_ioctl at net/ipv4/raw.c:876 +# +# Multiple addresses can be specified on a single command line: +# +# $ ./scripts/faddr2line ~/k/vmlinux type_show+0x10/45 free_reserved_area+0x90 +# type_show+0x10/0x2d: +# type_show at drivers/video/backlight/backlight.c:213 +# +# free_reserved_area+0x90/0x123: +# free_reserved_area at mm/page_alloc.c:6429 (discriminator 2) + + +set -o errexit +set -o nounset + +command -v awk >/dev/null 2>&1 || die "awk isn't installed" +command -v readelf >/dev/null 2>&1 || die "readelf isn't installed" +command -v addr2line >/dev/null 2>&1 || die "addr2line isn't installed" + +usage() { + echo "usage: faddr2line ..." >&2 + exit 1 +} + +warn() { + echo "$1" >&2 +} + +die() { + echo "ERROR: $1" >&2 + exit 1 +} + +# Try to figure out the source directory prefix so we can remove it from the +# addr2line output. HACK ALERT: This assumes that start_kernel() is in +# kernel/init.c! This only works for vmlinux. Otherwise it falls back to +# printing the absolute path. 
+find_dir_prefix() { + local objfile=$1 + + local start_kernel_addr=$(readelf -sW $objfile | awk '$8 == "start_kernel" {printf "0x%s", $2}') + [[ -z $start_kernel_addr ]] && return + + local file_line=$(addr2line -e $objfile $start_kernel_addr) + [[ -z $file_line ]] && return + + local prefix=${file_line%init/main.c:*} + if [[ -z $prefix ]] || [[ $prefix = $file_line ]]; then + return + fi + + DIR_PREFIX=$prefix + return 0 +} + +__faddr2line() { + local objfile=$1 + local func_addr=$2 + local dir_prefix=$3 + local print_warnings=$4 + + local func=${func_addr%+*} + local offset=${func_addr#*+} + offset=${offset%/*} + local size= + [[ $func_addr =~ "/" ]] && size=${func_addr#*/} + + if [[ -z $func ]] || [[ -z $offset ]] || [[ $func = $func_addr ]]; then + warn "bad func+offset $func_addr" + DONE=1 + return + fi + + # Go through each of the object's symbols which match the func name. + # In rare cases there might be duplicates. + while read symbol; do + local fields=($symbol) + local sym_base=0x${fields[1]} + local sym_size=${fields[2]} + local sym_type=${fields[3]} + + # calculate the address + local addr=$(($sym_base + $offset)) + if [[ -z $addr ]] || [[ $addr = 0 ]]; then + warn "bad address: $sym_base + $offset" + DONE=1 + return + fi + local hexaddr=0x$(printf %x $addr) + + # weed out non-function symbols + if [[ $sym_type != "FUNC" ]]; then + [[ $print_warnings = 1 ]] && + echo "skipping $func address at $hexaddr due to non-function symbol" + continue + fi + + # if the user provided a size, make sure it matches the symbol's size + if [[ -n $size ]] && [[ $size -ne $sym_size ]]; then + [[ $print_warnings = 1 ]] && + echo "skipping $func address at $hexaddr due to size mismatch ($size != $sym_size)" + continue; + fi + + # make sure the provided offset is within the symbol's range + if [[ $offset -gt $sym_size ]]; then + [[ $print_warnings = 1 ]] && + echo "skipping $func address at $hexaddr due to size mismatch ($offset > $sym_size)" + continue + fi + + # separate multiple entries with a blank line + [[ $FIRST = 0 ]] && echo + FIRST=0 + + local hexsize=0x$(printf %x $sym_size) + echo "$func+$offset/$hexsize:" + addr2line -fpie $objfile $hexaddr | sed "s;$dir_prefix;;" + DONE=1 + + done < <(readelf -sW $objfile | awk -v f=$func '$8 == f {print}') +} + +[[ $# -lt 2 ]] && usage + +objfile=$1 +[[ ! -f $objfile ]] && die "can't find objfile $objfile" +shift + +DIR_PREFIX=supercalifragilisticexpialidocious +find_dir_prefix $objfile + +FIRST=1 +while [[ $# -gt 0 ]]; do + func_addr=$1 + shift + + # print any matches found + DONE=0 + __faddr2line $objfile $func_addr $DIR_PREFIX 0 + + # if no match was found, print warnings + if [[ $DONE = 0 ]]; then + __faddr2line $objfile $func_addr $DIR_PREFIX 1 + warn "no match for $func_addr" + fi +done From c23a7266e6599e74305cc5b790f93398bb212380 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:37 +0200 Subject: [PATCH 385/538] arm64/FP/SIMD: Convert to hotplug state machine Install the callbacks via the state machine. 
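Patches 385-395 in this series all perform this same mechanical conversion, so the general before/after shape is worth sketching once. The foo_* names below are placeholders rather than code from any single patch; note also that several of the patches add a dedicated CPUHP_*_DEAD entry to enum cpuhp_state (in the PREPARE section) instead of using a dynamic AP state, so that the teardown callback really runs after the CPU is dead, as the diffs show.

static int foo_cpu_online(unsigned int cpu)
{
        /* per-CPU setup */
        return 0;
}

static int foo_cpu_dead(unsigned int cpu)
{
        /* per-CPU teardown */
        return 0;
}

/* Old style: one notifier, demultiplexed by hand on the action code. */
static int foo_cpu_callback(struct notifier_block *nfb, unsigned long action,
                            void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
                foo_cpu_online(cpu);
                break;
        case CPU_DEAD:
                foo_cpu_dead(cpu);
                break;
        }
        return NOTIFY_OK;
}

/* New style: explicit per-state callbacks; the state machine invokes
 * foo_cpu_online() as a CPU comes up through the state and foo_cpu_dead()
 * as it goes back down through it. */
static int __init foo_hotplug_init(void)
{
        int ret;

        ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "foo:online",
                                        foo_cpu_online, foo_cpu_dead);
        return ret < 0 ? ret : 0;
}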
Signed-off-by: Sebastian Andrzej Siewior Acked-by: Will Deacon Cc: Peter Zijlstra Cc: Catalin Marinas Cc: rt@linutronix.de Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20160906170457.32393-2-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/arm64/kernel/fpsimd.c | 22 +++++----------------- include/linux/cpuhotplug.h | 1 + 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 975b274ee7b5..394c61db5566 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -299,28 +299,16 @@ static inline void fpsimd_pm_init(void) { } #endif /* CONFIG_CPU_PM */ #ifdef CONFIG_HOTPLUG_CPU -static int fpsimd_cpu_hotplug_notifier(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int fpsimd_cpu_dead(unsigned int cpu) { - unsigned int cpu = (long)hcpu; - - switch (action) { - case CPU_DEAD: - case CPU_DEAD_FROZEN: - per_cpu(fpsimd_last_state, cpu) = NULL; - break; - } - return NOTIFY_OK; + per_cpu(fpsimd_last_state, cpu) = NULL; + return 0; } -static struct notifier_block fpsimd_cpu_hotplug_notifier_block = { - .notifier_call = fpsimd_cpu_hotplug_notifier, -}; - static inline void fpsimd_hotplug_init(void) { - register_cpu_notifier(&fpsimd_cpu_hotplug_notifier_block); + cpuhp_setup_state_nocalls(CPUHP_ARM64_FPSIMD_DEAD, "arm64/fpsimd:dead", + NULL, fpsimd_cpu_dead); } #else diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index afd59e2ca4b3..0da071ff36d2 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -20,6 +20,7 @@ enum cpuhp_state { CPUHP_SOFTIRQ_DEAD, CPUHP_NET_MVNETA_DEAD, CPUHP_CPUIDLE_DEAD, + CPUHP_ARM64_FPSIMD_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, From 657ebf7a2354f39cc7d3f4e64ee49dbf1c3cae4f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:38 +0200 Subject: [PATCH 386/538] ARM/shmobile: Convert to hotplug state machine Install the callbacks via the state machine so the old notifier based cpuhotplug infrastructure can be removed. 
Signed-off-by: Sebastian Andrzej Siewior Cc: linux-sh@vger.kernel.org Cc: Peter Zijlstra Cc: Magnus Damm Cc: Simon Horman Cc: rt@linutronix.de Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20160906170457.32393-3-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/arm/mach-shmobile/platsmp-scu.c | 26 ++++++++------------------ include/linux/cpuhotplug.h | 1 + 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/arch/arm/mach-shmobile/platsmp-scu.c b/arch/arm/mach-shmobile/platsmp-scu.c index 8d478f1da265..d1ecaf37d142 100644 --- a/arch/arm/mach-shmobile/platsmp-scu.c +++ b/arch/arm/mach-shmobile/platsmp-scu.c @@ -21,26 +21,14 @@ static phys_addr_t shmobile_scu_base_phys; static void __iomem *shmobile_scu_base; -static int shmobile_smp_scu_notifier_call(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int shmobile_scu_cpu_prepare(unsigned int cpu) { - unsigned int cpu = (long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - /* For this particular CPU register SCU SMP boot vector */ - shmobile_smp_hook(cpu, virt_to_phys(shmobile_boot_scu), - shmobile_scu_base_phys); - break; - }; - - return NOTIFY_OK; + /* For this particular CPU register SCU SMP boot vector */ + shmobile_smp_hook(cpu, virt_to_phys(shmobile_boot_scu), + shmobile_scu_base_phys); + return 0; } -static struct notifier_block shmobile_smp_scu_notifier = { - .notifier_call = shmobile_smp_scu_notifier_call, -}; - void __init shmobile_smp_scu_prepare_cpus(phys_addr_t scu_base_phys, unsigned int max_cpus) { @@ -54,7 +42,9 @@ void __init shmobile_smp_scu_prepare_cpus(phys_addr_t scu_base_phys, scu_power_mode(shmobile_scu_base, SCU_PM_NORMAL); /* Use CPU notifier for reset vector control */ - register_cpu_notifier(&shmobile_smp_scu_notifier); + cpuhp_setup_state_nocalls(CPUHP_ARM_SHMOBILE_SCU_PREPARE, + "arm/shmobile-scu:prepare", + shmobile_scu_cpu_prepare, NULL); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0da071ff36d2..008eed0c0787 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -35,6 +35,7 @@ enum cpuhp_state { CPUHP_POWERPC_PMAC_PREPARE, CPUHP_POWERPC_MMU_CTX_PREPARE, CPUHP_NOTIFY_PREPARE, + CPUHP_ARM_SHMOBILE_SCU_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, From a4fa9cc220fef29162d38a0ada71f5569a116087 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:39 +0200 Subject: [PATCH 387/538] ARM/OMAP/wakeupgen: Convert to hotplug state machine Install the callbacks via the state machine. 
Signed-off-by: Sebastian Andrzej Siewior Acked-by: Tony Lindgren Cc: Peter Zijlstra Cc: rt@linutronix.de Cc: linux-omap@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20160906170457.32393-4-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/arm/mach-omap2/omap-wakeupgen.c | 35 +++++++++++----------------- include/linux/cpuhotplug.h | 1 + 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/arch/arm/mach-omap2/omap-wakeupgen.c b/arch/arm/mach-omap2/omap-wakeupgen.c index 0c4754386532..369f95a703ac 100644 --- a/arch/arm/mach-omap2/omap-wakeupgen.c +++ b/arch/arm/mach-omap2/omap-wakeupgen.c @@ -322,34 +322,25 @@ static void irq_save_secure_context(void) #endif #ifdef CONFIG_HOTPLUG_CPU -static int irq_cpu_hotplug_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int omap_wakeupgen_cpu_online(unsigned int cpu) { - unsigned int cpu = (unsigned int)hcpu; - - /* - * Corresponding FROZEN transitions do not have to be handled, - * they are handled by at a higher level - * (drivers/cpuidle/coupled.c). - */ - switch (action) { - case CPU_ONLINE: - wakeupgen_irqmask_all(cpu, 0); - break; - case CPU_DEAD: - wakeupgen_irqmask_all(cpu, 1); - break; - } - return NOTIFY_OK; + wakeupgen_irqmask_all(cpu, 0); + return 0; } -static struct notifier_block irq_hotplug_notifier = { - .notifier_call = irq_cpu_hotplug_notify, -}; +static int omap_wakeupgen_cpu_dead(unsigned int cpu) +{ + wakeupgen_irqmask_all(cpu, 1); + return 0; +} static void __init irq_hotplug_init(void) { - register_hotcpu_notifier(&irq_hotplug_notifier); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "arm/omap-wake:online", + omap_wakeupgen_cpu_online, NULL); + cpuhp_setup_state_nocalls(CPUHP_ARM_OMAP_WAKE_DEAD, + "arm/omap-wake:dead", NULL, + omap_wakeupgen_cpu_dead); } #else static void __init irq_hotplug_init(void) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 008eed0c0787..35859aafbbfc 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -21,6 +21,7 @@ enum cpuhp_state { CPUHP_NET_MVNETA_DEAD, CPUHP_CPUIDLE_DEAD, CPUHP_ARM64_FPSIMD_DEAD, + CPUHP_ARM_OMAP_WAKE_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, From 6b8d642239e866debbaa37f25e53837a5d141f33 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:40 +0200 Subject: [PATCH 388/538] ia64/mca: Convert to hotplug state machine Install the callbacks via the state machine and let the core invoke the callbacks on the already online CPUs. 
Signed-off-by: Sebastian Andrzej Siewior Cc: Fenghua Yu Cc: Tony Luck Cc: linux-ia64@vger.kernel.org Cc: Peter Zijlstra Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-5-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/ia64/kernel/mca.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index eb9220cde76c..c2858865b146 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1890,7 +1890,7 @@ ia64_mca_cpu_init(void *cpu_data) PAGE_KERNEL))); } -static void ia64_mca_cmc_vector_adjust(void *dummy) +static int ia64_mca_cpu_online(unsigned int cpu) { unsigned long flags; @@ -1898,25 +1898,9 @@ static void ia64_mca_cmc_vector_adjust(void *dummy) if (!cmc_polling_enabled) ia64_mca_cmc_vector_enable(NULL); local_irq_restore(flags); + return 0; } -static int mca_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - ia64_mca_cmc_vector_adjust(NULL); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block mca_cpu_notifier = { - .notifier_call = mca_cpu_callback -}; - /* * ia64_mca_init * @@ -2111,15 +2095,13 @@ ia64_mca_late_init(void) if (!mca_init) return 0; - register_hotcpu_notifier(&mca_cpu_notifier); - /* Setup the CMCI/P vector and handler */ setup_timer(&cmc_poll_timer, ia64_mca_cmc_poll, 0UL); /* Unmask/enable the vector */ cmc_polling_enabled = 0; - schedule_work(&cmc_enable_work); - + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/mca:online", + ia64_mca_cpu_online, NULL); IA64_MCA_DEBUG("%s: CMCI/P setup and enabled.\n", __func__); #ifdef CONFIG_ACPI From 515332336be71d014bca1d29369c5d72baa38f71 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:41 +0200 Subject: [PATCH 389/538] sh/SH-X3 SMP: Convert to hotplug state machine Install the callbacks via the state machine. 
Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: linux-sh@vger.kernel.org Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-6-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/sh/kernel/cpu/sh4a/smp-shx3.c | 26 +++++--------------------- include/linux/cpuhotplug.h | 1 + 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/arch/sh/kernel/cpu/sh4a/smp-shx3.c b/arch/sh/kernel/cpu/sh4a/smp-shx3.c index 839612c8a0a0..0d3637c494bf 100644 --- a/arch/sh/kernel/cpu/sh4a/smp-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/smp-shx3.c @@ -122,32 +122,16 @@ static void shx3_update_boot_vector(unsigned int cpu) __raw_writel(STBCR_RESET, STBCR_REG(cpu)); } -static int -shx3_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +static int shx3_cpu_prepare(unsigned int cpu) { - unsigned int cpu = (unsigned int)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - shx3_update_boot_vector(cpu); - break; - case CPU_ONLINE: - pr_info("CPU %u is now online\n", cpu); - break; - case CPU_DEAD: - break; - } - - return NOTIFY_OK; + shx3_update_boot_vector(cpu); + return 0; } -static struct notifier_block shx3_cpu_notifier = { - .notifier_call = shx3_cpu_callback, -}; - static int register_shx3_cpu_notifier(void) { - register_hotcpu_notifier(&shx3_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_SH_SH3X_PREPARE, "sh/shx3:prepare", + shx3_cpu_prepare, NULL); return 0; } late_initcall(register_shx3_cpu_notifier); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 35859aafbbfc..8dec2a236af3 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -37,6 +37,7 @@ enum cpuhp_state { CPUHP_POWERPC_MMU_CTX_PREPARE, CPUHP_NOTIFY_PREPARE, CPUHP_ARM_SHMOBILE_SCU_PREPARE, + CPUHP_SH_SH3X_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, From 29bd7fbc071598e939526f782293dbe137be3768 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 7 Sep 2016 18:45:23 +0200 Subject: [PATCH 390/538] x86/microcode: Convert to hotplug state machine Install the callbacks via the state machine. CPU_UP_CANCELED_FROZEN() is not preserved: It is only there to free memory in an error case because it is assumed if the CPU does show up on resume it won't be seen ever again. As per Borislav: |IOW, you don't need mc_cpu_dead(). Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Borislav Petkov Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160907164523.46a2xnffha4bv63g@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/core.c | 52 +++++++++------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index df04b2d033f6..5ce5155f0695 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -558,55 +558,36 @@ static struct syscore_ops mc_syscore_ops = { .resume = mc_bp_resume, }; -static int -mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +static int mc_cpu_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; struct device *dev; dev = get_cpu_device(cpu); + microcode_update_cpu(cpu); + pr_debug("CPU%d added\n", cpu); - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - microcode_update_cpu(cpu); - pr_debug("CPU%d added\n", cpu); - /* - * "break" is missing on purpose here because we want to fall - * through in order to create the sysfs group. 
- */ - - case CPU_DOWN_FAILED: - if (sysfs_create_group(&dev->kobj, &mc_attr_group)) - pr_err("Failed to create group for CPU%d\n", cpu); - break; + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) + pr_err("Failed to create group for CPU%d\n", cpu); + return 0; +} - case CPU_DOWN_PREPARE: - /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&dev->kobj, &mc_attr_group); - pr_debug("CPU%d removed\n", cpu); - break; +static int mc_cpu_down_prep(unsigned int cpu) +{ + struct device *dev; + dev = get_cpu_device(cpu); + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&dev->kobj, &mc_attr_group); + pr_debug("CPU%d removed\n", cpu); /* - * case CPU_DEAD: - * * When a CPU goes offline, don't free up or invalidate the copy of * the microcode in kernel memory, so that we can reuse it when the * CPU comes back online without unnecessarily requesting the userspace * for it again. */ - } - - /* The CPU refused to come up during a system resume */ - if (action == CPU_UP_CANCELED_FROZEN) - microcode_fini_cpu(cpu); - - return NOTIFY_OK; + return 0; } -static struct notifier_block mc_cpu_notifier = { - .notifier_call = mc_cpu_callback, -}; - static struct attribute *cpu_root_microcode_attrs[] = { &dev_attr_reload.attr, NULL @@ -665,7 +646,8 @@ int __init microcode_init(void) goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); - register_hotcpu_notifier(&mc_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", + mc_cpu_online, mc_cpu_down_prep); pr_info("Microcode Update Driver: v" MICROCODE_VERSION " , Peter Oruba\n"); From 75e12ed65312a56401f3b286ac7e12994301371c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:43 +0200 Subject: [PATCH 391/538] lib/irq_poll: Convert to hotplug state machine Install the callbacks via the state machine. 
Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Jens Axboe Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-8-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + lib/irq_poll.c | 26 +++++++++----------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8dec2a236af3..2ca7b34871e0 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -22,6 +22,7 @@ enum cpuhp_state { CPUHP_CPUIDLE_DEAD, CPUHP_ARM64_FPSIMD_DEAD, CPUHP_ARM_OMAP_WAKE_DEAD, + CPUHP_IRQ_POLL_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 836f7db4e548..2be55692aa43 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -184,30 +184,21 @@ void irq_poll_init(struct irq_poll *iop, int weight, irq_poll_fn *poll_fn) } EXPORT_SYMBOL(irq_poll_init); -static int irq_poll_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int irq_poll_cpu_dead(unsigned int cpu) { /* * If a CPU goes away, splice its entries to the current CPU * and trigger a run of the softirq */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), - this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); - local_irq_enable(); - } + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), + this_cpu_ptr(&blk_cpu_iopoll)); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + local_irq_enable(); - return NOTIFY_OK; + return 0; } -static struct notifier_block irq_poll_cpu_notifier = { - .notifier_call = irq_poll_cpu_notify, -}; - static __init int irq_poll_setup(void) { int i; @@ -216,7 +207,8 @@ static __init int irq_poll_setup(void) INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq); - register_hotcpu_notifier(&irq_poll_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_IRQ_POLL_DEAD, "irq_poll:dead", NULL, + irq_poll_cpu_dead); return 0; } subsys_initcall(irq_poll_setup); From 9a659f43dfea27cca423d4e80809be447f4c9ce7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:44 +0200 Subject: [PATCH 392/538] block/softirq: Convert to hotplug state machine Install the callbacks via the state machine. 
Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Jens Axboe Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-9-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- block/blk-softirq.c | 27 ++++++++++----------------- include/linux/cpuhotplug.h | 1 + 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 53b1737e978d..96631e6a22b9 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -78,30 +78,21 @@ static int raise_blk_irq(int cpu, struct request *rq) } #endif -static int blk_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int blk_softirq_cpu_dead(unsigned int cpu) { /* * If a CPU goes away, splice its entries to the current CPU * and trigger a run of the softirq */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - this_cpu_ptr(&blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - } + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_done, cpu), + this_cpu_ptr(&blk_cpu_done)); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_enable(); - return NOTIFY_OK; + return 0; } -static struct notifier_block blk_cpu_notifier = { - .notifier_call = blk_cpu_notify, -}; - void __blk_complete_request(struct request *req) { int ccpu, cpu; @@ -180,7 +171,9 @@ static __init int blk_softirq_init(void) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); - register_hotcpu_notifier(&blk_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, + "block/softirq:dead", NULL, + blk_softirq_cpu_dead); return 0; } subsys_initcall(blk_softirq_init); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2ca7b34871e0..d4274d51fe27 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -23,6 +23,7 @@ enum cpuhp_state { CPUHP_ARM64_FPSIMD_DEAD, CPUHP_ARM_OMAP_WAKE_DEAD, CPUHP_IRQ_POLL_DEAD, + CPUHP_BLOCK_SOFTIRQ_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, From a4e0591ece7d88634a802c4076db8c0debbde805 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:45 +0200 Subject: [PATCH 393/538] oprofile/timer: Convert to hotplug state machine Install the callbacks via the state machine and let the core invoke the callbacks on the already online CPUs. Since the online target runs always on the target CPU we can drop smp_call_function_single(). The functions is invoked with interrupts off to keep the old calling convention. 
If the maintainer things that this function can be called with interrupts enabled then it can be removed :) Signed-off-by: Sebastian Andrzej Siewior Cc: Robert Richter Cc: Peter Zijlstra Cc: oprofile-list@lists.sf.net Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-10-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/oprofile/timer_int.c | 44 +++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/oprofile/timer_int.c b/drivers/oprofile/timer_int.c index bdef916e5dda..2498a6cd7c24 100644 --- a/drivers/oprofile/timer_int.c +++ b/drivers/oprofile/timer_int.c @@ -74,37 +74,39 @@ static void oprofile_hrtimer_stop(void) put_online_cpus(); } -static int oprofile_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int oprofile_timer_online(unsigned int cpu) { - long cpu = (long) hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - smp_call_function_single(cpu, __oprofile_hrtimer_start, - NULL, 1); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - __oprofile_hrtimer_stop(cpu); - break; - } - return NOTIFY_OK; + local_irq_disable(); + __oprofile_hrtimer_start(NULL); + local_irq_enable(); + return 0; } -static struct notifier_block __refdata oprofile_cpu_notifier = { - .notifier_call = oprofile_cpu_notify, -}; +static int oprofile_timer_prep_down(unsigned int cpu) +{ + __oprofile_hrtimer_stop(cpu); + return 0; +} + +static enum cpuhp_state hp_online; static int oprofile_hrtimer_setup(void) { - return register_hotcpu_notifier(&oprofile_cpu_notifier); + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "oprofile/timer:online", + oprofile_timer_online, + oprofile_timer_prep_down); + if (ret < 0) + return ret; + hp_online = ret; + return 0; } static void oprofile_hrtimer_shutdown(void) { - unregister_hotcpu_notifier(&oprofile_cpu_notifier); + cpuhp_remove_state_nocalls(hp_online); } int oprofile_timer_init(struct oprofile_operations *ops) From 8904f5a5afc4dd74e8fe2ab3eeb98018ef02f3e6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:46 +0200 Subject: [PATCH 394/538] virtio scsi: Convert to hotplug state machine Install the callbacks via the state machine. It uses the multi instance infrastructure of the hotplug code to handle each interface. virtscsi_set_affinity() is removed from virtscsi_init() because virtscsi_cpu_notif_add() (the function which registers the instance) is invoked right after it and the cpuhp_state_add_instance() functions invokes the startup callback on all online CPUs. The same thing can not be applied virtscsi_cpu_notif_remove() because virtscsi_remove_vqs() invokes virtscsi_set_affinity() with affinity = false as argument but the old CPU_DEAD state invoked the function with affinity = true (which does not match the DEAD callback). Signed-off-by: Sebastian Andrzej Siewior Cc: "James E.J. Bottomley" Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: "Michael S. 
Tsirkin" Cc: Peter Zijlstra Cc: virtualization@lists.linux-foundation.org Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-11-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/scsi/virtio_scsi.c | 76 ++++++++++++++++++++++++-------------- include/linux/cpuhotplug.h | 1 + 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 7dbbb29d24c6..deefab3a94d0 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -107,8 +107,8 @@ struct virtio_scsi { /* If the affinity hint is set for virtqueues */ bool affinity_hint_set; - /* CPU hotplug notifier */ - struct notifier_block nb; + struct hlist_node node; + struct hlist_node node_dead; /* Protected by event_vq lock */ bool stop_events; @@ -118,6 +118,7 @@ struct virtio_scsi { struct virtio_scsi_vq req_vqs[]; }; +static enum cpuhp_state virtioscsi_online; static struct kmem_cache *virtscsi_cmd_cache; static mempool_t *virtscsi_cmd_pool; @@ -852,21 +853,33 @@ static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity) put_online_cpus(); } -static int virtscsi_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int virtscsi_cpu_online(unsigned int cpu, struct hlist_node *node) { - struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb); - switch(action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - __virtscsi_set_affinity(vscsi, true); - break; - default: - break; - } - return NOTIFY_OK; + struct virtio_scsi *vscsi = hlist_entry_safe(node, struct virtio_scsi, + node); + __virtscsi_set_affinity(vscsi, true); + return 0; +} + +static int virtscsi_cpu_notif_add(struct virtio_scsi *vi) +{ + int ret; + + ret = cpuhp_state_add_instance(virtioscsi_online, &vi->node); + if (ret) + return ret; + + ret = cpuhp_state_add_instance(CPUHP_VIRT_SCSI_DEAD, &vi->node_dead); + if (ret) + cpuhp_state_remove_instance(virtioscsi_online, &vi->node); + return ret; +} + +static void virtscsi_cpu_notif_remove(struct virtio_scsi *vi) +{ + cpuhp_state_remove_instance_nocalls(virtioscsi_online, &vi->node); + cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_SCSI_DEAD, + &vi->node_dead); } static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, @@ -929,8 +942,6 @@ static int virtscsi_init(struct virtio_device *vdev, virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE], vqs[i]); - virtscsi_set_affinity(vscsi, true); - virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); @@ -987,12 +998,9 @@ static int virtscsi_probe(struct virtio_device *vdev) if (err) goto virtscsi_init_failed; - vscsi->nb.notifier_call = &virtscsi_cpu_callback; - err = register_hotcpu_notifier(&vscsi->nb); - if (err) { - pr_err("registering cpu notifier failed\n"); + err = virtscsi_cpu_notif_add(vscsi); + if (err) goto scsi_add_host_failed; - } cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); @@ -1049,7 +1057,7 @@ static void virtscsi_remove(struct virtio_device *vdev) scsi_remove_host(shost); - unregister_hotcpu_notifier(&vscsi->nb); + virtscsi_cpu_notif_remove(vscsi); virtscsi_remove_vqs(vdev); scsi_host_put(shost); @@ -1061,7 +1069,7 @@ static int virtscsi_freeze(struct virtio_device *vdev) struct Scsi_Host *sh = virtio_scsi_host(vdev); struct virtio_scsi *vscsi = shost_priv(sh); - unregister_hotcpu_notifier(&vscsi->nb); + 
virtscsi_cpu_notif_remove(vscsi); virtscsi_remove_vqs(vdev); return 0; } @@ -1076,12 +1084,11 @@ static int virtscsi_restore(struct virtio_device *vdev) if (err) return err; - err = register_hotcpu_notifier(&vscsi->nb); + err = virtscsi_cpu_notif_add(vscsi); if (err) { vdev->config->del_vqs(vdev); return err; } - virtio_device_ready(vdev); if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) @@ -1136,6 +1143,16 @@ static int __init init(void) pr_err("mempool_create() for virtscsi_cmd_pool failed\n"); goto error; } + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "scsi/virtio:online", + virtscsi_cpu_online, NULL); + if (ret < 0) + goto error; + virtioscsi_online = ret; + ret = cpuhp_setup_state_multi(CPUHP_VIRT_SCSI_DEAD, "scsi/virtio:dead", + NULL, virtscsi_cpu_online); + if (ret) + goto error; ret = register_virtio_driver(&virtio_scsi_driver); if (ret < 0) goto error; @@ -1151,12 +1168,17 @@ static int __init init(void) kmem_cache_destroy(virtscsi_cmd_cache); virtscsi_cmd_cache = NULL; } + if (virtioscsi_online) + cpuhp_remove_multi_state(virtioscsi_online); + cpuhp_remove_multi_state(CPUHP_VIRT_SCSI_DEAD); return ret; } static void __exit fini(void) { unregister_virtio_driver(&virtio_scsi_driver); + cpuhp_remove_multi_state(virtioscsi_online); + cpuhp_remove_multi_state(CPUHP_VIRT_SCSI_DEAD); mempool_destroy(virtscsi_cmd_pool); kmem_cache_destroy(virtscsi_cmd_cache); } diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d4274d51fe27..e7146ee88ea4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -24,6 +24,7 @@ enum cpuhp_state { CPUHP_ARM_OMAP_WAKE_DEAD, CPUHP_IRQ_POLL_DEAD, CPUHP_BLOCK_SOFTIRQ_DEAD, + CPUHP_VIRT_SCSI_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, From 64f3bf2f85c5690228200d6b94eb6847049af70d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:47 +0200 Subject: [PATCH 395/538] ACPI/processor: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Sebastian Andrzej Siewior Acked-by: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: linux-acpi@vger.kernel.org Cc: rt@linutronix.de Cc: Len Brown Link: http://lkml.kernel.org/r/20160906170457.32393-12-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/acpi/processor_driver.c | 91 +++++++++++++++-------------- drivers/acpi/processor_throttling.c | 4 +- include/acpi/processor.h | 4 +- include/linux/cpuhotplug.h | 1 + 4 files changed, 52 insertions(+), 48 deletions(-) diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 0553aeebb228..13e5ac415abf 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -110,55 +110,46 @@ static void acpi_processor_notify(acpi_handle handle, u32 event, void *data) static int __acpi_processor_start(struct acpi_device *device); -static int acpi_cpu_soft_notify(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int acpi_soft_cpu_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; struct acpi_processor *pr = per_cpu(processors, cpu); struct acpi_device *device; - action &= ~CPU_TASKS_FROZEN; - - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - break; - default: - return NOTIFY_DONE; - } if (!pr || acpi_bus_get_device(pr->handle, &device)) - return NOTIFY_DONE; - - if (action == CPU_ONLINE) { - /* - * CPU got physically hotplugged and onlined for the first time: - * Initialize missing things. 
- */ - if (pr->flags.need_hotplug_init) { - int ret; - - pr_info("Will online and init hotplugged CPU: %d\n", - pr->id); - pr->flags.need_hotplug_init = 0; - ret = __acpi_processor_start(device); - WARN(ret, "Failed to start CPU: %d\n", pr->id); - } else { - /* Normal CPU soft online event. */ - acpi_processor_ppc_has_changed(pr, 0); - acpi_processor_hotplug(pr); - acpi_processor_reevaluate_tstate(pr, action); - acpi_processor_tstate_has_changed(pr); - } - } else if (action == CPU_DEAD) { - /* Invalidate flag.throttling after the CPU is offline. */ - acpi_processor_reevaluate_tstate(pr, action); + return 0; + /* + * CPU got physically hotplugged and onlined for the first time: + * Initialize missing things. + */ + if (pr->flags.need_hotplug_init) { + int ret; + + pr_info("Will online and init hotplugged CPU: %d\n", + pr->id); + pr->flags.need_hotplug_init = 0; + ret = __acpi_processor_start(device); + WARN(ret, "Failed to start CPU: %d\n", pr->id); + } else { + /* Normal CPU soft online event. */ + acpi_processor_ppc_has_changed(pr, 0); + acpi_processor_hotplug(pr); + acpi_processor_reevaluate_tstate(pr, false); + acpi_processor_tstate_has_changed(pr); } - return NOTIFY_OK; + return 0; } -static struct notifier_block acpi_cpu_notifier = { - .notifier_call = acpi_cpu_soft_notify, -}; +static int acpi_soft_cpu_dead(unsigned int cpu) +{ + struct acpi_processor *pr = per_cpu(processors, cpu); + struct acpi_device *device; + + if (!pr || acpi_bus_get_device(pr->handle, &device)) + return 0; + + acpi_processor_reevaluate_tstate(pr, true); + return 0; +} #ifdef CONFIG_ACPI_CPU_FREQ_PSS static int acpi_pss_perf_init(struct acpi_processor *pr, @@ -303,7 +294,7 @@ static int acpi_processor_stop(struct device *dev) * This is needed for the powernow-k8 driver, that works even without * ACPI, but needs symbols from this driver */ - +static enum cpuhp_state hp_online; static int __init acpi_processor_driver_init(void) { int result = 0; @@ -315,11 +306,22 @@ static int __init acpi_processor_driver_init(void) if (result < 0) return result; - register_hotcpu_notifier(&acpi_cpu_notifier); + result = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "acpi/cpu-drv:online", + acpi_soft_cpu_online, NULL); + if (result < 0) + goto err; + hp_online = result; + cpuhp_setup_state_nocalls(CPUHP_ACPI_CPUDRV_DEAD, "acpi/cpu-drv:dead", + NULL, acpi_soft_cpu_dead); + acpi_thermal_cpufreq_init(); acpi_processor_ppc_init(); acpi_processor_throttling_init(); return 0; +err: + driver_unregister(&acpi_processor_driver); + return result; } static void __exit acpi_processor_driver_exit(void) @@ -329,7 +331,8 @@ static void __exit acpi_processor_driver_exit(void) acpi_processor_ppc_exit(); acpi_thermal_cpufreq_exit(); - unregister_hotcpu_notifier(&acpi_cpu_notifier); + cpuhp_remove_state_nocalls(hp_online); + cpuhp_remove_state_nocalls(CPUHP_ACPI_CPUDRV_DEAD); driver_unregister(&acpi_processor_driver); } diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index c72e64893d03..d51ca1c05619 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -375,11 +375,11 @@ int acpi_processor_tstate_has_changed(struct acpi_processor *pr) * 3. TSD domain */ void acpi_processor_reevaluate_tstate(struct acpi_processor *pr, - unsigned long action) + bool is_dead) { int result = 0; - if (action == CPU_DEAD) { + if (is_dead) { /* When one CPU is offline, the T-state throttling * will be invalidated. 
*/ diff --git a/include/acpi/processor.h b/include/acpi/processor.h index bfe6b2e10f3a..f3db11c24654 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -359,7 +359,7 @@ extern int acpi_processor_set_throttling(struct acpi_processor *pr, * onlined/offlined. In such case the flags.throttling will be updated. */ extern void acpi_processor_reevaluate_tstate(struct acpi_processor *pr, - unsigned long action); + bool is_dead); extern const struct file_operations acpi_processor_throttling_fops; extern void acpi_processor_throttling_init(void); #else @@ -380,7 +380,7 @@ static inline int acpi_processor_set_throttling(struct acpi_processor *pr, } static inline void acpi_processor_reevaluate_tstate(struct acpi_processor *pr, - unsigned long action) {} + bool is_dead) {} static inline void acpi_processor_throttling_init(void) {} #endif /* CONFIG_ACPI_CPU_FREQ_PSS */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e7146ee88ea4..7706987c7827 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -25,6 +25,7 @@ enum cpuhp_state { CPUHP_IRQ_POLL_DEAD, CPUHP_BLOCK_SOFTIRQ_DEAD, CPUHP_VIRT_SCSI_DEAD, + CPUHP_ACPI_CPUDRV_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, From 27622b061eb4bb4d16b5d61219ac10a792010321 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:48 +0200 Subject: [PATCH 396/538] cpufreq: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Sebastian Andrzej Siewior Acked-by: "Rafael J. Wysocki" Cc: linux-pm@vger.kernel.org Cc: Peter Zijlstra Cc: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 3dd4884c6f9e..e0bc632a259e 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1358,7 +1358,7 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif) return add_cpu_dev_symlink(policy, cpu); } -static void cpufreq_offline(unsigned int cpu) +static int cpufreq_offline(unsigned int cpu) { struct cpufreq_policy *policy; int ret; @@ -1368,7 +1368,7 @@ static void cpufreq_offline(unsigned int cpu) policy = cpufreq_cpu_get_raw(cpu); if (!policy) { pr_debug("%s: No cpu_data found\n", __func__); - return; + return 0; } down_write(&policy->rwsem); @@ -1417,6 +1417,7 @@ static void cpufreq_offline(unsigned int cpu) unlock: up_write(&policy->rwsem); + return 0; } /** @@ -2332,28 +2333,6 @@ int cpufreq_update_policy(unsigned int cpu) } EXPORT_SYMBOL(cpufreq_update_policy); -static int cpufreq_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - cpufreq_online(cpu); - break; - - case CPU_DOWN_PREPARE: - cpufreq_offline(cpu); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __refdata cpufreq_cpu_notifier = { - .notifier_call = cpufreq_cpu_callback, -}; - /********************************************************************* * BOOST * *********************************************************************/ @@ -2455,6 +2434,7 @@ EXPORT_SYMBOL_GPL(cpufreq_boost_enabled); /********************************************************************* * REGISTER / UNREGISTER CPUFREQ DRIVER * *********************************************************************/ +static enum 
cpuhp_state hp_online; /** * cpufreq_register_driver - register a CPU Frequency driver @@ -2517,7 +2497,13 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) goto err_if_unreg; } - register_hotcpu_notifier(&cpufreq_cpu_notifier); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "cpufreq:online", + cpufreq_online, + cpufreq_offline); + if (ret < 0) + goto err_if_unreg; + hp_online = ret; + pr_debug("driver %s up and running\n", driver_data->name); goto out; @@ -2556,7 +2542,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) get_online_cpus(); subsys_interface_unregister(&cpufreq_interface); remove_boost_sysfs_file(); - unregister_hotcpu_notifier(&cpufreq_cpu_notifier); + cpuhp_remove_state_nocalls(hp_online); write_lock_irqsave(&cpufreq_driver_lock, flags); From 30e92153b4e6f1cd01e30c34d9ef6f0986f96b0e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:49 +0200 Subject: [PATCH 397/538] padata: Convert to hotplug state machine Install the callbacks via the state machine. CPU-hotplug multinstance support is used with the nocalls() version. Maybe parts of padata_alloc() could be moved into the online callback so that we could invoke ->startup callback for instance and drop get_online_cpus(). Signed-off-by: Sebastian Andrzej Siewior Cc: Steffen Klassert Cc: Peter Zijlstra Cc: linux-crypto@vger.kernel.org Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-14-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/padata.h | 2 +- kernel/padata.c | 88 ++++++++++++++++++++++++------------------ 2 files changed, 51 insertions(+), 39 deletions(-) diff --git a/include/linux/padata.h b/include/linux/padata.h index 113ee626a4dc..0f9e567d5e15 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -151,7 +151,7 @@ struct parallel_data { * @flags: padata flags. 
*/ struct padata_instance { - struct notifier_block cpu_notifier; + struct hlist_node node; struct workqueue_struct *wq; struct parallel_data *pd; struct padata_cpumask cpumask; diff --git a/kernel/padata.c b/kernel/padata.c index 993278895ccc..7848f0566403 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -30,6 +30,7 @@ #include #include #include +#include #define MAX_OBJ_NUM 1000 @@ -769,52 +770,43 @@ static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); } - -static int padata_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) { - int err; struct padata_instance *pinst; - int cpu = (unsigned long)hcpu; + int ret; - pinst = container_of(nfb, struct padata_instance, cpu_notifier); + pinst = hlist_entry_safe(node, struct padata_instance, node); + if (!pinst_has_cpu(pinst, cpu)) + return 0; - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - err = __padata_add_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - if (err) - return notifier_from_errno(err); - break; + mutex_lock(&pinst->lock); + ret = __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + return ret; +} - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - err = __padata_remove_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - if (err) - return notifier_from_errno(err); - break; - } +static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node) +{ + struct padata_instance *pinst; + int ret; + + pinst = hlist_entry_safe(node, struct padata_instance, node); + if (!pinst_has_cpu(pinst, cpu)) + return 0; - return NOTIFY_OK; + mutex_lock(&pinst->lock); + ret = __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + return ret; } + +static enum cpuhp_state hp_online; #endif static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU - unregister_hotcpu_notifier(&pinst->cpu_notifier); + cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); #endif padata_stop(pinst); @@ -1012,11 +1004,8 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - pinst->cpu_notifier.notifier_call = padata_cpu_callback; - pinst->cpu_notifier.priority = 0; - register_hotcpu_notifier(&pinst->cpu_notifier); + cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); #endif - return pinst; err_free_masks: @@ -1039,3 +1028,26 @@ void padata_free(struct padata_instance *pinst) kobject_put(&pinst->kobj); } EXPORT_SYMBOL(padata_free); + +#ifdef CONFIG_HOTPLUG_CPU + +static __init int padata_driver_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", + padata_cpu_online, + padata_cpu_prep_down); + if (ret < 0) + return ret; + hp_online = ret; + return 0; +} +module_init(padata_driver_init); + +static __exit void padata_driver_exit(void) +{ + cpuhp_remove_multi_state(hp_online); +} +module_exit(padata_driver_exit); +#endif From 8c58898b3ecb213ad7c52aa0c7c9d3201e559be1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:50 +0200 Subject: [PATCH 398/538] fault-injection/cpu: Convert to hotplug state machine Install the callbacks via the 
state machine. This is just a temporary vehicle to keep the interface working for now, It'll be replaced by the sysfs interface which allows to step through the hotplug state machine step by step. Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Akinobu Mita Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-15-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + lib/cpu-notifier-error-inject.c | 46 ++++++++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 7706987c7827..bb6231d13d3a 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -43,6 +43,7 @@ enum cpuhp_state { CPUHP_ARM_SHMOBILE_SCU_PREPARE, CPUHP_SH_SH3X_PREPARE, CPUHP_TIMERS_DEAD, + CPUHP_NOTF_ERR_INJ_PREPARE, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, diff --git a/lib/cpu-notifier-error-inject.c b/lib/cpu-notifier-error-inject.c index 707ca24f7b18..0e2c9a1e958a 100644 --- a/lib/cpu-notifier-error-inject.c +++ b/lib/cpu-notifier-error-inject.c @@ -8,16 +8,47 @@ static int priority; module_param(priority, int, 0); MODULE_PARM_DESC(priority, "specify cpu notifier priority"); +#define UP_PREPARE 0 +#define UP_PREPARE_FROZEN 0 +#define DOWN_PREPARE 0 +#define DOWN_PREPARE_FROZEN 0 + static struct notifier_err_inject cpu_notifier_err_inject = { .actions = { - { NOTIFIER_ERR_INJECT_ACTION(CPU_UP_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_UP_PREPARE_FROZEN) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_DOWN_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_DOWN_PREPARE_FROZEN) }, + { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE) }, + { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE_FROZEN) }, + { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE) }, + { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE_FROZEN) }, {} } }; +static int notf_err_handle(struct notifier_err_inject_action *action) +{ + int ret; + + ret = action->error; + if (ret) + pr_info("Injecting error (%d) to %s\n", ret, action->name); + return ret; +} + +static int notf_err_inj_up_prepare(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) + return notf_err_handle(&cpu_notifier_err_inject.actions[0]); + else + return notf_err_handle(&cpu_notifier_err_inject.actions[1]); +} + +static int notf_err_inj_dead(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) + return notf_err_handle(&cpu_notifier_err_inject.actions[2]); + else + return notf_err_handle(&cpu_notifier_err_inject.actions[3]); +} + static struct dentry *dir; static int err_inject_init(void) @@ -29,7 +60,10 @@ static int err_inject_init(void) if (IS_ERR(dir)) return PTR_ERR(dir); - err = register_hotcpu_notifier(&cpu_notifier_err_inject.nb); + err = cpuhp_setup_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE, + "cpu-err-notif:prepare", + notf_err_inj_up_prepare, + notf_err_inj_dead); if (err) debugfs_remove_recursive(dir); @@ -38,7 +72,7 @@ static int err_inject_init(void) static void err_inject_exit(void) { - unregister_hotcpu_notifier(&cpu_notifier_err_inject.nb); + cpuhp_remove_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE); debugfs_remove_recursive(dir); } From dd6d7c6f3dc136c1bec6def840f7fa53f84d1fe6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:51 +0200 Subject: [PATCH 399/538] mips/octeon/smp: Convert to hotplug state machine Install the callbacks via the state machine. 
[ tglx: Renamed the state to MIPS_SOC_PREPARE so it can be reused by other SOCs ] Signed-off-by: Sebastian Andrzej Siewior Acked-by: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: Peter Zijlstra Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-16-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/mips/cavium-octeon/smp.c | 24 +++--------------------- include/linux/cpuhotplug.h | 1 + 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index 4d457d602d3b..256fe6f65cf2 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -380,29 +380,11 @@ static int octeon_update_boot_vector(unsigned int cpu) return 0; } -static int octeon_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - octeon_update_boot_vector(cpu); - break; - case CPU_ONLINE: - pr_info("Cpu %d online\n", cpu); - break; - case CPU_DEAD: - break; - } - - return NOTIFY_OK; -} - static int register_cavium_notifier(void) { - hotcpu_notifier(octeon_cpu_callback, 0); - return 0; + return cpuhp_setup_state_nocalls(CPUHP_MIPS_SOC_PREPARE, + "mips/cavium:prepare", + octeon_update_boot_vector, NULL); } late_initcall(register_cavium_notifier); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index bb6231d13d3a..8f8a48bbe86d 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -44,6 +44,7 @@ enum cpuhp_state { CPUHP_SH_SH3X_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_NOTF_ERR_INJ_PREPARE, + CPUHP_MIPS_SOC_PREPARE, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, From e476d3129100ba18daea2224f38fdd7195118d4b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:52 +0200 Subject: [PATCH 400/538] mips/loongson/smp: Convert to hotplug state machine Install the callbacks via the state machine. 
[ tglx: Reuse the MIPS_SOC_PREPARE state ] Signed-off-by: Sebastian Andrzej Siewior Acked-by: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: Peter Zijlstra Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-17-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/mips/loongson64/loongson-3/smp.c | 34 +++++++-------------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/arch/mips/loongson64/loongson-3/smp.c b/arch/mips/loongson64/loongson-3/smp.c index 2fec6f753a35..99aab9f85904 100644 --- a/arch/mips/loongson64/loongson-3/smp.c +++ b/arch/mips/loongson64/loongson-3/smp.c @@ -677,7 +677,7 @@ void play_dead(void) play_dead_at_ckseg1(state_addr); } -void loongson3_disable_clock(int cpu) +static int loongson3_disable_clock(unsigned int cpu) { uint64_t core_id = cpu_data[cpu].core; uint64_t package_id = cpu_data[cpu].package; @@ -688,9 +688,10 @@ void loongson3_disable_clock(int cpu) if (!(loongson_sysconf.workarounds & WORKAROUND_CPUHOTPLUG)) LOONGSON_FREQCTRL(package_id) &= ~(1 << (core_id * 4 + 3)); } + return 0; } -void loongson3_enable_clock(int cpu) +static int loongson3_enable_clock(unsigned int cpu) { uint64_t core_id = cpu_data[cpu].core; uint64_t package_id = cpu_data[cpu].package; @@ -701,34 +702,15 @@ void loongson3_enable_clock(int cpu) if (!(loongson_sysconf.workarounds & WORKAROUND_CPUHOTPLUG)) LOONGSON_FREQCTRL(package_id) |= 1 << (core_id * 4 + 3); } -} - -#define CPU_POST_DEAD_FROZEN (CPU_POST_DEAD | CPU_TASKS_FROZEN) -static int loongson3_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action) { - case CPU_POST_DEAD: - case CPU_POST_DEAD_FROZEN: - pr_info("Disable clock for CPU#%d\n", cpu); - loongson3_disable_clock(cpu); - break; - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - pr_info("Enable clock for CPU#%d\n", cpu); - loongson3_enable_clock(cpu); - break; - } - - return NOTIFY_OK; + return 0; } static int register_loongson3_notifier(void) { - hotcpu_notifier(loongson3_cpu_callback, 0); - return 0; + return cpuhp_setup_state_nocalls(CPUHP_MIPS_SOC_PREPARE, + "mips/loongson:prepare", + loongson3_enable_clock, + loongson3_disable_clock); } early_initcall(register_loongson3_notifier); From 84c9ceefecb8fe51c4bfa2a5424dd73bc024e41d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:53 +0200 Subject: [PATCH 401/538] s390/mm/pfault: Convert to hotplug state machine Install the callbacks via the state machine. 
Signed-off-by: Sebastian Andrzej Siewior Cc: linux-s390@vger.kernel.org Cc: Peter Zijlstra Cc: Heiko Carstens Cc: rt@linutronix.de Cc: Martin Schwidefsky Link: http://lkml.kernel.org/r/20160906170457.32393-18-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/s390/mm/fault.c | 30 ++++++++++++------------------ include/linux/cpuhotplug.h | 1 + 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index a58bca62a93b..cbb73fabc91e 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -740,28 +740,21 @@ static void pfault_interrupt(struct ext_code ext_code, put_task_struct(tsk); } -static int pfault_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int pfault_cpu_dead(unsigned int cpu) { struct thread_struct *thread, *next; struct task_struct *tsk; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); - } - spin_unlock_irq(&pfault_lock); - break; - default: - break; + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); } - return NOTIFY_OK; + spin_unlock_irq(&pfault_lock); + return 0; } static int __init pfault_irq_init(void) @@ -775,7 +768,8 @@ static int __init pfault_irq_init(void) if (rc) goto out_pfault; irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - hotcpu_notifier(pfault_cpu_notify, 0); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); return 0; out_pfault: diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8f8a48bbe86d..dea6696c673a 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -26,6 +26,7 @@ enum cpuhp_state { CPUHP_BLOCK_SOFTIRQ_DEAD, CPUHP_VIRT_SCSI_DEAD, CPUHP_ACPI_CPUDRV_DEAD, + CPUHP_S390_PFAULT_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, From b067a7be411ccb31b6cc866fc213670c3acf4001 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:54 +0200 Subject: [PATCH 402/538] x86/apic/uv: Convert to hotplug state machine Install the callbacks via the state machine. 
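The conversion follows the same pattern as the preceding patches in this series. As a rough, illustrative sketch only (all names below are made-up placeholders, not identifiers from these patches), a minimal driver-side conversion to the state machine looks like this:

#include <linux/cpuhotplug.h>

static enum cpuhp_state foo_hp_online;

/* Startup callback: runs on the CPU that is coming online. */
static int foo_cpu_online(unsigned int cpu)
{
        /* set up the per-CPU resource for @cpu here */
        return 0;
}

/* Teardown callback: runs on the CPU on its way down. */
static int foo_cpu_prep_down(unsigned int cpu)
{
        /* tear down the per-CPU resource for @cpu here */
        return 0;
}

static int __init foo_init(void)
{
        int ret;

        /*
         * Reserve a dynamic state in the online (AP) range; the _nocalls
         * variant does not invoke the callbacks for already online CPUs.
         */
        ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "foo:online",
                                        foo_cpu_online, foo_cpu_prep_down);
        if (ret < 0)
                return ret;
        foo_hp_online = ret;    /* remember the allocated state for removal */
        return 0;
}

static void __exit foo_exit(void)
{
        cpuhp_remove_state_nocalls(foo_hp_online);
}

The multi-instance conversions (virtio scsi, padata) have the same shape, except that the callbacks take an extra struct hlist_node * identifying the instance, the state is set up with cpuhp_setup_state_multi(), and each instance is registered with cpuhp_state_add_instance().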
Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-19-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/x2apic_uv_x.c | 31 ++++++------------------------ 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index cb0673c1e940..391b7f8b31c8 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -927,7 +927,7 @@ static void uv_heartbeat(unsigned long ignored) mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); } -static void uv_heartbeat_enable(int cpu) +static int uv_heartbeat_enable(unsigned int cpu) { while (!uv_cpu_scir_info(cpu)->enabled) { struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; @@ -941,43 +941,24 @@ static void uv_heartbeat_enable(int cpu) /* also ensure that boot cpu is enabled */ cpu = 0; } + return 0; } #ifdef CONFIG_HOTPLUG_CPU -static void uv_heartbeat_disable(int cpu) +static int uv_heartbeat_disable(unsigned int cpu) { if (uv_cpu_scir_info(cpu)->enabled) { uv_cpu_scir_info(cpu)->enabled = 0; del_timer(&uv_cpu_scir_info(cpu)->timer); } uv_set_cpu_scir_bits(cpu, 0xff); -} - -/* - * cpu hotplug notifier - */ -static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - uv_heartbeat_enable(cpu); - break; - case CPU_DOWN_PREPARE: - uv_heartbeat_disable(cpu); - break; - default: - break; - } - return NOTIFY_OK; + return 0; } static __init void uv_scir_register_cpu_notifier(void) { - hotcpu_notifier(uv_scir_cpu_notify, 0); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/x2apic-uvx:online", + uv_heartbeat_enable, uv_heartbeat_disable); } #else /* !CONFIG_HOTPLUG_CPU */ From 164c80ed84a7669114869d9347c0f3ea7f56ea89 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 19 Sep 2016 15:12:41 -0600 Subject: [PATCH 403/538] blk-throttle: Extend slice if throttle group is not empty Right now, if the slice is expired, we start a new slice. If a bio is queued, we keep on extending the slice by the throtl_slice interval (100ms). This worked well as long as the pending timer function got executed within a few milliseconds of the scheduled time. But with recent changes in the timer subsystem, the slack can be much longer depending on the expiry time of the scheduled timer, see commit 500462a9de65 ("timers: Switch to a non-cascading wheel"). This means that by the time the timer function gets executed, it is possible the delay from the scheduled time is more than 100ms. In that case the current code will conclude that the existing slice has expired and a new one needs to be started. The new slice will be 100ms by default, which will not be sufficient to meet the rate requirement of the group given the bio size, so the bio will not be dispatched and we will start a new timer to wait. And when that timer expires, the same process will repeat, and this can easily become an infinite loop. Solve this issue by starting a new slice only if the throttle group is empty. If it is not empty, that means there should be an active slice going on. Ideally it should not have expired, but given the slack, it is possible that it has.
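To put illustrative numbers on the failure mode (these are made up, not taken from the report): with a group throttled to 1 MB/s and a 4 MB bio queued, roughly 4 MB / (1 MB/s) = 4 seconds of accumulated slice budget are needed before the bio can be dispatched, while a freshly started 100ms slice covers only about 100 KB. If every late timer expiry throws the old slice away and starts a new one, the group never accumulates more than that ~100 KB of budget and the bio is never dispatched, which is the infinite loop; extending the existing slice instead lets the budget grow until the bio fits.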
Reported-by: Hou Tao Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f1aba26f4719..a3ea8260c94c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -780,9 +780,11 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, /* * If previous slice expired, start a new one otherwise renew/extend * existing slice to make sure it is at least throtl_slice interval - * long since now. + * long since now. New slice is started only for empty throttle group. + * If there is queued bio, that means there should be an active + * slice and it should be extended instead. */ - if (throtl_slice_used(tg, rw)) + if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) throtl_start_new_slice(tg, rw); else { if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) From 7fadce0d60d09427e0027d3d468781b08ca0b3d1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 19 Sep 2016 14:49:08 -0700 Subject: [PATCH 404/538] scripts/faddr2line: improve on base path filtering a bit Due to our compiler include directives, the build pathnames for header files often end up being of the form "$srcdir/./include/linux/xyz.h", which ends up having that extra "." path component after the build base in it. Teach faddr2line to skip that too, to make code generated in inline functions in header files match the filename for the regular C files. Rabin Vincent pointed out that I can't make a stricter regexp match by using the " at " prefix for the pathname, because that ends up being locale-dependent. But this does require that the path match be preceded by a space, to make it a bit more strict (that matters mainly if we didn't find any base_dir at all, and we only end up with the "./" part of the match) Acked-by: Josh Poimboeuf Cc: Rabin Vincent Signed-off-by: Linus Torvalds --- scripts/faddr2line | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/faddr2line b/scripts/faddr2line index 4fbfe8305fe3..450b33257339 100755 --- a/scripts/faddr2line +++ b/scripts/faddr2line @@ -145,7 +145,7 @@ __faddr2line() { local hexsize=0x$(printf %x $sym_size) echo "$func+$offset/$hexsize:" - addr2line -fpie $objfile $hexaddr | sed "s;$dir_prefix;;" + addr2line -fpie $objfile $hexaddr | sed "s; $dir_prefix\(\./\)*; ;" DONE=1 done < <(readelf -sW $objfile | awk -v f=$func '$8 == f {print}') From 6e68b08728ce3365c713f8663c6b05a79e2bbca1 Mon Sep 17 00:00:00 2001 From: Vinson Lee Date: Sat, 17 Sep 2016 00:51:53 +0000 Subject: [PATCH 405/538] x86/vdso: Use CONFIG_X86_X32_ABI to enable vdso prctl The prctl code which references vdso_image_x32 is built when CONFIG_X86_X32 is set. This results in the following build failure: LD init/built-in.o arch/x86/built-in.o: In function `do_arch_prctl': (.text+0x27466): undefined reference to `vdso_image_x32' vdso_image_x32 depends on CONFIG_X86_X32_ABI. So we need to make the prctl depend on that as well. 
[ tglx: Massaged changelog ] Fixes: 2eefd8789698 ("x86/arch_prctl/vdso: Add ARCH_MAP_VDSO_*") Signed-off-by: Vinson Lee Reviewed-by: Dmitry Safonov Cc: Andy Lutomirski Cc: Brian Gerst Cc: Borislav Petkov Cc: Dmitry Vyukov Link: http://lkml.kernel.org/r/1474073513-6656-1-git-send-email-vlee@freedesktop.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b26a0092a01d..b4603b71a659 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -592,7 +592,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } #ifdef CONFIG_CHECKPOINT_RESTORE -# ifdef CONFIG_X86_X32 +# ifdef CONFIG_X86_X32_ABI case ARCH_MAP_VDSO_X32: return prctl_map_vdso(&vdso_image_x32, addr); # endif From b0f48706a176b71a6e54f399d7404bbeeaa7cfab Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 18 Sep 2016 19:34:51 +0800 Subject: [PATCH 406/538] x86/apic: Order irq_enter/exit() calls correctly vs. ack_APIC_irq() =============================== [ INFO: suspicious RCU usage. ] 4.8.0-rc6+ #5 Not tainted ------------------------------- ./arch/x86/include/asm/msr-trace.h:47 suspicious rcu_dereference_check() usage! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0 RCU used illegally from extended quiescent state! no locks held by swapper/2/0. stack backtrace: CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.8.0-rc6+ #5 Hardware name: Dell Inc. OptiPlex 7020/0F5C5X, BIOS A03 01/08/2015 0000000000000000 ffff8d1bd6003f10 ffffffff94446949 ffff8d1bd4a68000 0000000000000001 ffff8d1bd6003f40 ffffffff940e9247 ffff8d1bbdfcf3d0 000000000000080b 0000000000000000 0000000000000000 ffff8d1bd6003f70 Call Trace: [] dump_stack+0x99/0xd0 [] lockdep_rcu_suspicious+0xe7/0x120 [] do_trace_write_msr+0x135/0x140 [] native_write_msr+0x20/0x30 [] native_apic_msr_eoi_write+0x1d/0x30 [] smp_trace_call_function_interrupt+0x1e/0x270 [] trace_call_function_interrupt+0x96/0xa0 [] ? cpuidle_enter_state+0xe4/0x360 [] ? cpuidle_enter_state+0xcf/0x360 [] cpuidle_enter+0x17/0x20 [] cpu_startup_entry+0x338/0x4d0 [] start_secondary+0x154/0x180 This can be reproduced readily by running ftrace test case of kselftest. Move the irq_enter() call before ack_APIC_irq(), because irq_enter() tells the RCU susbstems to end the extended quiescent state, so that the following trace call in ack_APIC_irq() works correctly. The same applies to exiting_ack_irq() which calls ack_APIC_irq() after irq_exit(). 
[ tglx: Massaged changelog ] Signed-off-by: Wanpeng Li Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1474198491-3738-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 124357773ffa..f5aaf6c83222 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -650,8 +650,8 @@ static inline void entering_ack_irq(void) static inline void ipi_entering_ack_irq(void) { - ack_APIC_irq(); irq_enter(); + ack_APIC_irq(); } static inline void exiting_irq(void) @@ -661,9 +661,8 @@ static inline void exiting_irq(void) static inline void exiting_ack_irq(void) { - irq_exit(); - /* Ack only at the end to avoid potential reentry */ ack_APIC_irq(); + irq_exit(); } extern void ioapic_zap_locks(void); From cff9ab2b291e64259d97add48fe073c081afe4e2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 13 Sep 2016 20:12:32 +0200 Subject: [PATCH 407/538] x86/apic: Get rid of apic_version[] array The array has a size of MAX_LOCAL_APIC, which can be as large as 32k, so it can consume up to 128k. The array has been there forever and was never used for anything useful other than a version mismatch check which was introduced in 2009. There is no reason to store the version in an array. The kernel is not prepared to handle different APIC versions anyway, so the real important part is to detect a version mismatch and warn about it, which can be done with a single variable as well. [ tglx: Massaged changelog ] Signed-off-by: Denys Vlasenko CC: Andy Lutomirski CC: Borislav Petkov CC: Brian Gerst CC: Mike Travis Link: http://lkml.kernel.org/r/20160913181232.30815-1-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 2 +- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic/apic.c | 17 +++++++---------- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/smpboot.c | 10 +++++----- 6 files changed, 17 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index b07233b64578..c2f94dcc92ce 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -6,7 +6,6 @@ #include #include -extern int apic_version[]; extern int pic_mode; #ifdef CONFIG_X86_32 @@ -40,6 +39,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; +extern u8 boot_cpu_apic_version; extern unsigned long mp_lapic_addr; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1ad5fe213043..0447e314e7f5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -182,7 +182,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) } if (boot_cpu_physical_apicid != -1U) - ver = apic_version[boot_cpu_physical_apicid]; + ver = boot_cpu_apic_version; cpu = generic_processor_info(id, ver); if (cpu >= 0) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1cbae30af51c..779dae5a852f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -64,6 +64,8 @@ unsigned disabled_cpus; unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); +u8 boot_cpu_apic_version; + /* * The highest APIC ID seen during enumeration. 
*/ @@ -1812,8 +1814,7 @@ void __init init_apic_mappings(void) * since smp_sanity_check is prepared for such a case * and disable smp mode */ - apic_version[new_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } @@ -1828,13 +1829,10 @@ void __init register_lapic_address(unsigned long address) } if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = read_apic_id(); - apic_version[boot_cpu_physical_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } -int apic_version[MAX_LOCAL_APIC]; - /* * Local APIC interrupts */ @@ -2124,11 +2122,10 @@ int generic_processor_info(int apicid, int version) cpu, apicid); version = 0x10; } - apic_version[apicid] = version; - if (version != apic_version[boot_cpu_physical_apicid]) { + if (version != boot_cpu_apic_version) { pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); + boot_cpu_apic_version, cpu, version); } physid_set(apicid, phys_cpu_present_map); @@ -2271,7 +2268,7 @@ int __init APIC_init_uniprocessor(void) * Complain if the BIOS pretends there is one. */ if (!boot_cpu_has(X86_FEATURE_APIC) && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + APIC_INTEGRATED(boot_cpu_apic_version)) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); return -1; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7491f417a8e4..48e6d84f173e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1593,7 +1593,7 @@ void __init setup_ioapic_ids_from_mpc(void) * no meaning without the serial APIC bus. */ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + || APIC_XAPIC(boot_cpu_apic_version)) return; setup_ioapic_ids_from_mpc_nocheck(); } @@ -2423,7 +2423,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) static u8 io_apic_unique_id(int idx, u8 id) { if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + !APIC_XAPIC(boot_cpu_apic_version)) return io_apic_get_unique_id(idx, id); else return id; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7c43e716c158..563096267ca2 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -152,7 +152,7 @@ early_param("apic", parse_apic); void __init default_setup_apic_routing(void) { - int version = apic_version[boot_cpu_physical_apicid]; + int version = boot_cpu_apic_version; if (num_possible_cpus() > 8) { switch (boot_cpu_data.x86_vendor) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8216b997c1c9..f2b8e4574d69 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -683,7 +683,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -710,7 +710,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Be paranoid about clearing APIC errors. 
*/ - if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); @@ -749,7 +749,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) * Determine this based on the APIC version. * If we don't have an integrated APIC, don't send the STARTUP IPIs. */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) + if (APIC_INTEGRATED(boot_cpu_apic_version)) num_starts = 2; else num_starts = 0; @@ -987,7 +987,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } @@ -1242,7 +1242,7 @@ static int __init smp_sanity_check(unsigned max_cpus) /* * If we couldn't find a local APIC, then get out of here now! */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && + if (APIC_INTEGRATED(boot_cpu_apic_version) && !boot_cpu_has(X86_FEATURE_APIC)) { if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", From 9bb627be47a574b764e162e8513d5db78d49e7f5 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Mon, 19 Sep 2016 14:43:52 -0700 Subject: [PATCH 408/538] mem-hotplug: don't clear the only node in new_node_page() Commit 394e31d2ceb4 ("mem-hotplug: alloc new page from a nearest neighbor node when mem-offline") introduced new_node_page() for memory hotplug. In new_node_page(), the nid is cleared before calling __alloc_pages_nodemask(). But if it is the only node of the system, and the first round allocation fails, it will not be able to get memory from an empty nodemask, and will trigger oom. The patch checks whether it is the last node on the system, and if it is, then don't clear the nid in the nodemask. Fixes: 394e31d2ceb4 ("mem-hotplug: alloc new page from a nearest neighbor node when mem-offline") Link: http://lkml.kernel.org/r/1473044391.4250.19.camel@TP420 Signed-off-by: Li Zhong Reported-by: John Allen Acked-by: Vlastimil Babka Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 41266dc29f33..b58906b6215c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1567,7 +1567,9 @@ static struct page *new_node_page(struct page *page, unsigned long private, return alloc_huge_page_node(page_hstate(compound_head(page)), next_node_in(nid, nmask)); - node_clear(nid, nmask); + if (nid != next_node_in(nid, nmask)) + node_clear(nid, nmask); + if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) gfp_mask |= __GFP_HIGHMEM; From e6f0c6e6170fec175fe676495f29029aecdf486c Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 19 Sep 2016 14:43:55 -0700 Subject: [PATCH 409/538] ocfs2/dlm: fix race between convert and migration Commit ac7cf246dfdb ("ocfs2/dlm: fix race between convert and recovery") checks if lockres master has changed to identify whether new master has finished recovery or not. This will introduce a race that right after old master does umount ( means master will change), a new convert request comes. 
In this case, it will reset lockres state to DLM_RECOVERING and then retry convert, and then fail with lockres->l_action being set to OCFS2_AST_INVALID, which will cause inconsistent lock level between ocfs2 and dlm, and then finally BUG. Since dlm recovery will clear lock->convert_pending in dlm_move_lockres_to_recovery_list, we can use it to correctly identify the race case between convert and recovery. So fix it. Fixes: ac7cf246dfdb ("ocfs2/dlm: fix race between convert and recovery") Link: http://lkml.kernel.org/r/57CE1569.8010704@huawei.com Signed-off-by: Joseph Qi Signed-off-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmconvert.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index cdeafb4e7ed6..0bb128659d4b 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -268,7 +268,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, struct dlm_lock *lock, int flags, int type) { enum dlm_status status; - u8 old_owner = res->owner; mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); @@ -335,7 +334,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; - lock->convert_pending = 0; /* if it failed, move it back to granted queue. * if master returns DLM_NORMAL and then down before sending ast, * it may have already been moved to granted queue, reset to @@ -344,12 +342,14 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, if (status != DLM_NOTQUEUED) dlm_error(status); dlm_revert_pending_convert(res, lock); - } else if ((res->state & DLM_LOCK_RES_RECOVERING) || - (old_owner != res->owner)) { - mlog(0, "res %.*s is in recovering or has been recovered.\n", - res->lockname.len, res->lockname.name); + } else if (!lock->convert_pending) { + mlog(0, "%s: res %.*s, owner died and lock has been moved back " + "to granted list, retry convert.\n", + dlm->name, res->lockname.len, res->lockname.name); status = DLM_RECOVERING; } + + lock->convert_pending = 0; bail: spin_unlock(&res->spinlock); From d8e3875431956c1f78e142d531f490f76c760ce3 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Mon, 19 Sep 2016 14:43:58 -0700 Subject: [PATCH 410/538] MAINTAINERS: Maik has moved Maik is no longer using the plusserver.de email, update with his current email. Link: http://lkml.kernel.org/r/1473007794-27960-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Cc: Maik Broemme Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 644ff65d336d..2551f6e2fb43 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6103,7 +6103,7 @@ S: Supported F: drivers/cpufreq/intel_pstate.c INTEL FRAMEBUFFER DRIVER (excluding 810 and 815) -M: Maik Broemme +M: Maik Broemme L: linux-fbdev@vger.kernel.org S: Maintained F: Documentation/fb/intelfb.txt From c131f751ab1a852d4dd4b490b3a7fbba7d738de5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 19 Sep 2016 14:44:01 -0700 Subject: [PATCH 411/538] khugepaged: fix use-after-free in collapse_huge_page() hugepage_vma_revalidate() tries to re-check if we still should try to collapse small pages into huge one after the re-acquiring mmap_sem. 
The problem Dmitry Vyukov reported[1] is that the vma found by hugepage_vma_revalidate() can be suitable for huge pages, but not the same vma we had before dropping mmap_sem. And dereferencing original vma can lead to fun results.. Let's use vma hugepage_vma_revalidate() found instead of assuming it's the same as what we had before the lock was dropped. [1] http://lkml.kernel.org/r/CACT4Y+Z3gigBvhca9kRJFcjX0G70V_nRhbwKBU+yGoESBDKi9Q@mail.gmail.com Link: http://lkml.kernel.org/r/20160907122559.GA6542@black.fi.intel.com Signed-off-by: Kirill A. Shutemov Reported-by: Dmitry Vyukov Reviewed-by: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Cc: Vegard Nossum Cc: Sasha Levin Cc: Konstantin Khlebnikov Cc: Andrey Ryabinin Cc: Greg Thelen Cc: Suleiman Souhlal Cc: Hugh Dickins Cc: David Rientjes Cc: syzkaller Cc: Kostya Serebryany Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 79c52d0061af..62339bf3c726 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -838,7 +838,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) * value (scan code). */ -static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) +static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + struct vm_area_struct **vmap) { struct vm_area_struct *vma; unsigned long hstart, hend; @@ -846,7 +847,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) if (unlikely(khugepaged_test_exit(mm))) return SCAN_ANY_PROCESS; - vma = find_vma(mm, address); + *vmap = vma = find_vma(mm, address); if (!vma) return SCAN_VMA_NULL; @@ -898,7 +899,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address)) { + if (hugepage_vma_revalidate(mm, address, &fe.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -923,7 +924,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - struct vm_area_struct *vma, int node, int referenced) { pmd_t *pmd, _pmd; @@ -933,6 +933,7 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *pmd_ptl, *pte_ptl; int isolated = 0, result = 0; struct mem_cgroup *memcg; + struct vm_area_struct *vma; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t gfp; @@ -961,7 +962,7 @@ static void collapse_huge_page(struct mm_struct *mm, } down_read(&mm->mmap_sem); - result = hugepage_vma_revalidate(mm, address); + result = hugepage_vma_revalidate(mm, address, &vma); if (result) { mem_cgroup_cancel_charge(new_page, memcg, true); up_read(&mm->mmap_sem); @@ -994,7 +995,7 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. 
*/ down_write(&mm->mmap_sem); - result = hugepage_vma_revalidate(mm, address); + result = hugepage_vma_revalidate(mm, address, &vma); if (result) goto out; /* check if the pmd is still valid */ @@ -1202,7 +1203,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (ret) { node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma, node, referenced); + collapse_huge_page(mm, address, hpage, node, referenced); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, From 982785c6b05a82c01e90687b7e25ee87c8970b2e Mon Sep 17 00:00:00 2001 From: Ebru Akagunduz Date: Mon, 19 Sep 2016 14:44:04 -0700 Subject: [PATCH 412/538] mm, thp: fix leaking mapped pte in __collapse_huge_page_swapin() Currently, khugepaged does not permit swapin if there are enough young pages in a THP. The problem is when a THP does not have enough young pages, khugepaged leaks mapped ptes. This patch prohibits leaking mapped ptes. Link: http://lkml.kernel.org/r/1472820276-7831-1-git-send-email-ebru.akagunduz@gmail.com Signed-off-by: Ebru Akagunduz Suggested-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Reviewed-by: Rik van Riel Cc: Vlastimil Babka Cc: Mel Gorman Cc: Kirill A. Shutemov Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 62339bf3c726..728d7790dc2d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -882,6 +882,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, .pmd = pmd, }; + /* we only decide to swapin, if there is enough young ptes */ + if (referenced < HPAGE_PMD_NR/2) { + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); + return false; + } fe.pte = pte_offset_map(pmd, address); for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; fe.pte++, fe.address += PAGE_SIZE) { @@ -889,11 +894,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, if (!is_swap_pte(pteval)) continue; swapped_in++; - /* we only decide to swapin, if there is enough young ptes */ - if (referenced < HPAGE_PMD_NR/2) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } ret = do_swap_page(&fe, pteval); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ From 4d35427ad7641cba08ea0deffae1a78147ad41c0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 19 Sep 2016 14:44:07 -0700 Subject: [PATCH 413/538] mm: avoid endless recursion in dump_page() dump_page() uses page_mapcount() to get mapcount of the page. page_mapcount() has VM_BUG_ON_PAGE(PageSlab(page)) as mapcount doesn't make sense for slab pages and the field in struct page used for other information. It leads to recursion if dump_page() called for slub page and DEBUG_VM is enabled: dump_page() -> page_mapcount() -> VM_BUG_ON_PAGE() -> dump_page -> ... Let's avoid calling page_mapcount() for slab pages in dump_page(). Link: http://lkml.kernel.org/r/20160908082137.131076-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/debug.c b/mm/debug.c index 8865bfb41b0b..74c7cae4f683 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -42,9 +42,11 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + int mapcount = PageSlab(page) ? 0 : page_mapcount(page); + pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", - page, page_ref_count(page), page_mapcount(page), - page->mapping, page->index); + page, page_ref_count(page), mapcount, + page->mapping, page_to_pgoff(page)); if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); From 08eeb3061e44661afb4cb9eb08780e2fff8bfbc5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 19 Sep 2016 14:44:09 -0700 Subject: [PATCH 414/538] MAINTAINERS: update email for VLYNQ bus entry Link: http://lkml.kernel.org/r/1473218738-21836-1-git-send-email-f.fainelli@gmail.com Signed-off-by: Florian Fainelli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2551f6e2fb43..a0ce40f4c66c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12569,7 +12569,7 @@ F: include/linux/if_*vlan.h F: net/8021q/ VLYNQ BUS -M: Florian Fainelli +M: Florian Fainelli L: openwrt-devel@lists.openwrt.org (subscribers-only) S: Maintained F: drivers/vlynq/vlynq.c From 7cbdb4a286a60c5d519cb9223fe2134d26870d39 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 19 Sep 2016 14:44:12 -0700 Subject: [PATCH 415/538] autofs: use dentry flags to block walks during expire Somewhere along the way the autofs expire operation has changed to hold a spin lock over expired dentry selection. The autofs indirect mount expired dentry selection is complicated and quite lengthy so it isn't appropriate to hold a spin lock over the operation. Commit 47be61845c77 ("fs/dcache.c: avoid soft-lockup in dput()") added a might_sleep() to dput() causing a WARN_ONCE() about this usage to be issued. But the spin lock doesn't need to be held over this check, the autofs dentry info. flags are enough to block walks into dentrys during the expire. I've left the direct mount expire as it is (for now) because it is much simpler and quicker than the indirect mount expire and adding spin lock release and re-aquires would do nothing more than add overhead. 
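The locking pattern the changelog describes - claim the dentry with a flag while briefly holding the spinlock, do the lengthy selection with the lock dropped, and clear the flag under the lock again if the selection has to be abandoned - can be modelled in a few lines. The sketch below is a userspace analogue with invented names (WANT_EXPIRE, claim_for_expire); it is not autofs code, only an illustration of the idea:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define WANT_EXPIRE 0x1

struct obj {
        pthread_mutex_t lock;   /* stands in for sbi->fs_lock */
        unsigned int flags;     /* stands in for ino->flags */
};

/* Claim the object for expiry; walkers that see the flag back off. */
static bool claim_for_expire(struct obj *o)
{
        bool claimed = false;

        pthread_mutex_lock(&o->lock);
        if (!(o->flags & WANT_EXPIRE)) {
                o->flags |= WANT_EXPIRE;
                claimed = true;
        }
        pthread_mutex_unlock(&o->lock);
        return claimed;
}

static void release_claim(struct obj *o)
{
        pthread_mutex_lock(&o->lock);
        o->flags &= ~WANT_EXPIRE;
        pthread_mutex_unlock(&o->lock);
}

int main(void)
{
        struct obj o = { .lock = PTHREAD_MUTEX_INITIALIZER, .flags = 0 };

        if (claim_for_expire(&o)) {
                /* Lengthy, possibly sleeping work runs with the lock dropped. */
                usleep(1000);
                release_claim(&o);
        }
        printf("flags=%u\n", o.flags);
        return 0;
}

The lock is only ever held for the flag update, so nothing that can sleep runs under it.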
Fixes: 47be61845c77 ("fs/dcache.c: avoid soft-lockup in dput()") Link: http://lkml.kernel.org/r/20160912014017.1773.73060.stgit@pluto.themaw.net Signed-off-by: Ian Kent Reported-by: Takashi Iwai Tested-by: Takashi Iwai Cc: Takashi Iwai Cc: NeilBrown Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/expire.c | 55 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index b493909e7492..d8e6d421c27f 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry, } return NULL; } + /* * Find an eligible tree to time-out * A tree is eligible if :- @@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, struct dentry *root = sb->s_root; struct dentry *dentry; struct dentry *expired; + struct dentry *found; struct autofs_info *ino; if (!root) @@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { + int flags = how; + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); - if (ino->flags & AUTOFS_INF_WANT_EXPIRE) - expired = NULL; - else - expired = should_expire(dentry, mnt, timeout, how); - if (!expired) { + if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); continue; } + spin_unlock(&sbi->fs_lock); + + expired = should_expire(dentry, mnt, timeout, flags); + if (!expired) + continue; + + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(expired); ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - spin_lock(&sbi->fs_lock); - if (should_expire(expired, mnt, timeout, how)) { - if (expired != dentry) - dput(dentry); - goto found; - } + /* Make sure a reference is not taken on found if + * things have changed. + */ + flags &= ~AUTOFS_EXP_LEAVES; + found = should_expire(expired, mnt, timeout, how); + if (!found || found != expired) + /* Something has changed, continue */ + goto next; + + if (expired != dentry) + dput(dentry); + + spin_lock(&sbi->fs_lock); + goto found; +next: + spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); if (expired != dentry) dput(expired); - spin_unlock(&sbi->fs_lock); } return NULL; @@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; + int state; /* Block on any pending expire */ if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE)) @@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) if (rcu_walk) return -ECHILD; +retry: spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { + state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING); + if (state == AUTOFS_INF_WANT_EXPIRE) { + spin_unlock(&sbi->fs_lock); + /* + * Possibly being selected for expire, wait until + * it's selected or not. 
+ */ + schedule_timeout_uninterruptible(HZ/10); + goto retry; + } + if (state & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); From c8de641b1e9c5489aa6ca57b7836acd68e7563f1 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Mon, 19 Sep 2016 14:44:15 -0700 Subject: [PATCH 416/538] mm: fix the page_swap_info() BUG_ON check Commit 62c230bc1790 ("mm: add support for a filesystem to activate swap files and use direct_IO for writing swap pages") replaced the swap_aops dirty hook from __set_page_dirty_no_writeback() with swap_set_page_dirty(). For normal cases without these special SWP flags code path falls back to __set_page_dirty_no_writeback() so the behaviour is expected to be the same as before. But swap_set_page_dirty() makes use of the page_swap_info() helper to get the swap_info_struct to check for the flags like SWP_FILE, SWP_BLKDEV etc as desired for those features. This helper has BUG_ON(!PageSwapCache(page)) which is racy and safe only for the set_page_dirty_lock() path. For the set_page_dirty() path which is often needed for cases to be called from irq context, kswapd() can toggle the flag behind the back while the call is getting executed when system is low on memory and heavy swapping is ongoing. This ends up with undesired kernel panic. This patch just moves the check outside the helper to its users appropriately to fix kernel panic for the described path. Couple of users of helpers already take care of SwapCache condition so I skipped them. Link: http://lkml.kernel.org/r/1473460718-31013-1-git-send-email-santosh.shilimkar@oracle.com Signed-off-by: Santosh Shilimkar Cc: Mel Gorman Cc: Joe Perches Cc: Peter Zijlstra Cc: Rik van Riel Cc: David S. Miller Cc: Jens Axboe Cc: Michal Hocko Cc: Hugh Dickins Cc: Al Viro Cc: [4.7.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 3 +++ mm/swapfile.c | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/page_io.c b/mm/page_io.c index 16bd82fad38c..eafe5ddc2b54 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -264,6 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, int ret; struct swap_info_struct *sis = page_swap_info(page); + BUG_ON(!PageSwapCache(page)); if (sis->flags & SWP_FILE) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; @@ -337,6 +338,7 @@ int swap_readpage(struct page *page) int ret = 0; struct swap_info_struct *sis = page_swap_info(page); + BUG_ON(!PageSwapCache(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { @@ -386,6 +388,7 @@ int swap_set_page_dirty(struct page *page) if (sis->flags & SWP_FILE) { struct address_space *mapping = sis->swap_file->f_mapping; + BUG_ON(!PageSwapCache(page)); return mapping->a_ops->set_page_dirty(page); } else { return __set_page_dirty_no_writeback(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 78cfa292a29a..2657accc6e2b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2724,7 +2724,6 @@ int swapcache_prepare(swp_entry_t entry) struct swap_info_struct *page_swap_info(struct page *page) { swp_entry_t swap = { .val = page_private(page) }; - BUG_ON(!PageSwapCache(page)); return swap_info[swp_type(swap)]; } From 31b4beb473e3bdee1bf79db849502dcb24b5c202 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Mon, 19 Sep 2016 14:44:18 -0700 Subject: [PATCH 417/538] ipc/shm: fix crash if CONFIG_SHMEM is not set Commit c01d5b300774 ("shmem: get_unmapped_area align huge page") makes use of shm_get_unmapped_area() in shm_file_operations() unconditional to CONFIG_MMU. As Tony Battersby pointed this can lead NULL-pointer dereference on machine with CONFIG_MMU=y and CONFIG_SHMEM=n. In this case ipc/shm is backed by ramfs which doesn't provide f_op->get_unmapped_area for configurations with MMU. The solution is to provide dummy f_op->get_unmapped_area for ramfs when CONFIG_MMU=y, which just call current->mm->get_unmapped_area(). Fixes: c01d5b300774 ("shmem: get_unmapped_area align huge page") Link: http://lkml.kernel.org/r/20160912102704.140442-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reported-by: Tony Battersby Tested-by: Tony Battersby Cc: Hugh Dickins Cc: [4.7.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-mmu.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 183a212694bf..12af0490322f 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -27,9 +27,17 @@ #include #include #include +#include #include "internal.h" +static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + const struct file_operations ramfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, @@ -38,6 +46,7 @@ const struct file_operations ramfs_file_operations = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, + .get_unmapped_area = ramfs_mmu_get_unmapped_area, }; const struct inode_operations ramfs_file_inode_operations = { From 2b0ad0085aa47ace4756aa501274a7de0325c09c Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Mon, 19 Sep 2016 14:44:21 -0700 Subject: [PATCH 418/538] ocfs2: fix trans extend while flush truncate log Every time, ocfs2_extend_trans() included a credit for truncate log inode, but as that inode had been managed by jbd2 running transaction first time, it will not consume that credit until jbd2_journal_restart(). Since total credits to extend always included the un-consumed ones, there will be more and more un-consumed credit, at last jbd2_journal_restart() will fail due to credit number over the half of max transction credit. 
The following error was caught when unlinking a large file with many extents: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 13626 at fs/jbd2/transaction.c:269 start_this_handle+0x4c3/0x510 [jbd2]() Modules linked in: ocfs2 nfsd lockd grace nfs_acl auth_rpcgss sunrpc autofs4 ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs sd_mod sg ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i libcxgbi cxgb3 mdio ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ppdev xen_kbdfront xen_netfront fb_sys_fops sysimgblt sysfillrect syscopyarea parport_pc parport pcspkr i2c_piix4 i2c_core acpi_cpufreq ext4 jbd2 mbcache xen_blkfront floppy pata_acpi ata_generic ata_piix dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 13626 Comm: unlink Tainted: G W 4.1.12-37.6.3.el6uek.x86_64 #2 Hardware name: Xen HVM domU, BIOS 4.4.4OVM 02/11/2016 Call Trace: dump_stack+0x48/0x5c warn_slowpath_common+0x95/0xe0 warn_slowpath_null+0x1a/0x20 start_this_handle+0x4c3/0x510 [jbd2] jbd2__journal_restart+0x161/0x1b0 [jbd2] jbd2_journal_restart+0x13/0x20 [jbd2] ocfs2_extend_trans+0x74/0x220 [ocfs2] ocfs2_replay_truncate_records+0x93/0x360 [ocfs2] __ocfs2_flush_truncate_log+0x13e/0x3a0 [ocfs2] ocfs2_remove_btree_range+0x458/0x7f0 [ocfs2] ocfs2_commit_truncate+0x1b3/0x6f0 [ocfs2] ocfs2_truncate_for_delete+0xbd/0x380 [ocfs2] ocfs2_wipe_inode+0x136/0x6a0 [ocfs2] ocfs2_delete_inode+0x2a2/0x3e0 [ocfs2] ocfs2_evict_inode+0x28/0x60 [ocfs2] evict+0xab/0x1a0 iput_final+0xf6/0x190 iput+0xc8/0xe0 do_unlinkat+0x1b7/0x310 SyS_unlink+0x16/0x20 system_call_fastpath+0x12/0x71 ---[ end trace 28aa7410e69369cf ]--- JBD2: unlink wants too many credits (251 > 128) Link: http://lkml.kernel.org/r/1473674623-11810-1-git-send-email-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 7dabbc31060e..51128789a661 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5922,7 +5922,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, } static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, - handle_t *handle, struct inode *data_alloc_inode, struct buffer_head *data_alloc_bh) { @@ -5935,11 +5934,19 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, struct ocfs2_truncate_log *tl; struct inode *tl_inode = osb->osb_tl_inode; struct buffer_head *tl_bh = osb->osb_tl_bh; + handle_t *handle; di = (struct ocfs2_dinode *) tl_bh->b_data; tl = &di->id2.i_dealloc; i = le16_to_cpu(tl->tl_used) - 1; while (i >= 0) { + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail; + } + /* Caller has given us at least enough credits to * update the truncate log dinode */ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, @@ -5974,12 +5981,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, } } - status = ocfs2_extend_trans(handle, - OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_commit_trans(osb, handle); i--; } @@ -5994,7 +5996,6 @@ int 
__ocfs2_flush_truncate_log(struct ocfs2_super *osb) { int status; unsigned int num_to_flush; - handle_t *handle; struct inode *tl_inode = osb->osb_tl_inode; struct inode *data_alloc_inode = NULL; struct buffer_head *tl_bh = osb->osb_tl_bh; @@ -6038,21 +6039,11 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_unlock; - } - - status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + status = ocfs2_replay_truncate_records(osb, data_alloc_inode, data_alloc_bh); if (status < 0) mlog_errno(status); - ocfs2_commit_trans(osb, handle); - -out_unlock: brelse(data_alloc_bh); ocfs2_inode_unlock(data_alloc_inode, 1); From d5bf141893880f7283fe97e1812c58ff22c8f9a5 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Mon, 19 Sep 2016 14:44:24 -0700 Subject: [PATCH 419/538] ocfs2: fix trans extend while free cached blocks The root cause of this issue is the same with the one fixed by the last patch, but this time credits for allocator inode and group descriptor may not be consumed before trans extend. The following error was caught: WARNING: CPU: 0 PID: 2037 at fs/jbd2/transaction.c:269 start_this_handle+0x4c3/0x510 [jbd2]() Modules linked in: ocfs2 nfsd lockd grace nfs_acl auth_rpcgss sunrpc autofs4 ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs sd_mod sg ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i libcxgbi cxgb3 mdio ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ppdev xen_kbdfront fb_sys_fops sysimgblt sysfillrect syscopyarea xen_netfront parport_pc parport pcspkr i2c_piix4 i2c_core acpi_cpufreq ext4 jbd2 mbcache xen_blkfront floppy pata_acpi ata_generic ata_piix dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 2037 Comm: rm Tainted: G W 4.1.12-37.6.3.el6uek.bug24573128v2.x86_64 #2 Hardware name: Xen HVM domU, BIOS 4.4.4OVM 02/11/2016 Call Trace: dump_stack+0x48/0x5c warn_slowpath_common+0x95/0xe0 warn_slowpath_null+0x1a/0x20 start_this_handle+0x4c3/0x510 [jbd2] jbd2__journal_restart+0x161/0x1b0 [jbd2] jbd2_journal_restart+0x13/0x20 [jbd2] ocfs2_extend_trans+0x74/0x220 [ocfs2] ocfs2_free_cached_blocks+0x16b/0x4e0 [ocfs2] ocfs2_run_deallocs+0x70/0x270 [ocfs2] ocfs2_commit_truncate+0x474/0x6f0 [ocfs2] ocfs2_truncate_for_delete+0xbd/0x380 [ocfs2] ocfs2_wipe_inode+0x136/0x6a0 [ocfs2] ocfs2_delete_inode+0x2a2/0x3e0 [ocfs2] ocfs2_evict_inode+0x28/0x60 [ocfs2] evict+0xab/0x1a0 iput_final+0xf6/0x190 iput+0xc8/0xe0 do_unlinkat+0x1b7/0x310 SyS_unlinkat+0x22/0x40 system_call_fastpath+0x12/0x71 ---[ end trace a62437cb060baa71 ]--- JBD2: rm wants too many credits (149 > 128) Link: http://lkml.kernel.org/r/1473674623-11810-2-git-send-email-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 51128789a661..f165f867f332 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6404,43 +6404,34 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); - if 
(IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_unlock; - } - while (head) { if (head->free_bg) bg_blkno = head->free_bg; else bg_blkno = ocfs2_which_suballoc_group(head->free_blk, head->free_bit); + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + trace_ocfs2_free_cached_blocks( (unsigned long long)head->free_blk, head->free_bit); ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, head->free_bit, bg_blkno, 1); - if (ret) { + if (ret) mlog_errno(ret); - goto out_journal; - } - ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); - if (ret) { - mlog_errno(ret); - goto out_journal; - } + ocfs2_commit_trans(osb, handle); tmp = head; head = head->free_next; kfree(tmp); } -out_journal: - ocfs2_commit_trans(osb, handle); - out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); From 12703dbfeb15402260e7554d32a34ac40c233990 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 19 Sep 2016 14:44:27 -0700 Subject: [PATCH 420/538] fsnotify: add a way to stop queueing events on group shutdown Implement a function that can be called when a group is being shutdown to stop queueing new events to the group. Fanotify will use this. Fixes: 5838d4442bd5 ("fanotify: fix double free of pending permission events") Link: http://lkml.kernel.org/r/1473797711-14111-2-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Miklos Szeredi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/group.c | 19 +++++++++++++++++++ fs/notify/notification.c | 8 +++++++- include/linux/fsnotify_backend.h | 3 +++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/fs/notify/group.c b/fs/notify/group.c index 3e2dd85be5dd..b47f7cfdcaa4 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -39,6 +39,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) kfree(group); } +/* + * Stop queueing new events for this group. Once this function returns + * fsnotify_add_event() will not add any new events to the group's queue. + */ +void fsnotify_group_stop_queueing(struct fsnotify_group *group) +{ + mutex_lock(&group->notification_mutex); + group->shutdown = true; + mutex_unlock(&group->notification_mutex); +} + /* * Trying to get rid of a group. Remove all marks, flush all events and release * the group reference. @@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_destroy_group(struct fsnotify_group *group) { + /* + * Stop queueing new events. The code below is careful enough to not + * require this but fanotify needs to stop queuing events even before + * fsnotify_destroy_group() is called and this makes the other callers + * of fsnotify_destroy_group() to see the same behavior. + */ + fsnotify_group_stop_queueing(group); + /* clear all inode marks for this group, attach them to destroy_list */ fsnotify_detach_group_marks(group); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index a95d8e037aeb..3d76e65ff84f 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * Add an event to the group notification queue. The group can later pull this * event off the queue to deal with. The function returns 0 if the event was * added to the queue, 1 if the event was merged with some other queued event, - * 2 if the queue of events has overflown. 
+ * 2 if the event was not queued - either the queue of events has overflown + * or the group is shutting down. */ int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, @@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group, mutex_lock(&group->notification_mutex); + if (group->shutdown) { + mutex_unlock(&group->notification_mutex); + return 2; + } + if (group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 58205f33af02..40a9e99de703 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -148,6 +148,7 @@ struct fsnotify_group { #define FS_PRIO_1 1 /* fanotify content based access control */ #define FS_PRIO_2 2 /* fanotify pre-content access */ unsigned int priority; + bool shutdown; /* group is being shut down, don't queue more events */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ @@ -292,6 +293,8 @@ extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *op extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ extern void fsnotify_put_group(struct fsnotify_group *group); +/* group destruction begins, stop queuing new events */ +extern void fsnotify_group_stop_queueing(struct fsnotify_group *group); /* destroy group */ extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ From 96d41019e3ac55f6f0115b0ce97e4f24a3d636d2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 19 Sep 2016 14:44:30 -0700 Subject: [PATCH 421/538] fanotify: fix list corruption in fanotify_get_response() fanotify_get_response() calls fsnotify_remove_event() when it finds that group is being released from fanotify_release() (bypass_perm is set). However the event it removes need not be only in the group's notification queue but it can have already moved to access_list (userspace read the event before closing the fanotify instance fd) which is protected by a different lock. Thus when fsnotify_remove_event() races with fanotify_release() operating on access_list, the list can get corrupted. Fix the problem by moving all the logic removing permission events from the lists to one place - fanotify_release(). 
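The ordering the two fsnotify patches establish - first flip a shutdown flag under the queue lock so nothing new is queued, then drain each list under its own lock and answer any pending permission events - can be sketched as a userspace model. Names (add_event, release) and the integer counters are invented for the example; this is not the fanotify code itself:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct grp {
        pthread_mutex_t queue_lock;     /* ~ notification_mutex */
        pthread_mutex_t access_lock;    /* ~ fanotify access_lock */
        bool shutdown;
        int queued;
        int in_access_list;
};

/* Enqueue path: refuse new events once shutdown is set. */
static int add_event(struct grp *g)
{
        int ret = 0;

        pthread_mutex_lock(&g->queue_lock);
        if (g->shutdown)
                ret = 2;                /* "not queued" */
        else
                g->queued++;
        pthread_mutex_unlock(&g->queue_lock);
        return ret;
}

/* Release path: stop queueing first, then drain each list under its lock. */
static void release(struct grp *g)
{
        pthread_mutex_lock(&g->queue_lock);
        g->shutdown = true;
        pthread_mutex_unlock(&g->queue_lock);

        pthread_mutex_lock(&g->access_lock);
        g->in_access_list = 0;          /* answer pending permission events */
        pthread_mutex_unlock(&g->access_lock);

        pthread_mutex_lock(&g->queue_lock);
        g->queued = 0;                  /* drain the notification queue */
        pthread_mutex_unlock(&g->queue_lock);
}

int main(void)
{
        struct grp g = {
                .queue_lock = PTHREAD_MUTEX_INITIALIZER,
                .access_lock = PTHREAD_MUTEX_INITIALIZER,
        };

        add_event(&g);
        release(&g);
        printf("add after shutdown returns %d\n", add_event(&g));
        return 0;
}

Because nothing can enqueue after the flag is set, each list can then be emptied under its own lock without racing against late arrivals.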
Fixes: 5838d4442bd5 ("fanotify: fix double free of pending permission events") Link: http://lkml.kernel.org/r/1473797711-14111-3-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reported-by: Miklos Szeredi Tested-by: Miklos Szeredi Reviewed-by: Miklos Szeredi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 13 +---------- fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++---------- fs/notify/notification.c | 15 ------------- include/linux/fsnotify_backend.h | 3 --- 4 files changed, 25 insertions(+), 42 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d2f97ecca6a5..e0e5f7c3c99f 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response || - atomic_read(&group->fanotify_data.bypass_perm)); - - if (!event->response) { /* bypass_perm set */ - /* - * Event was canceled because group is being destroyed. Remove - * it from group's event list because we are responsible for - * freeing the permission event. - */ - fsnotify_remove_event(group, &event->fae.fse); - return 0; - } + wait_event(group->fanotify_data.access_waitq, event->response); /* userspace responded, convert to something usable */ switch (event->response) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8e8e6bcd1d43..a64313868d3a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; + struct fsnotify_event *fsn_event; /* - * There may be still new events arriving in the notification queue - * but since userspace cannot use fanotify fd anymore, no event can - * enter or leave access_list by now. + * Stop new events from arriving in the notification queue. since + * userspace cannot use fanotify fd anymore, no event can enter or + * leave access_list by now either. */ - spin_lock(&group->fanotify_data.access_lock); - - atomic_inc(&group->fanotify_data.bypass_perm); + fsnotify_group_stop_queueing(group); + /* + * Process all permission events on access_list and notification queue + * and simulate reply from userspace. + */ + spin_lock(&group->fanotify_data.access_lock); list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, fae.fse.list) { pr_debug("%s: found group=%p event=%p\n", __func__, group, @@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file) spin_unlock(&group->fanotify_data.access_lock); /* - * Since bypass_perm is set, newly queued events will not wait for - * access response. Wake up the already sleeping ones now. - * synchronize_srcu() in fsnotify_destroy_group() will wait for all - * processes sleeping in fanotify_handle_event() waiting for access - * response and thus also for all permission events to be freed. + * Destroy all non-permission events. For permission events just + * dequeue them and set the response. They will be freed once the + * response is consumed and fanotify_get_response() returns. 
*/ + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + fsn_event = fsnotify_remove_first_event(group); + if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, fsn_event); + else + FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; + } + mutex_unlock(&group->notification_mutex); + + /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); #endif @@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - atomic_set(&group->fanotify_data.bypass_perm, 0); #endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 3d76e65ff84f..e455e83ceeeb 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -131,21 +131,6 @@ int fsnotify_add_event(struct fsnotify_group *group, return ret; } -/* - * Remove @event from group's notification queue. It is the responsibility of - * the caller to destroy the event. - */ -void fsnotify_remove_event(struct fsnotify_group *group, - struct fsnotify_event *event) -{ - mutex_lock(&group->notification_mutex); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - group->q_len--; - } - mutex_unlock(&group->notification_mutex); -} - /* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 40a9e99de703..7268ed076be8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -180,7 +180,6 @@ struct fsnotify_group { spinlock_t access_lock; struct list_head access_list; wait_queue_head_t access_waitq; - atomic_t bypass_perm; #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ int f_flags; unsigned int max_marks; @@ -307,8 +306,6 @@ extern int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct list_head *, struct fsnotify_event *)); -/* Remove passed event from groups notification queue */ -extern void fsnotify_remove_event(struct fsnotify_group *group, struct fsnotify_event *event); /* true if the group notification queue is empty */ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ From 3bb8b653c86f6b1d2cc05aa1744fed4b18f99485 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 19 Sep 2016 14:44:33 -0700 Subject: [PATCH 422/538] ocfs2: fix double unlock in case retry after free truncate log If ocfs2_reserve_cluster_bitmap_bits() fails with ENOSPC, it will try to free truncate log and then retry. Since ocfs2_try_to_free_truncate_log will lock/unlock global bitmap inode, we have to unlock it before calling this function. But when retry reserve and it fails with no global bitmap inode lock taken, it will unlock again in error handling branch and BUG. This issue also exists if no need retry and then ocfs2_inode_lock fails. So fix it. 
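The rule the fix enforces is a general one: once a reference or lock has been dropped on a retry or error path, clear the handle so the shared cleanup code cannot release it a second time. A minimal illustration, using malloc/free in place of the inode reference and lock, with invented names:

#include <stdio.h>
#include <stdlib.h>

struct ctx {
        int *res;               /* stands in for the locked inode reference */
};

static void put_res(struct ctx *c)
{
        free(c->res);
        c->res = NULL;          /* common cleanup must see NULL afterwards */
}

static int do_op(struct ctx *c, int fail_relock)
{
        c->res = malloc(sizeof(*c->res));
        if (!c->res)
                return -1;

        if (fail_relock) {
                /*
                 * The retry path failed after the lock was dropped:
                 * release and clear the handle here, then fall through
                 * to the shared cleanup.
                 */
                put_res(c);
        }

        /* Shared cleanup: only release what is still owned. */
        if (c->res)
                put_res(c);
        return 0;
}

int main(void)
{
        struct ctx c = { 0 };

        do_op(&c, 1);           /* no double free: the handle was cleared */
        do_op(&c, 0);
        printf("ok\n");
        return 0;
}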
Fixes: 2070ad1aebff ("ocfs2: retry on ENOSPC if sufficient space in truncate log") Link: http://lkml.kernel.org/r/57D91939.6030809@huawei.com Signed-off-by: Joseph Qi Signed-off-by: Jiufei Xue Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index ea47120a85ff..6ad3533940ba 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1199,14 +1199,24 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, inode_unlock((*ac)->ac_inode); ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); - if (ret == 1) + if (ret == 1) { + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; goto retry; + } if (ret < 0) mlog_errno(ret); inode_lock((*ac)->ac_inode); - ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + inode_unlock((*ac)->ac_inode); + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; + goto bail; + } } if (status < 0) { if (status != -ENOSPC) From db2ba40c277dc545bab531671c3f45ac0afea6f8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 19 Sep 2016 14:44:36 -0700 Subject: [PATCH 423/538] mm: memcontrol: make per-cpu charge cache IRQ-safe for socket accounting During cgroup2 rollout into production, we started encountering css refcount underflows and css access crashes in the memory controller. Splitting the heavily shared css reference counter into logical users narrowed the imbalance down to the cgroup2 socket memory accounting. The problem turns out to be the per-cpu charge cache. Cgroup1 had a separate socket counter, but the new cgroup2 socket accounting goes through the common charge path that uses a shared per-cpu cache for all memory that is being tracked. Those caches are safe against scheduling preemption, but not against interrupts - such as the newly added packet receive path. When cache draining is interrupted by network RX taking pages out of the cache, the resuming drain operation will put references of in-use pages, thus causing the imbalance. Disable IRQs during all per-cpu charge cache operations. Fixes: f7e1cb6ec51b ("mm: memcontrol: account socket memory in unified hierarchy memory controller") Link: http://lkml.kernel.org/r/20160914194846.11153-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Cc: "David S. 
Miller" Cc: Michal Hocko Cc: Vladimir Davydov Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9a6a51a7c416..4be518d4e68a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1740,17 +1740,22 @@ static DEFINE_MUTEX(percpu_charge_mutex); static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; + unsigned long flags; bool ret = false; if (nr_pages > CHARGE_BATCH) return ret; - stock = &get_cpu_var(memcg_stock); + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); if (memcg == stock->cached && stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; ret = true; } - put_cpu_var(memcg_stock); + + local_irq_restore(flags); + return ret; } @@ -1771,15 +1776,18 @@ static void drain_stock(struct memcg_stock_pcp *stock) stock->cached = NULL; } -/* - * This must be called under preempt disabled or must be called by - * a thread which is pinned to local cpu. - */ static void drain_local_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); + struct memcg_stock_pcp *stock; + unsigned long flags; + + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + + local_irq_restore(flags); } /* @@ -1788,14 +1796,19 @@ static void drain_local_stock(struct work_struct *dummy) */ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { - struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); + struct memcg_stock_pcp *stock; + unsigned long flags; + + local_irq_save(flags); + stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ drain_stock(stock); stock->cached = memcg; } stock->nr_pages += nr_pages; - put_cpu_var(memcg_stock); + + local_irq_restore(flags); } /* From d979a39d7242e0601bf9b60e89628fb8ac577179 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 19 Sep 2016 14:44:38 -0700 Subject: [PATCH 424/538] cgroup: duplicate cgroup reference when cloning sockets When a socket is cloned, the associated sock_cgroup_data is duplicated but not its reference on the cgroup. As a result, the cgroup reference count will underflow when both sockets are destroyed later on. 
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Link: http://lkml.kernel.org/r/20160914194846.11153-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Cc: Michal Hocko Cc: Vladimir Davydov Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 ++++++ net/core/sock.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..5e8dab5bf9ad 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6270,6 +6270,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) if (cgroup_sk_alloc_disabled) return; + /* Socket clone path */ + if (skcd->val) { + cgroup_get(sock_cgroup_ptr(skcd)); + return; + } + rcu_read_lock(); while (true) { diff --git a/net/core/sock.c b/net/core/sock.c index 25dab8b60223..fd7b41edf1ce 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1362,7 +1362,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); - cgroup_sk_alloc(&sk->sk_cgrp_data); } return sk; @@ -1422,6 +1421,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); + cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); } @@ -1566,6 +1566,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + + cgroup_sk_alloc(&newsk->sk_cgrp_data); + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) From d21c353d5e99c56cdd5b5c1183ffbcaf23b8b960 Mon Sep 17 00:00:00 2001 From: Ashish Samant Date: Mon, 19 Sep 2016 14:44:42 -0700 Subject: [PATCH 425/538] ocfs2: fix start offset to ocfs2_zero_range_for_truncate() If we punch a hole on a reflink such that following conditions are met: 1. start offset is on a cluster boundary 2. end offset is not on a cluster boundary 3. (end offset is somewhere in another extent) or (hole range > MAX_CONTIG_BYTES(1MB)), we dont COW the first cluster starting at the start offset. But in this case, we were wrongly passing this cluster to ocfs2_zero_range_for_truncate() to zero out. This will modify the cluster in place and zero it in the source too. Fix this by skipping this cluster in such a scenario. To reproduce: 1. Create a random file of say 10 MB xfs_io -c 'pwrite -b 4k 0 10M' -f 10MBfile 2. Reflink it reflink -f 10MBfile reflnktest 3. Punch a hole at starting at cluster boundary with range greater that 1MB. You can also use a range that will put the end offset in another extent. fallocate -p -o 0 -l 1048615 reflnktest 4. sync 5. Check the first cluster in the source file. (It will be zeroed out). 
dd if=10MBfile iflag=direct bs= count=1 | hexdump -C Link: http://lkml.kernel.org/r/1470957147-14185-1-git-send-email-ashish.samant@oracle.com Signed-off-by: Ashish Samant Reported-by: Saar Maoz Reviewed-by: Srinivas Eeda Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Eric Ren Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4e7b0dc22450..0b055bfb8e86 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1506,7 +1506,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, u64 start, u64 len) { int ret = 0; - u64 tmpend, end = start + len; + u64 tmpend = 0; + u64 end = start + len; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); unsigned int csize = osb->s_clustersize; handle_t *handle; @@ -1538,18 +1539,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, } /* - * We want to get the byte offset of the end of the 1st cluster. + * If start is on a cluster boundary and end is somewhere in another + * cluster, we have not COWed the cluster starting at start, unless + * end is also within the same cluster. So, in this case, we skip this + * first call to ocfs2_zero_range_for_truncate() truncate and move on + * to the next one. */ - tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); - if (tmpend > end) - tmpend = end; + if ((start & (csize - 1)) != 0) { + /* + * We want to get the byte offset of the end of the 1st + * cluster. + */ + tmpend = (u64)osb->s_clustersize + + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; - trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, - (unsigned long long)tmpend); + trace_ocfs2_zero_partial_clusters_range1( + (unsigned long long)start, + (unsigned long long)tmpend); - ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); - if (ret) - mlog_errno(ret); + ret = ocfs2_zero_range_for_truncate(inode, handle, start, + tmpend); + if (ret) + mlog_errno(ret); + } if (tmpend < end) { /* From 63b52c4936a2e679639c38ef51a50aa8ca1c5c07 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Mon, 19 Sep 2016 14:44:44 -0700 Subject: [PATCH 426/538] Revert "ocfs2: bump up o2cb network protocol version" This reverts commit 38b52efd218b ("ocfs2: bump up o2cb network protocol version"). This commit made rolling upgrade fail. When one node is upgraded to new version with this commit, the remaining nodes will fail to establish connections to it, then the application like VMs on the remaining nodes can't be live migrated to the upgraded one. This will cause an outage. Since negotiate hb timeout behavior didn't change without this commit, so revert it. Fixes: 38b52efd218bf ("ocfs2: bump up o2cb network protocol version") Link: http://lkml.kernel.org/r/1471396924-10375-1-git-send-email-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Cc: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp_internal.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 94b18369b1cc..b95e7df5b76a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -44,9 +44,6 @@ * version here in tcp_internal.h should not need to be bumped for * filesystem locking changes. 
* - * New in version 12 - * - Negotiate hb timeout when storage is down. - * * New in version 11 * - Negotiation of filesystem locking in the dlm join. * @@ -78,7 +75,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 12ULL +#define O2NET_PROTOCOL_VERSION 11ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; From b92ae139c308c5223521ed6ec022148b81312809 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Mon, 19 Sep 2016 14:44:47 -0700 Subject: [PATCH 427/538] rapidio/rio_cm: avoid GFP_KERNEL in atomic context As reported by Alexey Khoroshilov (https://lkml.org/lkml/2016/9/9/737): riocm_send_close() is called from rio_cm_shutdown() under spin_lock_bh(idr_lock), but riocm_send_close() uses a GFP_KERNEL allocation. Fix by taking riocm_send_close() outside of spinlock protected code. [akpm@linux-foundation.org: remove unneeded `if (!list_empty())'] Link: http://lkml.kernel.org/r/20160915175402.10122-1-alexandre.bounine@idt.com Signed-off-by: Alexandre Bounine Reported-by: Alexey Khoroshilov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio_cm.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c index 3fa17ac8df54..cebc296463ad 100644 --- a/drivers/rapidio/rio_cm.c +++ b/drivers/rapidio/rio_cm.c @@ -2247,17 +2247,30 @@ static int rio_cm_shutdown(struct notifier_block *nb, unsigned long code, { struct rio_channel *ch; unsigned int i; + LIST_HEAD(list); riocm_debug(EXIT, "."); + /* + * If there are any channels left in connected state send + * close notification to the connection partner. + * First build a list of channels that require a closing + * notification because function riocm_send_close() should + * be called outside of spinlock protected code. + */ spin_lock_bh(&idr_lock); idr_for_each_entry(&ch_idr, ch, i) { - riocm_debug(EXIT, "close ch %d", ch->id); - if (ch->state == RIO_CM_CONNECTED) - riocm_send_close(ch); + if (ch->state == RIO_CM_CONNECTED) { + riocm_debug(EXIT, "close ch %d", ch->id); + idr_remove(&ch_idr, ch->id); + list_add(&ch->ch_node, &list); + } } spin_unlock_bh(&idr_lock); + list_for_each_entry(ch, &list, ch_node) + riocm_send_close(ch); + return NOTIFY_DONE; } From 655e52d2b62458032fc67ff7daaa664af6f36fb5 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 19 Sep 2016 08:51:40 -0400 Subject: [PATCH 428/538] x86/tsc: Use cpu id defines instead of hex constants asm/intel-family.h contains defines for cpu ids which should be used instead of hex constants. Convert the switch case in native_calibrate_tsc() to use the defines before adding more cpu models. 
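A small illustration of the readability win, using locally defined constants in place of asm/intel-family.h (the 0x4E/0x5E values below are the ones the diff removes; the define names are stand-ins for the example):

#include <stdio.h>

#define FAM6_SKYLAKE_MOBILE   0x4E      /* stand-in for INTEL_FAM6_* defines */
#define FAM6_SKYLAKE_DESKTOP  0x5E

static const char *crystal(unsigned int model)
{
        switch (model) {
        case FAM6_SKYLAKE_MOBILE:       /* was: case 0x4E */
        case FAM6_SKYLAKE_DESKTOP:      /* was: case 0x5E */
                return "24.0 MHz";
        default:
                return "unknown";
        }
}

int main(void)
{
        printf("%s\n", crystal(0x4E));
        return 0;
}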
[ tglx: Massaged changelog ] Signed-off-by: Prarit Bhargava Cc: Len Brown Cc: Rafael Aquini Cc: "Peter Zijlstra (Intel)" Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1474289501-31717-2-git-send-email-prarit@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 78b9cb5a26af..2344758ba8a3 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -23,6 +23,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -686,11 +687,11 @@ unsigned long native_calibrate_tsc(void) if (crystal_khz == 0) { switch (boot_cpu_data.x86_model) { - case 0x4E: /* SKL */ - case 0x5E: /* SKL */ + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: crystal_khz = 24000; /* 24.0 MHz */ break; - case 0x5C: /* BXT */ + case INTEL_FAM6_ATOM_GOLDMONT: crystal_khz = 19200; /* 19.2 MHz */ break; } From 6baf3d61821f5b38f27b4e9f044ad4d1e8f3d14f Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 19 Sep 2016 08:51:41 -0400 Subject: [PATCH 429/538] x86/tsc: Add additional Intel CPU models to the crystal quirk list commit aa297292d708 ("x86/tsc: Enumerate SKL cpu_khz and tsc_khz via CPUID") added code to retrieve the crystal and TSC frequency from CPUID leaves. If the crystal freqency is enumerated as 0,the resulting TSC frequency is 0 as well. For CPUs with a known fixed crystal frequency a quirk list is available to set the frequency, Kabylake and SkylakeX CPUs are missing in the list of CPUs which need this quirk. Add them so the TSC frequency can be calculated correctly. [ tglx: Removed the silly default case as the switch() is only invoked when cpu_khz is 0. Massaged changelog. ] Signed-off-by: Prarit Bhargava Cc: Len Brown Cc: Rafael Aquini Cc: "Peter Zijlstra (Intel)" Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1474289501-31717-3-git-send-email-prarit@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 2344758ba8a3..46b2f41f8b05 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -689,8 +689,13 @@ unsigned long native_calibrate_tsc(void) switch (boot_cpu_data.x86_model) { case INTEL_FAM6_SKYLAKE_MOBILE: case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: crystal_khz = 24000; /* 24.0 MHz */ break; + case INTEL_FAM6_SKYLAKE_X: + crystal_khz = 25000; /* 25.0 MHz */ + break; case INTEL_FAM6_ATOM_GOLDMONT: crystal_khz = 19200; /* 19.2 MHz */ break; From 744c193eb9a223ccb2e60500196cf590b3a6131a Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 19 Sep 2016 17:04:18 -0400 Subject: [PATCH 430/538] x86: Migrate exception table users off module.h and onto extable.h These files were only including module.h for exception table related functions. We've now separated that content out into its own file "extable.h" so now move over to that and avoid all the extra header content in module.h that we don't really need to compile these files. 
Signed-off-by: Paul Gortmaker Acked-by: Ingo Molnar Link: http://lkml.kernel.org/r/20160919210418.30243-1-paul.gortmaker@windriver.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/kernel/kprobes/opt.c | 2 +- arch/x86/mm/extable.c | 2 +- arch/x86/mm/fault.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 7847e5c0e0b5..28cee019209c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 4425f593f0ec..3bb4c5f021f6 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 832b98f822be..79ae939970d3 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index dc8023060456..79ae05477d94 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -5,7 +5,7 @@ */ #include /* test_thread_flag(), ... */ #include /* oops_begin/end, ... */ -#include /* search_exception_table */ +#include /* search_exception_table */ #include /* max_low_pfn */ #include /* NOKPROBE_SYMBOL, ... */ #include /* kmmio_handler, ... */ From 727653d6ce7103b245eb8041f55dd5885f4c3289 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 19 Sep 2016 18:29:15 +0100 Subject: [PATCH 431/538] irqchip/gicv3: Silence noisy DEBUG_PER_CPU_MAPS warning gic_raise_softirq() walks the list of cpus using for_each_cpu(), it calls gic_compute_target_list() which advances the iterator by the number of CPUs in the cluster. If gic_compute_target_list() reaches the last CPU it leaves the iterator pointing at the last CPU. This means the next time round the for_each_cpu() loop cpumask_next() will be called with an invalid CPU. 
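The guarded iteration the patch introduces (described after the trace below) can be modelled in userspace with a plain bitmask standing in for the cpumask; the names and the 4-bit "cluster" width are invented for the example. The inner helper only commits the advanced position when it is still a valid index, so the outer loop never asks for the next bit starting from an out-of-range value:

#include <stdio.h>

#define NBITS 8

static int next_set_bit(unsigned int mask, int start)
{
        int i;

        for (i = start; i < NBITS; i++)
                if (mask & (1u << i))
                        return i;
        return NBITS;                   /* "no more" */
}

/* Consumes the whole "cluster" starting at *pos, like gic_compute_target_list(). */
static void consume_cluster(unsigned int mask, int *pos)
{
        int cluster = *pos / 4;         /* pretend clusters are 4 CPUs wide */
        int next;

        while (1) {
                /* ... would add CPU *pos to the target list here ... */
                next = next_set_bit(mask, *pos + 1);
                if (next >= NBITS)
                        break;          /* keep *pos valid, as the fix does */
                if (next / 4 != cluster)
                        break;          /* next CPU belongs to another cluster */
                *pos = next;
        }
}

int main(void)
{
        unsigned int mask = 0xb1;       /* arbitrary example mask */
        int cpu;

        for (cpu = next_set_bit(mask, 0); cpu < NBITS;
             cpu = next_set_bit(mask, cpu + 1))
                consume_cluster(mask, &cpu);

        printf("done without walking past bit %d\n", NBITS - 1);
        return 0;
}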
This triggers a warning when built with CONFIG_DEBUG_PER_CPU_MAPS: [ 3.077738] GICv3: CPU1: found redistributor 1 region 0:0x000000002f120000 [ 3.077943] CPU1: Booted secondary processor [410fd0f0] [ 3.078542] ------------[ cut here ]------------ [ 3.078746] WARNING: CPU: 1 PID: 0 at ../include/linux/cpumask.h:121 gic_raise_softirq+0x12c/0x170 [ 3.078812] Modules linked in: [ 3.078869] [ 3.078930] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.8.0-rc5+ #5188 [ 3.078994] Hardware name: Foundation-v8A (DT) [ 3.079059] task: ffff80087a1a0080 task.stack: ffff80087a19c000 [ 3.079145] PC is at gic_raise_softirq+0x12c/0x170 [ 3.079226] LR is at gic_raise_softirq+0xa4/0x170 [ 3.079296] pc : [] lr : [] pstate: 200001c9 [ 3.081139] Call trace: [ 3.081202] Exception stack(0xffff80087a19fbe0 to 0xffff80087a19fd10) [ 3.082269] [] gic_raise_softirq+0x12c/0x170 [ 3.082354] [] smp_send_reschedule+0x34/0x40 [ 3.082433] [] resched_curr+0x50/0x88 [ 3.082512] [] check_preempt_curr+0x60/0xd0 [ 3.082593] [] ttwu_do_wakeup+0x20/0xe8 [ 3.082672] [] ttwu_do_activate+0x90/0xc0 [ 3.082753] [] try_to_wake_up+0x224/0x370 [ 3.082836] [] default_wake_function+0x10/0x18 [ 3.082920] [] __wake_up_common+0x5c/0xa0 [ 3.083003] [] __wake_up_locked+0x14/0x20 [ 3.083086] [] complete+0x40/0x60 [ 3.083168] [] secondary_start_kernel+0x15c/0x1d0 [ 3.083240] [<00000000808911a4>] 0x808911a4 [ 3.113401] Detected PIPT I-cache on CPU2 Avoid updating the iterator if the next call to cpumask_next() would cause the for_each_cpu() loop to exit. There is no change to gic_raise_softirq()'s behaviour, (cpumask_next()s eventual call to _find_next_bit() will return early as start >= nbits), this patch just silences the warning. Fixes: 021f653791ad ("irqchip: gic-v3: Initial support for GICv3") Signed-off-by: James Morse Acked-by: Marc Zyngier Cc: linux-arm-kernel@lists.infradead.org Cc: Jason Cooper Link: http://lkml.kernel.org/r/1474306155-3303-1-git-send-email-james.morse@arm.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-gic-v3.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index ede5672ab34d..da6c0ba61d4f 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -548,7 +548,7 @@ static int gic_starting_cpu(unsigned int cpu) static u16 gic_compute_target_list(int *base_cpu, const struct cpumask *mask, unsigned long cluster_id) { - int cpu = *base_cpu; + int next_cpu, cpu = *base_cpu; unsigned long mpidr = cpu_logical_map(cpu); u16 tlist = 0; @@ -562,9 +562,10 @@ static u16 gic_compute_target_list(int *base_cpu, const struct cpumask *mask, tlist |= 1 << (mpidr & 0xf); - cpu = cpumask_next(cpu, mask); - if (cpu >= nr_cpu_ids) + next_cpu = cpumask_next(cpu, mask); + if (next_cpu >= nr_cpu_ids) goto out; + cpu = next_cpu; mpidr = cpu_logical_map(cpu); From babd6134a54d70efe875fa5661a20eaecb63f278 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Sun, 18 Sep 2016 18:20:27 +0300 Subject: [PATCH 432/538] net/mlx5: Fix flow counter bulk command out mailbox allocation The FW command output length should be only the length of struct mlx5_cmd_fc_bulk out field. Failing to do so will cause the memcpy call which is invoked later in the driver to write over wrong memory address and corrupt kernel memory which results in random crashes. This bug was found using the kernel address sanitizer (kasan). 
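The sizing rule the fix restores can be shown with a made-up struct: the allocation covers the driver's bookkeeping header plus the device-visible output area, but the length used for the later copy must describe the output area only. An illustrative userspace sketch, not the mlx5 code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct bulk {                   /* driver bookkeeping, never seen by "firmware" */
        int num;
        unsigned char out[];    /* device-visible output mailbox */
};

int main(void)
{
        size_t outlen = 64;     /* device output size only */
        struct bulk *b = calloc(1, sizeof(*b) + outlen);
        unsigned char fw_reply[64] = { 0 };

        if (!b)
                return 1;
        b->num = 4;
        /*
         * Correct: copy at most outlen bytes into b->out. The bug was
         * folding sizeof(*b) into outlen and then copying that many
         * bytes, past the end of the allocation.
         */
        memcpy(b->out, fw_reply, outlen);
        printf("copied %zu bytes safely\n", outlen);
        free(b);
        return 0;
}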
Fixes: a351a1b03bf1 ('net/mlx5: Introduce bulk reading of flow counters') Signed-off-by: Roi Dayan Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 9134010e2921..287ade151ec8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -425,11 +425,11 @@ struct mlx5_cmd_fc_bulk * mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, u16 id, int num) { struct mlx5_cmd_fc_bulk *b; - int outlen = sizeof(*b) + + int outlen = MLX5_ST_SZ_BYTES(query_flow_counter_out) + MLX5_ST_SZ_BYTES(traffic_counter) * num; - b = kzalloc(outlen, GFP_KERNEL); + b = kzalloc(sizeof(*b) + outlen, GFP_KERNEL); if (!b) return NULL; From 4eea37d7b92076fdeac2a21e5f4dbd92d286719d Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 18 Sep 2016 18:20:28 +0300 Subject: [PATCH 433/538] net/mlx5: E-Switch, Fix error flow in the SRIOV e-switch init code When enablement of the SRIOV e-switch in certain mode (switchdev or legacy) fails, we must set the mode to none. Otherwise, we'll run into double free based crashes when further attempting to deal with the e-switch (such as when disabling sriov or unloading the driver). Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 8b78f156214e..b247949df135 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1554,6 +1554,7 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) abort: esw_enable_vport(esw, 0, UC_ADDR_CHANGE); + esw->mode = SRIOV_NONE; return err; } From 6c419ba8e2580ab17c164db6e918e163d3537ec1 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 18 Sep 2016 18:20:29 +0300 Subject: [PATCH 434/538] net/mlx5: E-Switch, Handle mode change failures E-switch mode changes involve creating HW tables, potentially allocating netdevices, etc, and things can fail. Add an attempt to rollback to the existing mode when changing to the new mode fails. Only if rollback fails, getting proper SRIOV functionality requires module unload or sriov disablement/enablement. Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- .../mellanox/mlx5/core/eswitch_offloads.c | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 3dc83a9459a4..7de40e6b0c25 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -446,7 +446,7 @@ mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn) static int esw_offloads_start(struct mlx5_eswitch *esw) { - int err, num_vfs = esw->dev->priv.sriov.num_vfs; + int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs; if (esw->mode != SRIOV_LEGACY) { esw_warn(esw->dev, "Can't set offloads mode, SRIOV legacy not enabled\n"); @@ -455,8 +455,12 @@ static int esw_offloads_start(struct mlx5_eswitch *esw) mlx5_eswitch_disable_sriov(esw); err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_OFFLOADS); - if (err) - esw_warn(esw->dev, "Failed set eswitch to offloads, err %d\n", err); + if (err) { + esw_warn(esw->dev, "Failed setting eswitch to offloads, err %d\n", err); + err1 = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_LEGACY); + if (err1) + esw_warn(esw->dev, "Failed setting eswitch back to legacy, err %d\n", err); + } return err; } @@ -508,12 +512,16 @@ int esw_offloads_init(struct mlx5_eswitch *esw, int nvports) static int esw_offloads_stop(struct mlx5_eswitch *esw) { - int err, num_vfs = esw->dev->priv.sriov.num_vfs; + int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs; mlx5_eswitch_disable_sriov(esw); err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_LEGACY); - if (err) - esw_warn(esw->dev, "Failed set eswitch legacy mode. err %d\n", err); + if (err) { + esw_warn(esw->dev, "Failed setting eswitch to legacy, err %d\n", err); + err1 = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_OFFLOADS); + if (err1) + esw_warn(esw->dev, "Failed setting eswitch back to offloads, err %d\n", err); + } return err; } From c907420fdaec78b17f59a6011cb5f9d6051c6a35 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 19 Sep 2016 07:27:08 -0600 Subject: [PATCH 435/538] locking/rwsem, x86: Drop a bogus cc clobber With the addition of uses of GCC's condition code outputs in commit: 35ccfb7114 ("x86, asm: Use CC_SET()/CC_OUT() in ") ... there's now an overlap of outputs and clobbers in __down_write_trylock(). Such overlaps are generally getting tagged with an error (occasionally even with an ICE). I can't really tell why plain GCC 6.2 doesn't detect this (judging by the code it is meant to), while the slightly modified one I use does. Since condition code clobbers are never necessary on x86 (other than perhaps for documentation purposes, which doesn't really get done consistently), remove it altogether rather than inventing something like CC_CLOBBER (to accompany CC_SET/CC_OUT). Signed-off-by: Jan Beulich Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/57E003CC0200007800110102@prv-mh.provo.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/rwsem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 8dbc762ad132..3d33a719f5c1 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -154,7 +154,7 @@ static inline bool __down_write_trylock(struct rw_semaphore *sem) : "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1), CC_OUT(e) (result) : "er" (RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc"); + : "memory"); return result; } From 7c7900f89770d7fba96100d8a9e18043a1af3973 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:12 -0500 Subject: [PATCH 436/538] x86/unwind: Add new unwind interface and implementations The x86 stack dump code is a bit of a mess. dump_trace() uses callbacks, and each user of it seems to have slightly different requirements, so there are several slightly different callbacks floating around. Also there are some upcoming features which will need more changes to the stack dump code, including the printing of stack pt_regs, reliable stack detection for live patching, and a DWARF unwinder. Each of those features would at least need more callbacks and/or callback interfaces, resulting in a much bigger mess than what we have today. Before doing all that, we should try to clean things up and replace dump_trace() with something cleaner and more flexible. The new unwinder is a simple state machine which was heavily inspired by a suggestion from Andy Lutomirski: https://lkml.kernel.org/r/CALCETrUbNTqaM2LRyXGRx=kVLRPeY5A3Pc6k4TtQxF320rUT=w@mail.gmail.com It's also similar to the libunwind API: http://www.nongnu.org/libunwind/man/libunwind(3).html Some if its advantages: - Simplicity: no more callback sprawl and less code duplication. - Flexibility: it allows the caller to stop and inspect the stack state at each step in the unwinding process. - Modularity: the unwinder code, console stack dump code, and stack metadata analysis code are all better separated so that changing one of them shouldn't have much of an impact on any of the others. Two implementations are added which conform to the new unwind interface: - The frame pointer unwinder which is used for CONFIG_FRAME_POINTER=y. - The "guess" unwinder which is used for CONFIG_FRAME_POINTER=n. This isn't an "unwinder" per se. All it does is scan the stack for kernel text addresses. But with no frame pointers, guesses are better than nothing in most cases. Suggested-by: Andy Lutomirski Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6dc2f909c47533d213d0505f0a113e64585bec82.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/unwind.h | 73 ++++++++++++++++++++++++++ arch/x86/kernel/Makefile | 6 +++ arch/x86/kernel/unwind_frame.c | 93 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/unwind_guess.c | 43 ++++++++++++++++ 4 files changed, 215 insertions(+) create mode 100644 arch/x86/include/asm/unwind.h create mode 100644 arch/x86/kernel/unwind_frame.c create mode 100644 arch/x86/kernel/unwind_guess.c diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h new file mode 100644 index 000000000000..c4b6d1cafa46 --- /dev/null +++ b/arch/x86/include/asm/unwind.h @@ -0,0 +1,73 @@ +#ifndef _ASM_X86_UNWIND_H +#define _ASM_X86_UNWIND_H + +#include +#include +#include +#include + +struct unwind_state { + struct stack_info stack_info; + unsigned long stack_mask; + struct task_struct *task; + int graph_idx; +#ifdef CONFIG_FRAME_POINTER + unsigned long *bp; +#else + unsigned long *sp; +#endif +}; + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame); + +bool unwind_next_frame(struct unwind_state *state); + +static inline bool unwind_done(struct unwind_state *state) +{ + return state->stack_info.type == STACK_TYPE_UNKNOWN; +} + +static inline +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + first_frame = first_frame ? : get_stack_pointer(task, regs); + + __unwind_start(state, task, regs, first_frame); +} + +#ifdef CONFIG_FRAME_POINTER + +static inline +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->bp + 1; +} + +unsigned long unwind_get_return_address(struct unwind_state *state); + +#else /* !CONFIG_FRAME_POINTER */ + +static inline +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + return NULL; +} + +static inline +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return ftrace_graph_ret_addr(state->task, &state->graph_idx, + *state->sp, state->sp); +} + +#endif /* CONFIG_FRAME_POINTER */ + +#endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0503f5bfb18d..45257cf84370 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -125,6 +125,12 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o +ifdef CONFIG_FRAME_POINTER +obj-y += unwind_frame.o +else +obj-y += unwind_guess.o +endif + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c new file mode 100644 index 000000000000..a2456d4d286a --- /dev/null +++ b/arch/x86/kernel/unwind_frame.c @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include + +#define FRAME_HEADER_SIZE (sizeof(long) * 2) + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + unsigned long addr; + unsigned long *addr_p = unwind_get_return_address_ptr(state); + + if (unwind_done(state)) + return 0; + + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, + addr_p); + + return __kernel_text_address(addr) ? 
addr : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +static bool update_stack_state(struct unwind_state *state, void *addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If addr isn't on the current stack, switch to the next one. + * + * We may have to traverse multiple stacks to deal with the possibility + * that 'info->next_sp' could point to an empty stack and 'addr' could + * be on a subsequent stack. + */ + while (!on_stack(info, addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long *next_bp; + + if (unwind_done(state)) + return false; + + next_bp = (unsigned long *)*state->bp; + + /* make sure the next frame's data is accessible */ + if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE)) + return false; + + /* move to the next frame */ + state->bp = next_bp; + return true; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* don't even attempt to start from user mode regs */ + if (regs && user_mode(regs)) { + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; + } + + /* set up the starting stack frame */ + state->bp = get_frame_pointer(task, regs); + + /* initialize stack info and make sure the frame data is accessible */ + get_stack_info(state->bp, state->task, &state->stack_info, + &state->stack_mask); + update_stack_state(state, state->bp, FRAME_HEADER_SIZE); + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->bp < first_frame)) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c new file mode 100644 index 000000000000..b5a834c93065 --- /dev/null +++ b/arch/x86/kernel/unwind_guess.c @@ -0,0 +1,43 @@ +#include +#include +#include +#include +#include +#include + +bool unwind_next_frame(struct unwind_state *state) +{ + struct stack_info *info = &state->stack_info; + + if (unwind_done(state)) + return false; + + do { + for (state->sp++; state->sp < info->end; state->sp++) + if (__kernel_text_address(*state->sp)) + return true; + + state->sp = info->next_sp; + + } while (!get_stack_info(state->sp, state->task, info, + &state->stack_mask)); + + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + + state->task = task; + state->sp = first_frame; + + get_stack_info(first_frame, state->task, &state->stack_info, + &state->stack_mask); + + if (!__kernel_text_address(*first_frame)) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); From 35f4d9b32527c08c3da3982aedae5198dc663ce8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:13 -0500 Subject: [PATCH 437/538] perf/x86: Convert perf_callchain_kernel() to use the new unwinder Convert perf_callchain_kernel() to use the new unwinder. dump_trace() has been deprecated. 
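All of the callers converted in this and the next few patches follow the same
iteration pattern over the new interface; roughly (consume_addr() here is a
hypothetical stand-in for perf_callchain_store(), oprofile_add_trace(), etc.):

	struct unwind_state state;
	unsigned long addr;

	for (unwind_start(&state, current, regs, NULL); !unwind_done(&state);
	     unwind_next_frame(&state)) {
		addr = unwind_get_return_address(&state);
		if (!addr)
			break;			/* end of stack or unreliable frame */
		consume_addr(addr);		/* hypothetical per-caller consumer */
	}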
Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a2df0c4f09b3d438e11b41681f10b0775a819a7f.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 477dc38b62b1..0a8bd7fcdbed 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "perf_event.h" @@ -2247,31 +2248,12 @@ void arch_perf_update_userpage(struct perf_event *event, cyc2ns_read_end(data); } -/* - * callchain support - */ - -static int backtrace_stack(void *data, const char *name) -{ - return 0; -} - -static int backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct perf_callchain_entry_ctx *entry = data; - - return perf_callchain_store(entry, addr); -} - -static const struct stacktrace_ops backtrace_ops = { - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack_bp, -}; - void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct unwind_state state; + unsigned long addr; + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ return; @@ -2280,7 +2262,12 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re if (perf_callchain_store(entry, regs->ip)) return; - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); + for (unwind_start(&state, current, regs, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || perf_callchain_store(entry, addr)) + return; + } } static inline int From 49a612c6b06defbd6e6d334c683fea28006728e3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:14 -0500 Subject: [PATCH 438/538] x86/stacktrace: Convert save_stack_trace_*() to use the new unwinder Convert save_stack_trace_*() to use the new unwinder. dump_trace() has been deprecated. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/815494c627d89887db0ce56ceffd58ad16ee6c21.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/stacktrace.c | 74 ++++++++++++++---------------------- 1 file changed, 29 insertions(+), 45 deletions(-) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 23fa81e24c8a..0653788026e2 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -8,73 +8,59 @@ #include #include #include +#include -static int save_stack_stack(void *data, const char *name) +static int save_stack_address(struct stack_trace *trace, unsigned long addr, + bool nosched) { - return 0; -} - -static int -__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) -{ - struct stack_trace *trace = data; -#ifdef CONFIG_FRAME_POINTER - if (!reliable) - return 0; -#endif if (nosched && in_sched_functions(addr)) return 0; + if (trace->skip > 0) { trace->skip--; return 0; } - if (trace->nr_entries < trace->max_entries) { - trace->entries[trace->nr_entries++] = addr; - return 0; - } else { - return -1; /* no more room, stop walking the stack */ - } -} -static int save_stack_address(void *data, unsigned long addr, int reliable) -{ - return __save_stack_address(data, addr, reliable, false); + if (trace->nr_entries >= trace->max_entries) + return -1; + + trace->entries[trace->nr_entries++] = addr; + return 0; } -static int -save_stack_address_nosched(void *data, unsigned long addr, int reliable) +static void __save_stack_trace(struct stack_trace *trace, + struct task_struct *task, struct pt_regs *regs, + bool nosched) { - return __save_stack_address(data, addr, reliable, true); -} + struct unwind_state state; + unsigned long addr; -static const struct stacktrace_ops save_stack_ops = { - .stack = save_stack_stack, - .address = save_stack_address, - .walk_stack = print_context_stack, -}; + if (regs) + save_stack_address(trace, regs->ip, nosched); -static const struct stacktrace_ops save_stack_ops_nosched = { - .stack = save_stack_stack, - .address = save_stack_address_nosched, - .walk_stack = print_context_stack, -}; + for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || save_stack_address(trace, addr, nosched)) + break; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} /* * Save stack-backtrace addresses into a stack_trace buffer. 
*/ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, current, NULL, false); } EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { - dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, current, regs, false); } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) @@ -82,9 +68,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (!try_get_task_stack(tsk)) return; - dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, tsk, NULL, true); put_task_stack(tsk); } From ec2ad9ccf12dc965cad2d367a4063f68d6561a6b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:15 -0500 Subject: [PATCH 439/538] oprofile/x86: Convert x86_backtrace() to use the new unwinder Convert oprofile's x86_backtrace() to use the new unwinder. dump_trace() has been deprecated. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Robert Richter Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/412df8927705795e8ea60cffcf89a79e010713b1.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/oprofile/backtrace.c | 39 +++++++++++++++-------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 75391488130b..a2488b6e27d6 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -16,27 +16,7 @@ #include #include - -static int backtrace_stack(void *data, const char *name) -{ - /* Yes, we want all stacks */ - return 0; -} - -static int backtrace_address(void *data, unsigned long addr, int reliable) -{ - unsigned int *depth = data; - - if ((*depth)--) - oprofile_add_trace(addr); - return 0; -} - -static struct stacktrace_ops backtrace_ops = { - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack, -}; +#include #ifdef CONFIG_COMPAT static struct stack_frame_ia32 * @@ -113,14 +93,29 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); if (!user_mode(regs)) { + struct unwind_state state; + unsigned long addr; + if (!depth) return; oprofile_add_trace(regs->ip); + if (!--depth) return; - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, &depth); + for (unwind_start(&state, current, regs, NULL); + !unwind_done(&state); unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + + oprofile_add_trace(addr); + + if (!--depth) + break; + } + return; } From e18bcccd1a4ecb41e99678e002ef833586185bf1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:16 -0500 Subject: [PATCH 440/538] x86/dumpstack: Convert show_trace_log_lvl() to use the new unwinder Convert show_trace_log_lvl() to use the new unwinder. 
dump_trace() has been deprecated. show_trace_log_lvl() is special compared to other users of the unwinder. It's the only place where both reliable *and* unreliable addresses are needed. With frame pointers enabled, most callers of the unwinder don't want to know about unreliable addresses. But in this case, when we're dumping the stack to the console because something presumably went wrong, the unreliable addresses are useful: - They show stale data on the stack which can provide useful clues. - If something goes wrong with the unwinder, or if frame pointers are corrupt or missing, all the stack addresses still get shown. So in order to show all addresses on the stack, and at the same time figure out which addresses are reliable, we have to do the scanning and the unwinding in parallel. The scanning is done with the help of get_stack_info() to traverse the stacks. The unwinding is done separately by the new unwinder. In theory we could simplify show_trace_log_lvl() by instead pushing some of this logic into the unwind code. But then we would need some kind of "fake" frame logic in the unwinder which would add a lot of complexity and wouldn't be worth it in order to support only one user. Another benefit of this approach is that once we have a DWARF unwinder, we should be able to just plug it in with minimal impact to this code. Another change here is that callers of show_trace_log_lvl() don't need to provide the 'bp' argument. The unwinder already finds the relevant frame pointer by unwinding until it reaches the first frame after the provided stack pointer. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/703b5998604c712a1f801874b43f35d6dac52ede.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 10 +-- arch/x86/kernel/dumpstack.c | 126 ++++++++++++++++++++++-------- arch/x86/kernel/dumpstack_32.c | 9 +-- arch/x86/kernel/dumpstack_64.c | 9 +-- 4 files changed, 107 insertions(+), 47 deletions(-) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index ed2be1b5ada8..c9ccf0676ca6 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -119,13 +119,11 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) return (unsigned long *)task->thread.sp; } -extern void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl); +void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl); -extern void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl); extern unsigned int code_bytes; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index e0648f755158..c08f32ab8ace 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -17,7 +17,7 @@ #include #include - +#include int panic_on_unrecovered_nmi; int panic_on_io_nmi; @@ -142,56 +142,120 @@ print_context_stack_bp(struct task_struct *task, } EXPORT_SYMBOL_GPL(print_context_stack_bp); -static int print_trace_stack(void 
*data, const char *name) +void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl) { - printk("%s <%s> ", (char *)data, name); - return 0; -} + struct unwind_state state; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; + int graph_idx = 0; -/* - * Print one address/symbol entries per line. - */ -static int print_trace_address(void *data, unsigned long addr, int reliable) -{ - printk_stack_address(addr, reliable, data); - return 0; -} + printk("%sCall Trace:\n", log_lvl); -static const struct stacktrace_ops print_trace_ops = { - .stack = print_trace_stack, - .address = print_trace_address, - .walk_stack = print_context_stack, -}; + unwind_start(&state, task, regs, stack); -void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); + /* + * Iterate through the stacks, starting with the current stack pointer. + * Each stack has a pointer to the next one. + * + * x86-64 can have several stacks: + * - task stack + * - interrupt stack + * - HW exception stacks (double fault, nmi, debug, mce) + * + * x86-32 can have up to three stacks: + * - task stack + * - softirq stack + * - hardirq stack + */ + for (; stack; stack = stack_info.next_sp) { + const char *str_begin, *str_end; + + /* + * If we overflowed the task stack into a guard page, jump back + * to the bottom of the usable stack. + */ + if (task_stack_page(task) - (void *)stack < PAGE_SIZE) + stack = task_stack_page(task); + + if (get_stack_info(stack, task, &stack_info, &visit_mask)) + break; + + stack_type_str(stack_info.type, &str_begin, &str_end); + if (str_begin) + printk("%s <%s> ", log_lvl, str_begin); + + /* + * Scan the stack, printing any text addresses we find. At the + * same time, follow proper stack frames with the unwinder. + * + * Addresses found during the scan which are not reported by + * the unwinder are considered to be additional clues which are + * sometimes useful for debugging and are prefixed with '?'. + * This also serves as a failsafe option in case the unwinder + * goes off in the weeds. + */ + for (; stack < stack_info.end; stack++) { + unsigned long real_addr; + int reliable = 0; + unsigned long addr = *stack; + unsigned long *ret_addr_p = + unwind_get_return_address_ptr(&state); + + if (!__kernel_text_address(addr)) + continue; + + if (stack == ret_addr_p) + reliable = 1; + + /* + * When function graph tracing is enabled for a + * function, its return address on the stack is + * replaced with the address of an ftrace handler + * (return_to_handler). In that case, before printing + * the "real" address, we want to print the handler + * address as an "unreliable" hint that function graph + * tracing was involved. + */ + real_addr = ftrace_graph_ret_addr(task, &graph_idx, + addr, stack); + if (real_addr != addr) + printk_stack_address(addr, 0, log_lvl); + printk_stack_address(real_addr, reliable, log_lvl); + + if (!reliable) + continue; + + /* + * Get the next frame from the unwinder. No need to + * check for an error: if anything goes wrong, the rest + * of the addresses will just be printed as unreliable. + */ + unwind_next_frame(&state); + } + + if (str_end) + printk("%s <%s> ", log_lvl, str_end); + } } void show_stack(struct task_struct *task, unsigned long *sp) { - unsigned long bp = 0; - task = task ? 
: current; /* * Stack frames below this one aren't interesting. Don't show them * if we're printing for %current. */ - if (!sp && task == current) { + if (!sp && task == current) sp = get_stack_pointer(current, NULL); - bp = (unsigned long)get_frame_pointer(current, NULL); - } - show_stack_log_lvl(task, NULL, sp, bp, ""); + show_stack_log_lvl(current, NULL, sp, ""); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, NULL, 0, ""); + show_stack_log_lvl(current, regs, NULL, ""); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 4ff000811e03..e476eb774278 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -156,9 +156,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } EXPORT_SYMBOL(dump_trace); -void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl) { unsigned long *stack; int i; @@ -181,7 +180,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, touch_nmi_watchdog(); } pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); put_task_stack(task); } @@ -205,7 +204,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; pr_emerg("Stack:\n"); - show_stack_log_lvl(current, regs, NULL, 0, KERN_EMERG); + show_stack_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 008a29837cab..4e9f2cf64ac8 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -209,9 +209,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } EXPORT_SYMBOL(dump_trace); -void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl) { unsigned long *irq_stack_end; unsigned long *irq_stack; @@ -255,7 +254,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, } pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); put_task_stack(task); } @@ -278,7 +277,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(current, regs, NULL, 0, KERN_DEFAULT); + show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); From c8fe4609827aedc9c4b45de80e7cdc8ccfa8541b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:17 -0500 Subject: [PATCH 441/538] x86/dumpstack: Remove dump_trace() and related callbacks All previous users of dump_trace() have been converted to use the new unwind interfaces, so we can remove it and the related print_context_stack() and print_context_stack_bp() callback functions. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5b97da3572b40b5a4d8e185cf2429308d0987a13.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 36 ------------- arch/x86/kernel/dumpstack.c | 86 ------------------------------- arch/x86/kernel/dumpstack_32.c | 35 ------------- arch/x86/kernel/dumpstack_64.c | 69 ------------------------- 4 files changed, 226 deletions(-) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index c9ccf0676ca6..37f2e0b377ad 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -45,42 +45,6 @@ static inline bool on_stack(struct stack_info *info, void *addr, size_t len) extern int kstack_depth_to_print; -struct thread_info; -struct stacktrace_ops; - -typedef unsigned long (*walk_stack_t)(struct task_struct *task, - unsigned long *stack, - unsigned long bp, - const struct stacktrace_ops *ops, - void *data, - struct stack_info *info, - int *graph); - -extern unsigned long -print_context_stack(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - struct stack_info *info, int *graph); - -extern unsigned long -print_context_stack_bp(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - struct stack_info *info, int *graph); - -/* Generic stack tracer with callbacks */ - -struct stacktrace_ops { - int (*address)(void *data, unsigned long address, int reliable); - /* On negative return stop dumping */ - int (*stack)(void *data, const char *name); - walk_stack_t walk_stack; -}; - -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data); - #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c08f32ab8ace..999de3b3f7f4 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -56,92 +56,6 @@ void printk_address(unsigned long address) pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); } -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -unsigned long -print_context_stack(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - struct stack_info *info, int *graph) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - /* - * If we overflowed the stack into a guard page, jump back to the - * bottom of the usable stack. - */ - if ((unsigned long)task_stack_page(task) - (unsigned long)stack < - PAGE_SIZE) - stack = (unsigned long *)task_stack_page(task); - - while (on_stack(info, stack, sizeof(*stack))) { - unsigned long addr = *stack; - - if (__kernel_text_address(addr)) { - unsigned long real_addr; - int reliable = 0; - - if ((unsigned long) stack == bp + sizeof(long)) { - reliable = 1; - frame = frame->next_frame; - bp = (unsigned long) frame; - } - - /* - * When function graph tracing is enabled for a - * function, its return address on the stack is - * replaced with the address of an ftrace handler - * (return_to_handler). 
In that case, before printing - * the "real" address, we want to print the handler - * address as an "unreliable" hint that function graph - * tracing was involved. - */ - real_addr = ftrace_graph_ret_addr(task, graph, addr, - stack); - if (real_addr != addr) - ops->address(data, addr, 0); - - ops->address(data, real_addr, reliable); - } - stack++; - } - return bp; -} -EXPORT_SYMBOL_GPL(print_context_stack); - -unsigned long -print_context_stack_bp(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - struct stack_info *info, int *graph) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - unsigned long *retp = &frame->return_address; - - while (on_stack(info, stack, sizeof(*stack) * 2)) { - unsigned long addr = *retp; - unsigned long real_addr; - - if (!__kernel_text_address(addr)) - break; - - real_addr = ftrace_graph_ret_addr(task, graph, addr, retp); - if (ops->address(data, real_addr, 1)) - break; - - frame = frame->next_frame; - retp = &frame->return_address; - } - - return (unsigned long)frame; -} -EXPORT_SYMBOL_GPL(print_context_stack_bp); - void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e476eb774278..06eb322b5f9f 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -121,41 +121,6 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, return -EINVAL; } -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - unsigned long visit_mask = 0; - int graph = 0; - - task = task ? : current; - stack = stack ? : get_stack_pointer(task, regs); - bp = bp ? : (unsigned long)get_frame_pointer(task, regs); - - for (;;) { - const char *begin_str, *end_str; - struct stack_info info; - - if (get_stack_info(stack, task, &info, &visit_mask)) - break; - - stack_type_str(info.type, &begin_str, &end_str); - - if (begin_str && ops->stack(data, begin_str) < 0) - break; - - bp = ops->walk_stack(task, stack, bp, ops, data, &info, &graph); - - if (end_str && ops->stack(data, end_str) < 0) - break; - - stack = info.next_sp; - - touch_nmi_watchdog(); - } -} -EXPORT_SYMBOL(dump_trace); - void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, char *log_lvl) { diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 4e9f2cf64ac8..36cf1a498227 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -140,75 +140,6 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, return -EINVAL; } -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - unsigned long visit_mask = 0; - struct stack_info info; - int graph = 0; - int done = 0; - - task = task ? : current; - stack = stack ? : get_stack_pointer(task, regs); - bp = bp ? : (unsigned long)get_frame_pointer(task, regs); - - /* - * Print function call entries in all stacks, starting at the - * current stack address. 
If the stacks consist of nested - * exceptions - */ - while (!done) { - const char *begin_str, *end_str; - - get_stack_info(stack, task, &info, &visit_mask); - - /* Default finish unless specified to continue */ - done = 1; - - switch (info.type) { - - /* Break out early if we are on the thread stack */ - case STACK_TYPE_TASK: - break; - - case STACK_TYPE_IRQ: - case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST: - - stack_type_str(info.type, &begin_str, &end_str); - - if (ops->stack(data, begin_str) < 0) - break; - - bp = ops->walk_stack(task, stack, bp, ops, - data, &info, &graph); - - ops->stack(data, end_str); - - stack = info.next_sp; - done = 0; - break; - - default: - ops->stack(data, "UNK"); - break; - } - } - - /* - * This handles the process stack: - */ - bp = ops->walk_stack(task, stack, bp, ops, data, &info, &graph); -} -EXPORT_SYMBOL(dump_trace); - void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, char *log_lvl) { From a435a07f9164dda7c0c26e8ad758881f4bafc127 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Sun, 18 Sep 2016 17:46:07 +0200 Subject: [PATCH 442/538] net: ipv6: fallback to full lookup if table lookup is unsuitable Commit 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") introduced a regression: insertion of an IPv6 route in a table not containing the appropriate connected route for the gateway but which contained a non-connected route (like a default gateway) fails while it was previously working: $ ip link add eth0 type dummy $ ip link set up dev eth0 $ ip addr add 2001:db8::1/64 dev eth0 $ ip route add ::/0 via 2001:db8::5 dev eth0 table 20 $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 RTNETLINK answers: No route to host $ ip -6 route show table 20 default via 2001:db8::5 dev eth0 metric 1024 pref medium After this patch, we get: $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 $ ip -6 route show table 20 2001:db8:cafe::1 via 2001:db8::6 dev eth0 metric 1024 pref medium default via 2001:db8::5 dev eth0 metric 1024 pref medium Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") Signed-off-by: Vincent Bernat Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 49817555449e..e3a224b97905 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1986,9 +1986,18 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; - if (cfg->fc_table) + if (cfg->fc_table) { grt = ip6_nh_lookup_table(net, cfg, gw_addr); + if (grt) { + if (grt->rt6i_flags & RTF_GATEWAY || + (dev && dev != grt->dst.dev)) { + ip6_rt_put(grt); + grt = NULL; + } + } + } + if (!grt) grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); From 3ed6e498b91a4dc5d0e8b6270a6c144061db2455 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 18 Sep 2016 21:17:19 +0200 Subject: [PATCH 443/538] MAINTAINERS: Add an entry for the core network DSA code The core distributed switch architecture code currently does not have a MAINTAINERS entry, which results in some contributions not landing in the right peoples inbox. Signed-off-by: Andrew Lunn Acked-by: Florian Fainelli Acked-by: Vivien Didelot Signed-off-by: David S. 
Miller --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a5e1270dfbf1..247b418959fa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8160,6 +8160,15 @@ S: Maintained W: https://fedorahosted.org/dropwatch/ F: net/core/drop_monitor.c +NETWORKING [DSA] +M: Andrew Lunn +M: Vivien Didelot +M: Florian Fainelli +S: Maintained +F: net/dsa/ +F: include/net/dsa.h +F: drivers/net/dsa/ + NETWORKING [GENERAL] M: "David S. Miller" L: netdev@vger.kernel.org From 67a99b7061c07b190ac6c39f136afedbb7aa86e9 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 19 Sep 2016 17:47:41 +0300 Subject: [PATCH 444/538] qed: Fix stack corruption on probe Commit fe56b9e6a8d95 ("qed: Add module with basic common support") has introduced a stack corruption during probe, where filling a local struct with data to be sent to management firmware is incorrectly filled; The data is written outside of the struct and corrupts the stack. Changes from v1: ---------------- - Correct the value written [Caught by David Laight] Fixes: fe56b9e6a8d95 ("qed: Add module with basic common support") Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index a240f26344a4..f776a77794c5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -1153,8 +1153,8 @@ qed_mcp_send_drv_version(struct qed_hwfn *p_hwfn, p_drv_version = &union_data.drv_version; p_drv_version->version = p_ver->version; - for (i = 0; i < MCP_DRV_VER_STR_SIZE - 1; i += 4) { - val = cpu_to_be32(p_ver->name[i]); + for (i = 0; i < (MCP_DRV_VER_STR_SIZE - 4) / sizeof(u32); i++) { + val = cpu_to_be32(*((u32 *)&p_ver->name[i * sizeof(u32)])); *(__be32 *)&p_drv_version->name[i * sizeof(u32)] = val; } From e2a738f7a88f32622684d972d654a9fed026555f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:55 +0200 Subject: [PATCH 445/538] blk/mq: Reserve hotplug states for block multiqueue This patch only reserves two CPU hotplug states for block/mq so the block tree can apply the conversion patches. Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Jens Axboe Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-20-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index dcfe619171b4..2ac07d01bdb5 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -14,6 +14,7 @@ enum cpuhp_state { CPUHP_PERF_SUPERH, CPUHP_X86_HPET_DEAD, CPUHP_X86_APB_DEAD, + CPUHP_BLK_MQ_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, @@ -22,6 +23,7 @@ enum cpuhp_state { CPUHP_SMPCFD_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_NOTIFY_PREPARE, + CPUHP_BLK_MQ_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, From e535ec0899d1fe52ec3a84c9bc03457ac67ad6f7 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 20 Sep 2016 14:26:21 +0100 Subject: [PATCH 446/538] x86/mm/pat: Prevent hang during boot when mapping pages There's a mixture of signed 32-bit and unsigned 32-bit and 64-bit data types used for keeping track of how many pages have been mapped. 
This leads to hangs during boot when mapping large numbers of pages (multiple terabytes, as reported by Waiman) because those values are interpreted as being negative. commit 742563777e8d ("x86/mm/pat: Avoid truncation when converting cpa->numpages to address") fixed one of those bugs, but there is another lurking in __change_page_attr_set_clr(). Additionally, the return value type for the populate_*() functions can return negative values when a large number of pages have been mapped, triggering the error paths even though no error occurred. Consistently use 64-bit types on 64-bit platforms when counting pages. Even in the signed case this gives us room for regions 8PiB (pebibytes) in size whilst still allowing the usual negative value error checking idiom. Reported-by: Waiman Long Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Linus Torvalds CC: Theodore Ts'o Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Scott J Norton Cc: Douglas Hatch Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 849dc09fa4f0..e3353c97d086 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -917,11 +917,11 @@ static void populate_pte(struct cpa_data *cpa, } } -static int populate_pmd(struct cpa_data *cpa, - unsigned long start, unsigned long end, - unsigned num_pages, pud_t *pud, pgprot_t pgprot) +static long populate_pmd(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pud_t *pud, pgprot_t pgprot) { - unsigned int cur_pages = 0; + long cur_pages = 0; pmd_t *pmd; pgprot_t pmd_pgprot; @@ -991,12 +991,12 @@ static int populate_pmd(struct cpa_data *cpa, return num_pages; } -static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, - pgprot_t pgprot) +static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, + pgprot_t pgprot) { pud_t *pud; unsigned long end; - int cur_pages = 0; + long cur_pages = 0; pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); @@ -1052,7 +1052,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* Map trailing leftover */ if (start < end) { - int tmp; + long tmp; pud = pud_offset(pgd, start); if (pud_none(*pud)) @@ -1078,7 +1078,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ pgd_t *pgd_entry; - int ret; + long ret; pgd_entry = cpa->pgd + pgd_index(addr); @@ -1327,7 +1327,8 @@ static int cpa_process_alias(struct cpa_data *cpa) static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) { - int ret, numpages = cpa->numpages; + unsigned long numpages = cpa->numpages; + int ret; while (numpages) { /* From 1297667083d5442aafe3e337b9413bf02b114edb Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 19 Sep 2016 13:09:09 +0100 Subject: [PATCH 447/538] x86/efi: Only map RAM into EFI page tables if in mixed-mode Waiman reported that booting with CONFIG_EFI_MIXED enabled on his multi-terabyte HP machine results in boot crashes, because the EFI region mapping functions loop forever while trying to map those regions describing RAM. While this patch doesn't fix the underlying hang, there's really no reason to map EFI_CONVENTIONAL_MEMORY regions into the EFI page tables when mixed-mode is not in use at runtime. 
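(As a back-of-the-envelope illustration of the overflow described in the
x86/mm/pat patch above, not part of either change: with 4 KiB pages, an
8 TiB mapping spans 2^31 pages, which no longer fits in a signed 32-bit
counter.)

	/* hypothetical illustration only, not kernel code */
	unsigned long long bytes = 8ULL << 40;	/* 8 TiB */
	int numpages = bytes >> 12;		/* PAGE_SHIFT == 12 on x86 */
	/*
	 * 2^31 does not fit in a signed int, so numpages wraps negative on
	 * common ABIs; "while (numpages)" loops and "< 0" error checks then
	 * misbehave, whereas a 64-bit long keeps the count positive.
	 */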
Reported-by: Waiman Long Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Linus Torvalds CC: Theodore Ts'o Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Scott J Norton Cc: Douglas Hatch Cc: # v4.6+ Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 677e29e29473..8dd3784eb075 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -245,7 +245,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * text and allocate a new stack because we can't rely on the * stack pointer being < 4GB. */ - if (!IS_ENABLED(CONFIG_EFI_MIXED)) + if (!IS_ENABLED(CONFIG_EFI_MIXED) || efi_is_native()) return 0; /* From 92dc33501bfba74655dbf3ec63ea82d040fd6d58 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 16 Sep 2016 15:12:47 +0100 Subject: [PATCH 448/538] x86/efi: Round EFI memmap reservations to EFI_PAGE_SIZE Mike Galbraith reported that his machine started rebooting during boot after, commit 8e80632fb23f ("efi/esrt: Use efi_mem_reserve() and avoid a kmalloc()") The ESRT table on his machine is 56 bytes and at no point in the efi_arch_mem_reserve() call path is that size rounded up to EFI_PAGE_SIZE, nor is the start address on an EFI_PAGE_SIZE boundary. Since the EFI memory map only deals with whole pages, inserting an EFI memory region with 56 bytes results in a new entry covering zero pages, and completely screws up the calculations for the old regions that were trimmed. Round all sizes upwards, and start addresses downwards, to the nearest EFI_PAGE_SIZE boundary. Additionally, efi_memmap_insert() expects the mem::range::end value to be one less than the end address for the region. Reported-by: Mike Galbraith Reported-by: Mike Krinkin Tested-by: Mike Krinkin Cc: Peter Jones Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Taku Izumi Signed-off-by: Matt Fleming --- arch/x86/platform/efi/quirks.c | 6 +++++- drivers/firmware/efi/memmap.c | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index f14b7a9da24b..10aca63a50d7 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -201,8 +201,12 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) return; } + size += addr % EFI_PAGE_SIZE; + size = round_up(size, EFI_PAGE_SIZE); + addr = round_down(addr, EFI_PAGE_SIZE); + mr.range.start = addr; - mr.range.end = addr + size; + mr.range.end = addr + size - 1; mr.attribute = md.attribute | EFI_MEMORY_RUNTIME; num_entries = efi_memmap_split_count(&md, &mr.range); diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c index cd96086fd851..f03ddecd232b 100644 --- a/drivers/firmware/efi/memmap.c +++ b/drivers/firmware/efi/memmap.c @@ -225,6 +225,17 @@ void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf, m_end = mem->range.end; m_attr = mem->attribute; + /* + * The EFI memory map deals with regions in EFI_PAGE_SIZE + * units. Ensure that the region described by 'mem' is aligned + * correctly. 
+ */ + if (!IS_ALIGNED(m_start, EFI_PAGE_SIZE) || + !IS_ALIGNED(m_end + 1, EFI_PAGE_SIZE)) { + WARN_ON(1); + return; + } + for (old = old_memmap->map, new = buf; old < old_memmap->map_end; old += old_memmap->desc_size, new += old_memmap->desc_size) { From 5372e054a1928fe704cf0a5e2e139645a777b50a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 20 Sep 2016 16:56:28 +0200 Subject: [PATCH 449/538] cpufreq: Fix up conversion to hotplug state machine The function cpufreq_register_driver() returns zero on success and since commit 27622b061eb4 ("cpufreq: Convert to hotplug state machine") erroneously a positive number. Due to the "if (x) assume_error" construct all callers assumed an error and as a consequence the cpu freq kworker crashes with a NULL pointer dereference. Reset the return value back to zero in the success case. Fixes: 27622b061eb4 ("cpufreq: Convert to hotplug state machine") Reported-by: Borislav Petkov Reported-and-tested-by: Ingo Molnar Signed-off-by: Sebastian Andrzej Siewior Cc: peterz@infradead.org Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/20160920145628.lp2bmq72ip3oiash@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/cpufreq/cpufreq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e0bc632a259e..8b44de4d7438 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2503,6 +2503,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) if (ret < 0) goto err_if_unreg; hp_online = ret; + ret = 0; pr_debug("driver %s up and running\n", driver_data->name); goto out; From f5beeb1851ea6f8cfcf2657f26cb24c0582b4945 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 8 Sep 2016 09:57:07 +0200 Subject: [PATCH 450/538] fs/proc/kcore.c: Make bounce buffer global for read Next patch adds bounce buffer for ktext area, so it's convenient to have single bounce buffer for both vmalloc/module and ktext cases. 
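The lifecycle of the shared buffer is the usual per-open private_data
pattern; stripped to its core (the surrounding CAP_SYS_RAWIO and size
checks are unchanged, names as in the hunks below):

	static int open_kcore(struct inode *inode, struct file *filp)
	{
		filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!filp->private_data)
			return -ENOMEM;
		return 0;
	}

	static int release_kcore(struct inode *inode, struct file *file)
	{
		kfree(file->private_data);
		return 0;
	}

read_kcore() then copies through file->private_data instead of allocating and
freeing a temporary buffer for every vmalloc/module chunk it reads.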
Suggested-by: Linus Torvalds Signed-off-by: Jiri Olsa Acked-by: Kees Cook Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index a939f5ed7f89..bd3ac9dca252 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) static ssize_t read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { + char *buf = file->private_data; ssize_t acc = 0; size_t size, tsz; size_t elf_buflen; @@ -500,18 +501,10 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (clear_user(buffer, tsz)) return -EFAULT; } else if (is_vmalloc_or_module_addr((void *)start)) { - char * elf_buf; - - elf_buf = kzalloc(tsz, GFP_KERNEL); - if (!elf_buf) - return -ENOMEM; - vread(elf_buf, (char *)start, tsz); + vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, elf_buf, tsz)) { - kfree(elf_buf); + if (copy_to_user(buffer, buf, tsz)) return -EFAULT; - } - kfree(elf_buf); } else { if (kern_addr_valid(start)) { unsigned long n; @@ -549,6 +542,11 @@ static int open_kcore(struct inode *inode, struct file *filp) { if (!capable(CAP_SYS_RAWIO)) return -EPERM; + + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!filp->private_data) + return -ENOMEM; + if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { @@ -559,10 +557,16 @@ static int open_kcore(struct inode *inode, struct file *filp) return 0; } +static int release_kcore(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, + .release = release_kcore, .llseek = default_llseek, }; From df04abfd181acc276ba6762c8206891ae10ae00d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 8 Sep 2016 09:57:08 +0200 Subject: [PATCH 451/538] fs/proc/kcore.c: Add bounce buffer for ktext data We hit hardened usercopy feature check for kernel text access by reading kcore file: usercopy: kernel memory exposure attempt detected from ffffffff8179a01f () (4065 bytes) kernel BUG at mm/usercopy.c:75! Bypassing this check for kcore by adding bounce buffer for ktext data. Reported-by: Steve Best Fixes: f5509cc18daa ("mm: Hardened usercopy") Suggested-by: Kees Cook Signed-off-by: Jiri Olsa Acked-by: Kees Cook Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index bd3ac9dca252..5c89a07e3d7f 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -509,7 +509,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (kern_addr_valid(start)) { unsigned long n; - n = copy_to_user(buffer, (char *)start, tsz); + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + memcpy(buf, (char *) start, tsz); + n = copy_to_user(buffer, buf, tsz); /* * We cannot distinguish between fault on source * and fault on destination. 
When this happens From e875bd66dfb68f4e898e9a43ef42858c504a7f23 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 13 Sep 2016 17:53:35 +0100 Subject: [PATCH 452/538] irqchip/mips-gic: Fix local interrupts Since the device hierarchy domain was added by commit c98c1822ee13 ("irqchip/mips-gic: Add device hierarchy domain"), GIC local interrupts have been broken. Users attempting to setup a per-cpu local IRQ, for example the GIC timer clock events code in drivers/clocksource/mips-gic-timer.c, the setup_percpu_irq function would refuse with -EINVAL because the GIC irqchip driver never called irq_set_percpu_devid so the IRQ_PER_CPU_DEVID flag was never set for the IRQ. This happens because irq_set_percpu_devid was being called from the gic_irq_domain_map function which is no longer called. Doing only that runs into further problems because gic_dev_domain_alloc set the struct irq_chip for all interrupts, local or shared, to gic_level_irq_controller despite that only being suitable for shared interrupts. The typical outcome of this is that gic_level_irq_controller callback functions are called for local interrupts, and then hwirq number calculations overflow & the driver ends up attempting to access some invalid register with an address calculated from an invalid hwirq number. Best case scenario is that this then leads to a bus error. This is fixed by abstracting the setup of the hwirq & chip to a new function gic_setup_dev_chip which is used by both the root GIC IRQ domain & the device domain. Finally, decoding local interrupts failed because gic_dev_domain_alloc only called irq_domain_alloc_irqs_parent for shared interrupts. Local ones were therefore never associated with hwirqs in the root GIC IRQ domain and the virq in gic_handle_local_int would always be 0. This is fixed by calling irq_domain_alloc_irqs_parent unconditionally & having gic_irq_domain_alloc handle both local & shared interrupts, which is easy due to the aforementioned abstraction of chip setup into gic_setup_dev_chip. This fixes use of the MIPS GIC timer for clock events, which has been broken since c98c1822ee13 ("irqchip/mips-gic: Add device hierarchy domain") but hadn't been noticed due to a silent fallback to the MIPS coprocessor 0 count/compare clock events device. Fixes: c98c1822ee13 ("irqchip/mips-gic: Add device hierarchy domain") Signed-off-by: Paul Burton Cc: linux-mips@linux-mips.org Cc: Jason Cooper Cc: Qais Yousef Cc: stable@vger.kernel.org Cc: Marc Zyngier Link: http://lkml.kernel.org/r/20160913165335.31389-1-paul.burton@imgtec.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-mips-gic.c | 105 ++++++++++++++++----------------- 1 file changed, 50 insertions(+), 55 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 83f498393a7f..6185696405d5 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -638,27 +638,6 @@ static int gic_local_irq_domain_map(struct irq_domain *d, unsigned int virq, if (!gic_local_irq_is_routable(intr)) return -EPERM; - /* - * HACK: These are all really percpu interrupts, but the rest - * of the MIPS kernel code does not use the percpu IRQ API for - * the CP0 timer and performance counter interrupts. 
- */ - switch (intr) { - case GIC_LOCAL_INT_TIMER: - case GIC_LOCAL_INT_PERFCTR: - case GIC_LOCAL_INT_FDC: - irq_set_chip_and_handler(virq, - &gic_all_vpes_local_irq_controller, - handle_percpu_irq); - break; - default: - irq_set_chip_and_handler(virq, - &gic_local_irq_controller, - handle_percpu_devid_irq); - irq_set_percpu_devid(virq); - break; - } - spin_lock_irqsave(&gic_lock, flags); for (i = 0; i < gic_vpes; i++) { u32 val = GIC_MAP_TO_PIN_MSK | gic_cpu_pin; @@ -724,16 +703,42 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, return 0; } -static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, - irq_hw_number_t hw) +static int gic_setup_dev_chip(struct irq_domain *d, unsigned int virq, + unsigned int hwirq) { - if (GIC_HWIRQ_TO_LOCAL(hw) < GIC_NUM_LOCAL_INTRS) - return gic_local_irq_domain_map(d, virq, hw); + struct irq_chip *chip; + int err; + + if (hwirq >= GIC_SHARED_HWIRQ_BASE) { + err = irq_domain_set_hwirq_and_chip(d, virq, hwirq, + &gic_level_irq_controller, + NULL); + } else { + switch (GIC_HWIRQ_TO_LOCAL(hwirq)) { + case GIC_LOCAL_INT_TIMER: + case GIC_LOCAL_INT_PERFCTR: + case GIC_LOCAL_INT_FDC: + /* + * HACK: These are all really percpu interrupts, but + * the rest of the MIPS kernel code does not use the + * percpu IRQ API for them. + */ + chip = &gic_all_vpes_local_irq_controller; + irq_set_handler(virq, handle_percpu_irq); + break; + + default: + chip = &gic_local_irq_controller; + irq_set_handler(virq, handle_percpu_devid_irq); + irq_set_percpu_devid(virq); + break; + } - irq_set_chip_and_handler(virq, &gic_level_irq_controller, - handle_level_irq); + err = irq_domain_set_hwirq_and_chip(d, virq, hwirq, + chip, NULL); + } - return gic_shared_irq_domain_map(d, virq, hw, 0); + return err; } static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq, @@ -744,15 +749,12 @@ static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq, int cpu, ret, i; if (spec->type == GIC_DEVICE) { - /* verify that it doesn't conflict with an IPI irq */ - if (test_bit(spec->hwirq, ipi_resrv)) + /* verify that shared irqs don't conflict with an IPI irq */ + if ((spec->hwirq >= GIC_SHARED_HWIRQ_BASE) && + test_bit(GIC_HWIRQ_TO_SHARED(spec->hwirq), ipi_resrv)) return -EBUSY; - hwirq = GIC_SHARED_TO_HWIRQ(spec->hwirq); - - return irq_domain_set_hwirq_and_chip(d, virq, hwirq, - &gic_level_irq_controller, - NULL); + return gic_setup_dev_chip(d, virq, spec->hwirq); } else { base_hwirq = find_first_bit(ipi_resrv, gic_shared_intrs); if (base_hwirq == gic_shared_intrs) { @@ -821,7 +823,6 @@ int gic_irq_domain_match(struct irq_domain *d, struct device_node *node, } static const struct irq_domain_ops gic_irq_domain_ops = { - .map = gic_irq_domain_map, .alloc = gic_irq_domain_alloc, .free = gic_irq_domain_free, .match = gic_irq_domain_match, @@ -852,29 +853,20 @@ static int gic_dev_domain_alloc(struct irq_domain *d, unsigned int virq, struct irq_fwspec *fwspec = arg; struct gic_irq_spec spec = { .type = GIC_DEVICE, - .hwirq = fwspec->param[1], }; int i, ret; - bool is_shared = fwspec->param[0] == GIC_SHARED; - if (is_shared) { - ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec); - if (ret) - return ret; - } - - for (i = 0; i < nr_irqs; i++) { - irq_hw_number_t hwirq; + if (fwspec->param[0] == GIC_SHARED) + spec.hwirq = GIC_SHARED_TO_HWIRQ(fwspec->param[1]); + else + spec.hwirq = GIC_LOCAL_TO_HWIRQ(fwspec->param[1]); - if (is_shared) - hwirq = GIC_SHARED_TO_HWIRQ(spec.hwirq + i); - else - hwirq = GIC_LOCAL_TO_HWIRQ(spec.hwirq 
+ i); + ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec); + if (ret) + return ret; - ret = irq_domain_set_hwirq_and_chip(d, virq + i, - hwirq, - &gic_level_irq_controller, - NULL); + for (i = 0; i < nr_irqs; i++) { + ret = gic_setup_dev_chip(d, virq + i, spec.hwirq + i); if (ret) goto error; } @@ -896,7 +888,10 @@ void gic_dev_domain_free(struct irq_domain *d, unsigned int virq, static void gic_dev_domain_activate(struct irq_domain *domain, struct irq_data *d) { - gic_shared_irq_domain_map(domain, d->irq, d->hwirq, 0); + if (GIC_HWIRQ_TO_LOCAL(d->hwirq) < GIC_NUM_LOCAL_INTRS) + gic_local_irq_domain_map(domain, d->irq, d->hwirq); + else + gic_shared_irq_domain_map(domain, d->irq, d->hwirq, 0); } static struct irq_domain_ops gic_dev_domain_ops = { From 0f4ed1580ce6c9499eba2a1ba013759700a5ed14 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 13 Sep 2016 17:54:27 +0100 Subject: [PATCH 453/538] irqchip/mips-gic: Use for_each_set_bit to iterate over local IRQs The MIPS GIC driver has previously iterated over bits set in a bitmap representing pending local IRQs by calling find_first_bit, clearing that bit then calling find_first_bit again until all bits are clear. If multiple interrupts are pending then this is wasteful, as find_first_bit will have to loop over the whole bitmap from the start. Use the for_each_set_bit macro which performs exactly what we need here instead. It will use find_next_bit and thus only scan over the relevant part of the bitmap, and it makes the intent of the code clearer. This makes the same change for local interrupts that commit cae750bae4e4 ("irqchip: mips-gic: Use for_each_set_bit to iterate over IRQs") made for shared interrupts. Signed-off-by: Paul Burton Cc: Marc Zyngier Cc: linux-mips@linux-mips.org Cc: Jason Cooper Link: http://lkml.kernel.org/r/20160913165427.31686-1-paul.burton@imgtec.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-mips-gic.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 6185696405d5..8f7d38ba24c6 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -518,18 +518,13 @@ static void gic_handle_local_int(bool chained) bitmap_and(&pending, &pending, &masked, GIC_NUM_LOCAL_INTRS); - intr = find_first_bit(&pending, GIC_NUM_LOCAL_INTRS); - while (intr != GIC_NUM_LOCAL_INTRS) { + for_each_set_bit(intr, &pending, GIC_NUM_LOCAL_INTRS) { virq = irq_linear_revmap(gic_irq_domain, GIC_LOCAL_TO_HWIRQ(intr)); if (chained) generic_handle_irq(virq); else do_IRQ(virq); - - /* go to next pending bit */ - bitmap_clear(&pending, intr, 1); - intr = find_first_bit(&pending, GIC_NUM_LOCAL_INTRS); } } From 2fd0c93cd219779eef4b1301f9613e43adc86e39 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 13 Sep 2016 17:56:43 +0100 Subject: [PATCH 454/538] clocksource/mips-gic-timer: Print an error if IRQ setup fails We've checked for errors from setup_irq_percpu since commit f95ac8558b88 ("CLOCKSOURCE: mips-gic: Add missing error returns checks") but didn't print an error message in the failure case. This makes it very easy to overlook the GIC timer clock event driver not being registered, since we'll generally just use a different clock event driver if that happens. Print an error if IRQ setup fails in order to make such problems harder to miss (ie. not completely silent). 
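As background for the setup_percpu_irq() requirement that the GIC fixes above revolve around, here is a minimal, illustrative sketch (not part of any patch in this series; the demo_* names are invented) of how a per-cpu timer interrupt is registered. The call only succeeds once the irqchip driver has marked the descriptor with irq_set_percpu_devid(), which is what the local-interrupt fix restores for GIC local IRQs:

    #include <linux/interrupt.h>
    #include <linux/irq.h>
    #include <linux/percpu.h>
    #include <linux/printk.h>

    /* Per-cpu device data; handle_percpu_devid_irq() hands the handler
     * this cpu's instance of it as dev_id. */
    static DEFINE_PER_CPU(int, demo_device);

    static irqreturn_t demo_timer_interrupt(int irq, void *dev_id)
    {
            return IRQ_HANDLED;
    }

    static struct irqaction demo_irqaction = {
            .handler       = demo_timer_interrupt,
            .percpu_dev_id = &demo_device,
            .flags         = IRQF_PERCPU | IRQF_TIMER,
            .name          = "demo timer",
    };

    static int demo_register_timer_irq(unsigned int virq)
    {
            int ret = setup_percpu_irq(virq, &demo_irqaction);

            /* -EINVAL here means the descriptor lacks IRQ_PER_CPU_DEVID */
            if (ret < 0)
                    pr_err("demo timer IRQ %u setup failed: %d\n", virq, ret);
            return ret;
    }
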
Signed-off-by: Paul Burton Cc: linux-mips@linux-mips.org Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/20160913165644.627-1-paul.burton@imgtec.com Signed-off-by: Thomas Gleixner --- drivers/clocksource/mips-gic-timer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c index b4b3ab5a11ad..802055ba1cbf 100644 --- a/drivers/clocksource/mips-gic-timer.c +++ b/drivers/clocksource/mips-gic-timer.c @@ -113,8 +113,11 @@ static int gic_clockevent_init(void) return -ENXIO; ret = setup_percpu_irq(gic_timer_irq, &gic_compare_irqaction); - if (ret < 0) + if (ret < 0) { + pr_err("GIC timer IRQ %d setup failed: %d\n", + gic_timer_irq, ret); return ret; + } cpuhp_setup_state(CPUHP_AP_MIPS_GIC_TIMER_STARTING, "AP_MIPS_GIC_TIMER_STARTING", gic_starting_cpu, From 6982530eab096939f9c5b607a5ce8078df19737e Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 13 Sep 2016 17:56:44 +0100 Subject: [PATCH 455/538] clocksource/mips-gic-timer: Stop checking cpu_has_counter The cpu_has_counter macro indicates whether the current CPU has a working coprocessor 0 count & compare registers, and has no bearing on the GIC. Stop checking it. Signed-off-by: Paul Burton Cc: linux-mips@linux-mips.org Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/20160913165644.627-2-paul.burton@imgtec.com Signed-off-by: Thomas Gleixner --- drivers/clocksource/mips-gic-timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c index 802055ba1cbf..7a960cd01104 100644 --- a/drivers/clocksource/mips-gic-timer.c +++ b/drivers/clocksource/mips-gic-timer.c @@ -109,7 +109,7 @@ static int gic_clockevent_init(void) { int ret; - if (!cpu_has_counter || !gic_frequency) + if (!gic_frequency) return -ENXIO; ret = setup_percpu_irq(gic_timer_irq, &gic_compare_irqaction); From 71f5443ebb1227c22e8decbcd28a1ea6deaf8257 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 20 Sep 2016 10:53:40 -0500 Subject: [PATCH 456/538] x86/dumpstack: Fix show_stack() task pointer regression With the following commit: e18bcccd1a4e ("x86/dumpstack: Convert show_trace_log_lvl() to use the new unwinder") The task pointer argument to show_stack_log_lvl() in show_stack() was inadvertently changed to 'current'. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: byungchul.park@lge.com Cc: fweisbec@gmail.com Cc: keescook@chromium.org Cc: linux-tip-commits@vger.kernel.org Cc: luto@amacapital.net Cc: nilayvaish@gmail.com Cc: rostedt@goodmis.org Cc: tip-bot for Josh Poimboeuf Fixes: e18bcccd1a4e ("x86/dumpstack: Convert show_trace_log_lvl() to use the new unwinder") Link: http://lkml.kernel.org/r/20160920155340.yhewlx7vmgmov5fb@treble Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 999de3b3f7f4..9b7cf5c28f5f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -164,7 +164,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(current, NULL, sp, ""); + show_stack_log_lvl(task, NULL, sp, ""); } void show_stack_regs(struct pt_regs *regs) From aa4f0601115319a52c80f468c8f007e5aa9277cb Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 20 Sep 2016 08:56:36 -0700 Subject: [PATCH 457/538] mm: usercopy: Check for module addresses While running a compile on arm64, I hit a memory exposure usercopy: kernel memory exposure attempt detected from fffffc0000f3b1a8 (buffer_head) (1 bytes) ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:75! Internal error: Oops - BUG: 0 [#1] SMP Modules linked in: ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_broute bridge stp llc ebtable_nat ip6table_security ip6table_raw ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle iptable_security iptable_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle ebtable_filter ebtables ip6table_filter ip6_tables vfat fat xgene_edac xgene_enet edac_core i2c_xgene_slimpro i2c_core at803x realtek xgene_dma mdio_xgene gpio_dwapb gpio_xgene_sb xgene_rng mailbox_xgene_slimpro nfsd auth_rpcgss nfs_acl lockd grace sunrpc xfs libcrc32c sdhci_of_arasan sdhci_pltfm sdhci mmc_core xhci_plat_hcd gpio_keys CPU: 0 PID: 19744 Comm: updatedb Tainted: G W 4.8.0-rc3-threadinfo+ #1 Hardware name: AppliedMicro X-Gene Mustang Board/X-Gene Mustang Board, BIOS 3.06.12 Aug 12 2016 task: fffffe03df944c00 task.stack: fffffe00d128c000 PC is at __check_object_size+0x70/0x3f0 LR is at __check_object_size+0x70/0x3f0 ... [] __check_object_size+0x70/0x3f0 [] filldir64+0x158/0x1a0 [] __fat_readdir+0x4a0/0x558 [fat] [] fat_readdir+0x34/0x40 [fat] [] iterate_dir+0x190/0x1e0 [] SyS_getdents64+0x88/0x120 [] el0_svc_naked+0x24/0x28 fffffc0000f3b1a8 is a module address. Modules may have compiled in strings which could get copied to userspace. In this instance, it looks like "." which matches with a size of 1 byte. Extend the is_vmalloc_addr check to be is_vmalloc_or_module_addr to cover all possible cases. Signed-off-by: Laura Abbott Signed-off-by: Kees Cook --- mm/usercopy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index 089328f2b920..3c8da0af9695 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -207,8 +207,11 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, * Some architectures (arm64) return true for virt_addr_valid() on * vmalloced addresses. Work around this by checking for vmalloc * first. 
+ * + * We also need to check for module addresses explicitly since we + * may copy static data from modules to userspace */ - if (is_vmalloc_addr(ptr)) + if (is_vmalloc_or_module_addr(ptr)) return NULL; if (!virt_addr_valid(ptr)) From e23d4159b109167126e5bcd7f3775c95de7fee47 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 20 Sep 2016 20:07:42 +0100 Subject: [PATCH 458/538] fix fault_in_multipages_...() on architectures with no-op access_ok() Switching iov_iter fault-in to multipages variants has exposed an old bug in underlying fault_in_multipages_...(); they break if the range passed to them wraps around. Normally access_ok() done by callers will prevent such (and it's a guaranteed EFAULT - ERR_PTR() values fall into such a range and they should not point to any valid objects). However, on architectures where userland and kernel live in different MMU contexts (e.g. s390) access_ok() is a no-op and on those a range with a wraparound can reach fault_in_multipages_...(). Since any wraparound means EFAULT there, the fix is trivial - turn those while (uaddr <= end) ... into if (unlikely(uaddr > end)) return -EFAULT; do ... while (uaddr <= end); Reported-by: Jan Stancek Tested-by: Jan Stancek Cc: stable@vger.kernel.org # v3.5+ Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 66a1260b33de..7e3d53753612 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -571,56 +571,56 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size) */ static inline int fault_in_multipages_writeable(char __user *uaddr, int size) { - int ret = 0; char __user *end = uaddr + size - 1; if (unlikely(size == 0)) - return ret; + return 0; + if (unlikely(uaddr > end)) + return -EFAULT; /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. */ - while (uaddr <= end) { - ret = __put_user(0, uaddr); - if (ret != 0) - return ret; + do { + if (unlikely(__put_user(0, uaddr) != 0)) + return -EFAULT; uaddr += PAGE_SIZE; - } + } while (uaddr <= end); /* Check whether the range spilled into the next page. */ if (((unsigned long)uaddr & PAGE_MASK) == ((unsigned long)end & PAGE_MASK)) - ret = __put_user(0, end); + return __put_user(0, end); - return ret; + return 0; } static inline int fault_in_multipages_readable(const char __user *uaddr, int size) { volatile char c; - int ret = 0; const char __user *end = uaddr + size - 1; if (unlikely(size == 0)) - return ret; + return 0; - while (uaddr <= end) { - ret = __get_user(c, uaddr); - if (ret != 0) - return ret; + if (unlikely(uaddr > end)) + return -EFAULT; + + do { + if (unlikely(__get_user(c, uaddr) != 0)) + return -EFAULT; uaddr += PAGE_SIZE; - } + } while (uaddr <= end); /* Check whether the range spilled into the next page. 
*/ if (((unsigned long)uaddr & PAGE_MASK) == ((unsigned long)end & PAGE_MASK)) { - ret = __get_user(c, end); - (void)c; + return __get_user(c, end); } - return ret; + return 0; } int add_to_page_cache_locked(struct page *page, struct address_space *mapping, From 9b86a8d19bd6406a10de5f924bf2a003a502d427 Mon Sep 17 00:00:00 2001 From: Hariprasad Shenai Date: Tue, 20 Sep 2016 12:00:52 +0530 Subject: [PATCH 459/538] cxgb4/cxgb4vf: Allocate more queues for 25G and 100G adapter We were missing check for 25G and 100G while checking port speed, which lead to less number of queues getting allocated for 25G & 100G adapters and leading to low throughput. Adding the missing check for both NIC and vNIC driver. Also fixes port advertisement for 25G and 100G in ethtool output. Signed-off-by: Hariprasad Shenai Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 4 ++-- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 15 +++++++++++++-- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 7 ++++++- drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 6 ++++++ .../net/ethernet/chelsio/cxgb4vf/t4vf_common.h | 15 +++++++++++---- drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c | 9 +++++++-- 6 files changed, 45 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 2e2aa9fec9bb..edd23386b47d 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -419,8 +419,8 @@ struct link_config { unsigned short supported; /* link capabilities */ unsigned short advertising; /* advertised capabilities */ unsigned short lp_advertising; /* peer advertised capabilities */ - unsigned short requested_speed; /* speed user has requested */ - unsigned short speed; /* actual link speed */ + unsigned int requested_speed; /* speed user has requested */ + unsigned int speed; /* actual link speed */ unsigned char requested_fc; /* flow control user has requested */ unsigned char fc; /* actual link flow control */ unsigned char autoneg; /* autonegotiating? */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index c762a8c8c954..3ceafb55d6da 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -4305,10 +4305,17 @@ static const struct pci_error_handlers cxgb4_eeh = { .resume = eeh_resume, }; +/* Return true if the Link Configuration supports "High Speeds" (those greater + * than 1Gb/s). 
+ */ static inline bool is_x_10g_port(const struct link_config *lc) { - return (lc->supported & FW_PORT_CAP_SPEED_10G) != 0 || - (lc->supported & FW_PORT_CAP_SPEED_40G) != 0; + unsigned int speeds, high_speeds; + + speeds = FW_PORT_CAP_SPEED_V(FW_PORT_CAP_SPEED_G(lc->supported)); + high_speeds = speeds & ~(FW_PORT_CAP_SPEED_100M | FW_PORT_CAP_SPEED_1G); + + return high_speeds != 0; } static inline void init_rspq(struct adapter *adap, struct sge_rspq *q, @@ -4756,8 +4763,12 @@ static void print_port_info(const struct net_device *dev) bufp += sprintf(bufp, "1000/"); if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) bufp += sprintf(bufp, "10G/"); + if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_25G) + bufp += sprintf(bufp, "25G/"); if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_40G) bufp += sprintf(bufp, "40G/"); + if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_100G) + bufp += sprintf(bufp, "100G/"); if (bufp != buf) --bufp; sprintf(bufp, "BASE-%s", t4_get_port_type_description(pi->port_type)); diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index dc92c80a75f4..660204bff726 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -3627,7 +3627,8 @@ void t4_ulprx_read_la(struct adapter *adap, u32 *la_buf) } #define ADVERT_MASK (FW_PORT_CAP_SPEED_100M | FW_PORT_CAP_SPEED_1G |\ - FW_PORT_CAP_SPEED_10G | FW_PORT_CAP_SPEED_40G | \ + FW_PORT_CAP_SPEED_10G | FW_PORT_CAP_SPEED_25G | \ + FW_PORT_CAP_SPEED_40G | FW_PORT_CAP_SPEED_100G | \ FW_PORT_CAP_ANEG) /** @@ -7196,8 +7197,12 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl) speed = 1000; else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_10G)) speed = 10000; + else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_25G)) + speed = 25000; else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_40G)) speed = 40000; + else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100G)) + speed = 100000; lc = &pi->link_cfg; diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h index a89b30720e38..30507d44422c 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h @@ -2265,6 +2265,12 @@ enum fw_port_cap { FW_PORT_CAP_802_3_ASM_DIR = 0x8000, }; +#define FW_PORT_CAP_SPEED_S 0 +#define FW_PORT_CAP_SPEED_M 0x3f +#define FW_PORT_CAP_SPEED_V(x) ((x) << FW_PORT_CAP_SPEED_S) +#define FW_PORT_CAP_SPEED_G(x) \ + (((x) >> FW_PORT_CAP_SPEED_S) & FW_PORT_CAP_SPEED_M) + enum fw_port_mdi { FW_PORT_CAP_MDI_UNCHANGED, FW_PORT_CAP_MDI_AUTO, diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h index 8ee541431e8b..17a2bbcf93f0 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h +++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h @@ -108,8 +108,8 @@ struct link_config { unsigned int supported; /* link capabilities */ unsigned int advertising; /* advertised capabilities */ unsigned short lp_advertising; /* peer advertised capabilities */ - unsigned short requested_speed; /* speed user has requested */ - unsigned short speed; /* actual link speed */ + unsigned int requested_speed; /* speed user has requested */ + unsigned int speed; /* actual link speed */ unsigned char requested_fc; /* flow control user has requested */ unsigned char fc; /* actual link flow control */ unsigned char autoneg; /* autonegotiating? 
*/ @@ -271,10 +271,17 @@ static inline bool is_10g_port(const struct link_config *lc) return (lc->supported & FW_PORT_CAP_SPEED_10G) != 0; } +/* Return true if the Link Configuration supports "High Speeds" (those greater + * than 1Gb/s). + */ static inline bool is_x_10g_port(const struct link_config *lc) { - return (lc->supported & FW_PORT_CAP_SPEED_10G) != 0 || - (lc->supported & FW_PORT_CAP_SPEED_40G) != 0; + unsigned int speeds, high_speeds; + + speeds = FW_PORT_CAP_SPEED_V(FW_PORT_CAP_SPEED_G(lc->supported)); + high_speeds = speeds & ~(FW_PORT_CAP_SPEED_100M | FW_PORT_CAP_SPEED_1G); + + return high_speeds != 0; } static inline unsigned int core_ticks_per_usec(const struct adapter *adapter) diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c index 427bfa71388b..b5622b1689e9 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c @@ -314,8 +314,9 @@ int t4vf_wr_mbox_core(struct adapter *adapter, const void *cmd, int size, } #define ADVERT_MASK (FW_PORT_CAP_SPEED_100M | FW_PORT_CAP_SPEED_1G |\ - FW_PORT_CAP_SPEED_10G | FW_PORT_CAP_SPEED_40G | \ - FW_PORT_CAP_SPEED_100G | FW_PORT_CAP_ANEG) + FW_PORT_CAP_SPEED_10G | FW_PORT_CAP_SPEED_25G | \ + FW_PORT_CAP_SPEED_40G | FW_PORT_CAP_SPEED_100G | \ + FW_PORT_CAP_ANEG) /** * init_link_config - initialize a link's SW state @@ -1712,8 +1713,12 @@ int t4vf_handle_fw_rpl(struct adapter *adapter, const __be64 *rpl) speed = 1000; else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_10G)) speed = 10000; + else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_25G)) + speed = 25000; else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_40G)) speed = 40000; + else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100G)) + speed = 100000; /* * Scan all of our "ports" (Virtual Interfaces) looking for From e6449539828ac3b7c74b648793291640bcca8259 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Tue, 20 Sep 2016 16:22:05 +0800 Subject: [PATCH 460/538] r8152: move some functions Move the following functions forward. r8152_mmd_indirect() r8152_mmd_read() r8152_mmd_write() r8152_eee_en() r8152b_enable_eee() r8153_eee_en() r8153_enable_eee() r8152b_enable_fc() r8153_aldps_en() Signed-off-by: Hayes Wang Signed-off-by: David S. 
Miller --- drivers/net/usb/r8152.c | 224 ++++++++++++++++++++-------------------- 1 file changed, 112 insertions(+), 112 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index f41a8ad4740e..ae7db460baa2 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -2552,6 +2552,77 @@ static void r8152_aldps_en(struct r8152 *tp, bool enable) } } +static inline void r8152_mmd_indirect(struct r8152 *tp, u16 dev, u16 reg) +{ + ocp_reg_write(tp, OCP_EEE_AR, FUN_ADDR | dev); + ocp_reg_write(tp, OCP_EEE_DATA, reg); + ocp_reg_write(tp, OCP_EEE_AR, FUN_DATA | dev); +} + +static u16 r8152_mmd_read(struct r8152 *tp, u16 dev, u16 reg) +{ + u16 data; + + r8152_mmd_indirect(tp, dev, reg); + data = ocp_reg_read(tp, OCP_EEE_DATA); + ocp_reg_write(tp, OCP_EEE_AR, 0x0000); + + return data; +} + +static void r8152_mmd_write(struct r8152 *tp, u16 dev, u16 reg, u16 data) +{ + r8152_mmd_indirect(tp, dev, reg); + ocp_reg_write(tp, OCP_EEE_DATA, data); + ocp_reg_write(tp, OCP_EEE_AR, 0x0000); +} + +static void r8152_eee_en(struct r8152 *tp, bool enable) +{ + u16 config1, config2, config3; + u32 ocp_data; + + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EEE_CR); + config1 = ocp_reg_read(tp, OCP_EEE_CONFIG1) & ~sd_rise_time_mask; + config2 = ocp_reg_read(tp, OCP_EEE_CONFIG2); + config3 = ocp_reg_read(tp, OCP_EEE_CONFIG3) & ~fast_snr_mask; + + if (enable) { + ocp_data |= EEE_RX_EN | EEE_TX_EN; + config1 |= EEE_10_CAP | EEE_NWAY_EN | TX_QUIET_EN | RX_QUIET_EN; + config1 |= sd_rise_time(1); + config2 |= RG_DACQUIET_EN | RG_LDVQUIET_EN; + config3 |= fast_snr(42); + } else { + ocp_data &= ~(EEE_RX_EN | EEE_TX_EN); + config1 &= ~(EEE_10_CAP | EEE_NWAY_EN | TX_QUIET_EN | + RX_QUIET_EN); + config1 |= sd_rise_time(7); + config2 &= ~(RG_DACQUIET_EN | RG_LDVQUIET_EN); + config3 |= fast_snr(511); + } + + ocp_write_word(tp, MCU_TYPE_PLA, PLA_EEE_CR, ocp_data); + ocp_reg_write(tp, OCP_EEE_CONFIG1, config1); + ocp_reg_write(tp, OCP_EEE_CONFIG2, config2); + ocp_reg_write(tp, OCP_EEE_CONFIG3, config3); +} + +static void r8152b_enable_eee(struct r8152 *tp) +{ + r8152_eee_en(tp, true); + r8152_mmd_write(tp, MDIO_MMD_AN, MDIO_AN_EEE_ADV, MDIO_EEE_100TX); +} + +static void r8152b_enable_fc(struct r8152 *tp) +{ + u16 anar; + + anar = r8152_mdio_read(tp, MII_ADVERTISE); + anar |= ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM; + r8152_mdio_write(tp, MII_ADVERTISE, anar); +} + static void rtl8152_disable(struct r8152 *tp) { r8152_aldps_en(tp, false); @@ -2701,6 +2772,47 @@ static void r8152b_enter_oob(struct r8152 *tp) ocp_write_dword(tp, MCU_TYPE_PLA, PLA_RCR, ocp_data); } +static void r8153_aldps_en(struct r8152 *tp, bool enable) +{ + u16 data; + + data = ocp_reg_read(tp, OCP_POWER_CFG); + if (enable) { + data |= EN_ALDPS; + ocp_reg_write(tp, OCP_POWER_CFG, data); + } else { + data &= ~EN_ALDPS; + ocp_reg_write(tp, OCP_POWER_CFG, data); + msleep(20); + } +} + +static void r8153_eee_en(struct r8152 *tp, bool enable) +{ + u32 ocp_data; + u16 config; + + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EEE_CR); + config = ocp_reg_read(tp, OCP_EEE_CFG); + + if (enable) { + ocp_data |= EEE_RX_EN | EEE_TX_EN; + config |= EEE10_EN; + } else { + ocp_data &= ~(EEE_RX_EN | EEE_TX_EN); + config &= ~EEE10_EN; + } + + ocp_write_word(tp, MCU_TYPE_PLA, PLA_EEE_CR, ocp_data); + ocp_reg_write(tp, OCP_EEE_CFG, config); +} + +static void r8153_enable_eee(struct r8152 *tp) +{ + r8153_eee_en(tp, true); + ocp_reg_write(tp, OCP_EEE_ADV, MDIO_EEE_1000T | MDIO_EEE_100TX); +} + static void r8153_hw_phy_cfg(struct r8152 *tp) { u32 
ocp_data; @@ -2866,21 +2978,6 @@ static void r8153_enter_oob(struct r8152 *tp) ocp_write_dword(tp, MCU_TYPE_PLA, PLA_RCR, ocp_data); } -static void r8153_aldps_en(struct r8152 *tp, bool enable) -{ - u16 data; - - data = ocp_reg_read(tp, OCP_POWER_CFG); - if (enable) { - data |= EN_ALDPS; - ocp_reg_write(tp, OCP_POWER_CFG, data); - } else { - data &= ~EN_ALDPS; - ocp_reg_write(tp, OCP_POWER_CFG, data); - msleep(20); - } -} - static void rtl8153_disable(struct r8152 *tp) { r8153_aldps_en(tp, false); @@ -3246,103 +3343,6 @@ static int rtl8152_close(struct net_device *netdev) return res; } -static inline void r8152_mmd_indirect(struct r8152 *tp, u16 dev, u16 reg) -{ - ocp_reg_write(tp, OCP_EEE_AR, FUN_ADDR | dev); - ocp_reg_write(tp, OCP_EEE_DATA, reg); - ocp_reg_write(tp, OCP_EEE_AR, FUN_DATA | dev); -} - -static u16 r8152_mmd_read(struct r8152 *tp, u16 dev, u16 reg) -{ - u16 data; - - r8152_mmd_indirect(tp, dev, reg); - data = ocp_reg_read(tp, OCP_EEE_DATA); - ocp_reg_write(tp, OCP_EEE_AR, 0x0000); - - return data; -} - -static void r8152_mmd_write(struct r8152 *tp, u16 dev, u16 reg, u16 data) -{ - r8152_mmd_indirect(tp, dev, reg); - ocp_reg_write(tp, OCP_EEE_DATA, data); - ocp_reg_write(tp, OCP_EEE_AR, 0x0000); -} - -static void r8152_eee_en(struct r8152 *tp, bool enable) -{ - u16 config1, config2, config3; - u32 ocp_data; - - ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EEE_CR); - config1 = ocp_reg_read(tp, OCP_EEE_CONFIG1) & ~sd_rise_time_mask; - config2 = ocp_reg_read(tp, OCP_EEE_CONFIG2); - config3 = ocp_reg_read(tp, OCP_EEE_CONFIG3) & ~fast_snr_mask; - - if (enable) { - ocp_data |= EEE_RX_EN | EEE_TX_EN; - config1 |= EEE_10_CAP | EEE_NWAY_EN | TX_QUIET_EN | RX_QUIET_EN; - config1 |= sd_rise_time(1); - config2 |= RG_DACQUIET_EN | RG_LDVQUIET_EN; - config3 |= fast_snr(42); - } else { - ocp_data &= ~(EEE_RX_EN | EEE_TX_EN); - config1 &= ~(EEE_10_CAP | EEE_NWAY_EN | TX_QUIET_EN | - RX_QUIET_EN); - config1 |= sd_rise_time(7); - config2 &= ~(RG_DACQUIET_EN | RG_LDVQUIET_EN); - config3 |= fast_snr(511); - } - - ocp_write_word(tp, MCU_TYPE_PLA, PLA_EEE_CR, ocp_data); - ocp_reg_write(tp, OCP_EEE_CONFIG1, config1); - ocp_reg_write(tp, OCP_EEE_CONFIG2, config2); - ocp_reg_write(tp, OCP_EEE_CONFIG3, config3); -} - -static void r8152b_enable_eee(struct r8152 *tp) -{ - r8152_eee_en(tp, true); - r8152_mmd_write(tp, MDIO_MMD_AN, MDIO_AN_EEE_ADV, MDIO_EEE_100TX); -} - -static void r8153_eee_en(struct r8152 *tp, bool enable) -{ - u32 ocp_data; - u16 config; - - ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EEE_CR); - config = ocp_reg_read(tp, OCP_EEE_CFG); - - if (enable) { - ocp_data |= EEE_RX_EN | EEE_TX_EN; - config |= EEE10_EN; - } else { - ocp_data &= ~(EEE_RX_EN | EEE_TX_EN); - config &= ~EEE10_EN; - } - - ocp_write_word(tp, MCU_TYPE_PLA, PLA_EEE_CR, ocp_data); - ocp_reg_write(tp, OCP_EEE_CFG, config); -} - -static void r8153_enable_eee(struct r8152 *tp) -{ - r8153_eee_en(tp, true); - ocp_reg_write(tp, OCP_EEE_ADV, MDIO_EEE_1000T | MDIO_EEE_100TX); -} - -static void r8152b_enable_fc(struct r8152 *tp) -{ - u16 anar; - - anar = r8152_mdio_read(tp, MII_ADVERTISE); - anar |= ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM; - r8152_mdio_write(tp, MII_ADVERTISE, anar); -} - static void rtl_tally_reset(struct r8152 *tp) { u32 ocp_data; From 2dd436daac7848dbf3fe799cf59c1408871a14e3 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Tue, 20 Sep 2016 16:22:06 +0800 Subject: [PATCH 461/538] r8152: move enabling PHY Move enabling PHY to init(), otherwise some other settings may fail. 
Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 43 ++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index ae7db460baa2..dbf11ba9d91c 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -2632,14 +2632,6 @@ static void rtl8152_disable(struct r8152 *tp) static void r8152b_hw_phy_cfg(struct r8152 *tp) { - u16 data; - - data = r8152_mdio_read(tp, MII_BMCR); - if (data & BMCR_PDOWN) { - data &= ~BMCR_PDOWN; - r8152_mdio_write(tp, MII_BMCR, data); - } - set_bit(PHY_RESET, &tp->flags); } @@ -2818,16 +2810,6 @@ static void r8153_hw_phy_cfg(struct r8152 *tp) u32 ocp_data; u16 data; - if (tp->version == RTL_VER_03 || tp->version == RTL_VER_04 || - tp->version == RTL_VER_05) - ocp_reg_write(tp, OCP_ADC_CFG, CKADSEL_L | ADC_EN | EN_EMI_L); - - data = r8152_mdio_read(tp, MII_BMCR); - if (data & BMCR_PDOWN) { - data &= ~BMCR_PDOWN; - r8152_mdio_write(tp, MII_BMCR, data); - } - if (tp->version == RTL_VER_03) { data = ocp_reg_read(tp, OCP_EEE_CFG); data &= ~CTAP_SHORT_EN; @@ -3355,10 +3337,17 @@ static void rtl_tally_reset(struct r8152 *tp) static void r8152b_init(struct r8152 *tp) { u32 ocp_data; + u16 data; if (test_bit(RTL8152_UNPLUG, &tp->flags)) return; + data = r8152_mdio_read(tp, MII_BMCR); + if (data & BMCR_PDOWN) { + data &= ~BMCR_PDOWN; + r8152_mdio_write(tp, MII_BMCR, data); + } + r8152_aldps_en(tp, false); if (tp->version == RTL_VER_01) { @@ -3394,6 +3383,7 @@ static void r8152b_init(struct r8152 *tp) static void r8153_init(struct r8152 *tp) { u32 ocp_data; + u16 data; int i; if (test_bit(RTL8152_UNPLUG, &tp->flags)) @@ -3416,6 +3406,23 @@ static void r8153_init(struct r8152 *tp) msleep(20); } + if (tp->version == RTL_VER_03 || tp->version == RTL_VER_04 || + tp->version == RTL_VER_05) + ocp_reg_write(tp, OCP_ADC_CFG, CKADSEL_L | ADC_EN | EN_EMI_L); + + data = r8152_mdio_read(tp, MII_BMCR); + if (data & BMCR_PDOWN) { + data &= ~BMCR_PDOWN; + r8152_mdio_write(tp, MII_BMCR, data); + } + + for (i = 0; i < 500; i++) { + ocp_data = ocp_reg_read(tp, OCP_PHY_STATUS) & PHY_STAT_MASK; + if (ocp_data == PHY_STAT_LAN_ON) + break; + msleep(20); + } + usb_disable_lpm(tp->udev); r8153_u2p3en(tp, false); From ef39df8eaba48c0de779440f41a648b17a560953 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Tue, 20 Sep 2016 16:22:07 +0800 Subject: [PATCH 462/538] r8152: move PHY settings to hw_phy_cfg Move the PHY relative settings together to hw_phy_cfg(). Signed-off-by: Hayes Wang Signed-off-by: David S. 
Miller --- drivers/net/usb/r8152.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index dbf11ba9d91c..9ce5bd549482 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -2632,6 +2632,10 @@ static void rtl8152_disable(struct r8152 *tp) static void r8152b_hw_phy_cfg(struct r8152 *tp) { + r8152b_enable_eee(tp); + r8152_aldps_en(tp, true); + r8152b_enable_fc(tp); + set_bit(PHY_RESET, &tp->flags); } @@ -2839,6 +2843,10 @@ static void r8153_hw_phy_cfg(struct r8152 *tp) sram_write(tp, SRAM_10M_AMP1, 0x00af); sram_write(tp, SRAM_10M_AMP2, 0x0208); + r8153_enable_eee(tp); + r8153_aldps_en(tp, true); + r8152b_enable_fc(tp); + set_bit(PHY_RESET, &tp->flags); } @@ -3369,9 +3377,6 @@ static void r8152b_init(struct r8152 *tp) SPDWN_RXDV_MSK | SPDWN_LINKCHG_MSK; ocp_write_word(tp, MCU_TYPE_PLA, PLA_GPHY_INTR_IMR, ocp_data); - r8152b_enable_eee(tp); - r8152_aldps_en(tp, true); - r8152b_enable_fc(tp); rtl_tally_reset(tp); /* enable rx aggregation */ @@ -3490,9 +3495,6 @@ static void r8153_init(struct r8152 *tp) ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3, 0); ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL4, 0); - r8153_enable_eee(tp); - r8153_aldps_en(tp, true); - r8152b_enable_fc(tp); rtl_tally_reset(tp); r8153_u2p3en(tp, true); } From af0287ec10c62c84cc5cd1bad4fd37644a1ac41d Mon Sep 17 00:00:00 2001 From: hayeswang Date: Tue, 20 Sep 2016 16:22:08 +0800 Subject: [PATCH 463/538] r8152: remove r8153_enable_eee Remove r8153_enable_eee(). Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 9ce5bd549482..e7a05dd2be97 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -2803,12 +2803,6 @@ static void r8153_eee_en(struct r8152 *tp, bool enable) ocp_reg_write(tp, OCP_EEE_CFG, config); } -static void r8153_enable_eee(struct r8152 *tp) -{ - r8153_eee_en(tp, true); - ocp_reg_write(tp, OCP_EEE_ADV, MDIO_EEE_1000T | MDIO_EEE_100TX); -} - static void r8153_hw_phy_cfg(struct r8152 *tp) { u32 ocp_data; @@ -2843,7 +2837,9 @@ static void r8153_hw_phy_cfg(struct r8152 *tp) sram_write(tp, SRAM_10M_AMP1, 0x00af); sram_write(tp, SRAM_10M_AMP2, 0x0208); - r8153_enable_eee(tp); + r8153_eee_en(tp, true); + ocp_reg_write(tp, OCP_EEE_ADV, MDIO_EEE_1000T | MDIO_EEE_100TX); + r8153_aldps_en(tp, true); r8152b_enable_fc(tp); From d768c61bc353a0e0de3f839e1de99eee7d4eca10 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Tue, 20 Sep 2016 16:22:09 +0800 Subject: [PATCH 464/538] r8152: disable ALDPS and EEE before setting PHY Disable ALDPS and EEE to avoid the possible failure when setting the PHY. Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index e7a05dd2be97..c254248863d4 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -32,7 +32,7 @@ #define NETNEXT_VERSION "08" /* Information for net */ -#define NET_VERSION "5" +#define NET_VERSION "6" #define DRIVER_VERSION "v1." NETNEXT_VERSION "." 
NET_VERSION #define DRIVER_AUTHOR "Realtek linux nic maintainers " @@ -2808,6 +2808,13 @@ static void r8153_hw_phy_cfg(struct r8152 *tp) u32 ocp_data; u16 data; + /* disable ALDPS before updating the PHY parameters */ + r8153_aldps_en(tp, false); + + /* disable EEE before updating the PHY parameters */ + r8153_eee_en(tp, false); + ocp_reg_write(tp, OCP_EEE_ADV, 0); + if (tp->version == RTL_VER_03) { data = ocp_reg_read(tp, OCP_EEE_CFG); data &= ~CTAP_SHORT_EN; @@ -3390,7 +3397,6 @@ static void r8153_init(struct r8152 *tp) if (test_bit(RTL8152_UNPLUG, &tp->flags)) return; - r8153_aldps_en(tp, false); r8153_u1u2en(tp, false); for (i = 0; i < 500; i++) { From b5036cd4ed3173ab8cdbc85e2ba74acf46bafb51 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 20 Sep 2016 16:17:22 +0200 Subject: [PATCH 465/538] ipmr, ip6mr: return lastuse relative to now When I introduced the lastuse member I made a subtle error because it was returned as an absolute value but that is meaningless to user-space as it doesn't allow to see how old exactly an entry is. Let's make it similar to how the bridge returns such values and make it relative to "now" (jiffies). This allows us to show the actual age of the entries and is much more useful (e.g. user-space daemons can age out entries, iproute2 can display the lastuse properly). Fixes: 43b9e1274060 ("net: ipmr/ip6mr: add support for keeping an entry age") Reported-by: Satish Ashok Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 7 +++++-- net/ipv6/ip6mr.c | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 26253328d227..a87bcd2d4a94 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2076,6 +2076,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; + unsigned long lastuse; int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ @@ -2105,12 +2106,14 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_nest_end(skb, mp_attr); + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, - jiffies_to_clock_t(c->mfc_un.res.lastuse), + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) return -EMSGSIZE; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 6122f9c5cc49..fccb5dd91902 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2239,6 +2239,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; + unsigned long lastuse; int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ @@ -2269,12 +2270,14 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, nla_nest_end(skb, mp_attr); + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? 
jiffies - lastuse : 0; + mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, - jiffies_to_clock_t(c->mfc_un.res.lastuse), + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) return -EMSGSIZE; From 63c43787d35e45562a6b5927e2edc8f4783d95b8 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 19 Sep 2016 16:17:57 +0200 Subject: [PATCH 466/538] vti6: fix input path Since commit 1625f4529957, vti6 is broken, all input packets are dropped (LINUX_MIB_XFRMINNOSTATES is incremented). XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 is set by vti6_rcv() before calling xfrm6_rcv()/xfrm6_rcv_spi(), thus we cannot set to NULL that value in xfrm6_rcv_spi(). A new function xfrm6_rcv_tnl() that enables to pass a value to xfrm6_rcv_spi() is added, so that xfrm6_rcv() is not touched (this function is used in several handlers). CC: Alexey Kodanev Fixes: 1625f4529957 ("net/xfrm_input: fix possible NULL deref of tunnel.ip6->parms.i_key") Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 4 +++- net/ipv6/ip6_vti.c | 4 +--- net/ipv6/xfrm6_input.c | 16 +++++++++++----- net/ipv6/xfrm6_tunnel.c | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index adfebd6f243c..17934312eecb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1540,8 +1540,10 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); int xfrm6_extract_header(struct sk_buff *skb); int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi); +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t); int xfrm6_transport_finish(struct sk_buff *skb, int async); +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t); int xfrm6_rcv(struct sk_buff *skb); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 52a2f735881f..5bd3afdcc771 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -321,11 +321,9 @@ static int vti6_rcv(struct sk_buff *skb) goto discard; } - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - rcu_read_unlock(); - return xfrm6_rcv(skb); + return xfrm6_rcv_tnl(skb, t); } rcu_read_unlock(); return -EINVAL; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 00a2d40677d6..b5789562aded 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -21,9 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm6_extract_header(skb); } -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t) { - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); @@ -49,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) return -1; } -int xfrm6_rcv(struct sk_buff *skb) +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], - 0); + 0, t); } -EXPORT_SYMBOL(xfrm6_rcv); 
+EXPORT_SYMBOL(xfrm6_rcv_tnl); +int xfrm6_rcv(struct sk_buff *skb) +{ + return xfrm6_rcv_tnl(skb, NULL); +} +EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 5743044cd660..e1c0bbe7996c 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -236,7 +236,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb) __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, From 8d58790b832e13d6006d842037732304af357c3c Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Mon, 19 Sep 2016 21:34:01 +0200 Subject: [PATCH 467/538] net: can: ifi: Configure transmitter delay Configure the transmitter delay register at +0x1c to correctly handle the CAN FD bitrate switch (BRS). This moves the SSP (secondary sample point) to a proper offset, so that the TDC mechanism works and won't generate error frames on the CAN link. Signed-off-by: Marek Vasut Cc: Marc Kleine-Budde Cc: Mark Rutland Cc: Oliver Hartkopp Cc: Wolfgang Grandegger Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- drivers/net/can/ifi_canfd/ifi_canfd.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c index 2d1d22eec750..368bb0710d8f 100644 --- a/drivers/net/can/ifi_canfd/ifi_canfd.c +++ b/drivers/net/can/ifi_canfd/ifi_canfd.c @@ -81,6 +81,10 @@ #define IFI_CANFD_TIME_SET_TIMEA_4_12_6_6 BIT(15) #define IFI_CANFD_TDELAY 0x1c +#define IFI_CANFD_TDELAY_DEFAULT 0xb +#define IFI_CANFD_TDELAY_MASK 0x3fff +#define IFI_CANFD_TDELAY_ABS BIT(14) +#define IFI_CANFD_TDELAY_EN BIT(15) #define IFI_CANFD_ERROR 0x20 #define IFI_CANFD_ERROR_TX_OFFSET 0 @@ -641,7 +645,7 @@ static void ifi_canfd_set_bittiming(struct net_device *ndev) struct ifi_canfd_priv *priv = netdev_priv(ndev); const struct can_bittiming *bt = &priv->can.bittiming; const struct can_bittiming *dbt = &priv->can.data_bittiming; - u16 brp, sjw, tseg1, tseg2; + u16 brp, sjw, tseg1, tseg2, tdc; /* Configure bit timing */ brp = bt->brp - 2; @@ -664,6 +668,11 @@ static void ifi_canfd_set_bittiming(struct net_device *ndev) (brp << IFI_CANFD_TIME_PRESCALE_OFF) | (sjw << IFI_CANFD_TIME_SJW_OFF_7_9_8_8), priv->base + IFI_CANFD_FTIME); + + /* Configure transmitter delay */ + tdc = (dbt->brp * (dbt->phase_seg1 + 1)) & IFI_CANFD_TDELAY_MASK; + writel(IFI_CANFD_TDELAY_EN | IFI_CANFD_TDELAY_ABS | tdc, + priv->base + IFI_CANFD_TDELAY); } static void ifi_canfd_set_filter(struct net_device *ndev, const u32 id, From 3027f78bb7243bef28c103507fc857e1471d769d Mon Sep 17 00:00:00 2001 From: Alexandre TORGUE Date: Tue, 20 Sep 2016 18:00:56 +0200 Subject: [PATCH 468/538] Documentation/dt-bindings: Document STM32 EXTI controller bindings Originally-from: Maxime Coquelin Signed-off-by: Alexandre TORGUE Cc: Mark Rutland Cc: devicetree@vger.kernel.org Cc: Daniel Thompson Cc: Jason Cooper Cc: arnd@arndb.de Cc: Marc Zyngier Cc: bruherrera@gmail.com Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Cc: Rob Herring Cc: lee.jones@linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1474387259-18926-2-git-send-email-alexandre.torgue@st.com Signed-off-by: Thomas Gleixner --- .../interrupt-controller/st,stm32-exti.txt | 20 
+++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt diff --git a/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt b/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt new file mode 100644 index 000000000000..6e7703d4ff5b --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt @@ -0,0 +1,20 @@ +STM32 External Interrupt Controller + +Required properties: + +- compatible: Should be "st,stm32-exti" +- reg: Specifies base physical address and size of the registers +- interrupt-controller: Indentifies the node as an interrupt controller +- #interrupt-cells: Specifies the number of cells to encode an interrupt + specifier, shall be 2 +- interrupts: interrupts references to primary interrupt controller + +Example: + +exti: interrupt-controller@40013c00 { + compatible = "st,stm32-exti"; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x40013C00 0x400>; + interrupts = <1>, <2>, <3>, <6>, <7>, <8>, <9>, <10>, <23>, <40>, <41>, <42>, <62>, <76>; +}; From e072041688ca73f125719815fa4b0fd23a45152c Mon Sep 17 00:00:00 2001 From: Alexandre TORGUE Date: Tue, 20 Sep 2016 18:00:57 +0200 Subject: [PATCH 469/538] drivers/irqchip: Add STM32 external interrupts support The STM32 external interrupt controller consists of edge detectors that generate interrupts requests or wake-up events. Each line can be independently configured as interrupt or wake-up source, and triggers either on rising, falling or both edges. Each line can also be masked independently. Originally-from: Maxime Coquelin Signed-off-by: Alexandre TORGUE Cc: Mark Rutland Cc: devicetree@vger.kernel.org Cc: Daniel Thompson Cc: Jason Cooper Cc: arnd@arndb.de Cc: Marc Zyngier Cc: bruherrera@gmail.com Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Cc: Rob Herring Cc: lee.jones@linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1474387259-18926-3-git-send-email-alexandre.torgue@st.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/Kconfig | 4 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-stm32-exti.c | 201 +++++++++++++++++++++++++++++++ 3 files changed, 206 insertions(+) create mode 100644 drivers/irqchip/irq-stm32-exti.c diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 9aeea1d8a579..329c941e46b5 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -265,3 +265,7 @@ config EZNPS_GIC select IRQ_DOMAIN help Support the EZchip NPS400 global interrupt controller + +config STM32_EXTI + bool + select IRQ_DOMAIN diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 4c203b6b8163..96383b22cffe 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -71,3 +71,4 @@ obj-$(CONFIG_MVEBU_ODMI) += irq-mvebu-odmi.o obj-$(CONFIG_LS_SCFG_MSI) += irq-ls-scfg-msi.o obj-$(CONFIG_EZNPS_GIC) += irq-eznps.o obj-$(CONFIG_ARCH_ASPEED) += irq-aspeed-vic.o +obj-$(CONFIG_STM32_EXTI) += irq-stm32-exti.o diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c new file mode 100644 index 000000000000..491568c95aa5 --- /dev/null +++ b/drivers/irqchip/irq-stm32-exti.c @@ -0,0 +1,201 @@ +/* + * Copyright (C) Maxime Coquelin 2015 + * Author: Maxime Coquelin + * License terms: GNU General Public License (GPL), version 2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define EXTI_IMR 0x0 +#define EXTI_EMR 0x4 +#define 
EXTI_RTSR 0x8 +#define EXTI_FTSR 0xc +#define EXTI_SWIER 0x10 +#define EXTI_PR 0x14 + +static void stm32_irq_handler(struct irq_desc *desc) +{ + struct irq_domain *domain = irq_desc_get_handler_data(desc); + struct irq_chip_generic *gc = domain->gc->gc[0]; + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned long pending; + int n; + + chained_irq_enter(chip, desc); + + while ((pending = irq_reg_readl(gc, EXTI_PR))) { + for_each_set_bit(n, &pending, BITS_PER_LONG) { + generic_handle_irq(irq_find_mapping(domain, n)); + irq_reg_writel(gc, BIT(n), EXTI_PR); + } + } + + chained_irq_exit(chip, desc); +} + +static int stm32_irq_set_type(struct irq_data *data, unsigned int type) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); + int pin = data->hwirq; + u32 rtsr, ftsr; + + irq_gc_lock(gc); + + rtsr = irq_reg_readl(gc, EXTI_RTSR); + ftsr = irq_reg_readl(gc, EXTI_FTSR); + + switch (type) { + case IRQ_TYPE_EDGE_RISING: + rtsr |= BIT(pin); + ftsr &= ~BIT(pin); + break; + case IRQ_TYPE_EDGE_FALLING: + rtsr &= ~BIT(pin); + ftsr |= BIT(pin); + break; + case IRQ_TYPE_EDGE_BOTH: + rtsr |= BIT(pin); + ftsr |= BIT(pin); + break; + default: + irq_gc_unlock(gc); + return -EINVAL; + } + + irq_reg_writel(gc, rtsr, EXTI_RTSR); + irq_reg_writel(gc, ftsr, EXTI_FTSR); + + irq_gc_unlock(gc); + + return 0; +} + +static int stm32_irq_set_wake(struct irq_data *data, unsigned int on) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); + int pin = data->hwirq; + u32 emr; + + irq_gc_lock(gc); + + emr = irq_reg_readl(gc, EXTI_EMR); + if (on) + emr |= BIT(pin); + else + emr &= ~BIT(pin); + irq_reg_writel(gc, emr, EXTI_EMR); + + irq_gc_unlock(gc); + + return 0; +} + +static int stm32_exti_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *data) +{ + struct irq_chip_generic *gc = d->gc->gc[0]; + struct irq_fwspec *fwspec = data; + irq_hw_number_t hwirq; + + hwirq = fwspec->param[0]; + + irq_map_generic_chip(d, virq, hwirq); + irq_domain_set_info(d, virq, hwirq, &gc->chip_types->chip, gc, + handle_simple_irq, NULL, NULL); + + return 0; +} + +static void stm32_exti_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *data = irq_domain_get_irq_data(d, virq); + + irq_domain_reset_irq_data(data); +} + +struct irq_domain_ops irq_exti_domain_ops = { + .map = irq_map_generic_chip, + .xlate = irq_domain_xlate_onetwocell, + .alloc = stm32_exti_alloc, + .free = stm32_exti_free, +}; + +static int __init stm32_exti_init(struct device_node *node, + struct device_node *parent) +{ + unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; + int nr_irqs, nr_exti, ret, i; + struct irq_chip_generic *gc; + struct irq_domain *domain; + void *base; + + base = of_iomap(node, 0); + if (!base) { + pr_err("%s: Unable to map registers\n", node->full_name); + return -ENOMEM; + } + + /* Determine number of irqs supported */ + writel_relaxed(~0UL, base + EXTI_RTSR); + nr_exti = fls(readl_relaxed(base + EXTI_RTSR)); + writel_relaxed(0, base + EXTI_RTSR); + + pr_info("%s: %d External IRQs detected\n", node->full_name, nr_exti); + + domain = irq_domain_add_linear(node, nr_exti, + &irq_exti_domain_ops, NULL); + if (!domain) { + pr_err("%s: Could not register interrupt domain.\n", + node->name); + ret = -ENOMEM; + goto out_unmap; + } + + ret = irq_alloc_domain_generic_chips(domain, nr_exti, 1, "exti", + handle_edge_irq, clr, 0, 0); + if (ret) { + pr_err("%s: Could not allocate generic interrupt chip.\n", + node->full_name); + goto 
out_free_domain; + } + + gc = domain->gc->gc[0]; + gc->reg_base = base; + gc->chip_types->type = IRQ_TYPE_EDGE_BOTH; + gc->chip_types->chip.name = gc->chip_types[0].chip.name; + gc->chip_types->chip.irq_ack = irq_gc_ack_set_bit; + gc->chip_types->chip.irq_mask = irq_gc_mask_clr_bit; + gc->chip_types->chip.irq_unmask = irq_gc_mask_set_bit; + gc->chip_types->chip.irq_set_type = stm32_irq_set_type; + gc->chip_types->chip.irq_set_wake = stm32_irq_set_wake; + gc->chip_types->regs.ack = EXTI_PR; + gc->chip_types->regs.mask = EXTI_IMR; + gc->chip_types->handler = handle_edge_irq; + + nr_irqs = of_irq_count(node); + for (i = 0; i < nr_irqs; i++) { + unsigned int irq = irq_of_parse_and_map(node, i); + + irq_set_handler_data(irq, domain); + irq_set_chained_handler(irq, stm32_irq_handler); + } + + return 0; + +out_free_domain: + irq_domain_remove(domain); +out_unmap: + iounmap(base); + return ret; +} + +IRQCHIP_DECLARE(stm32_exti, "st,stm32-exti", stm32_exti_init); From 47f91519546ce39cceee2c51b0f5045eadc688a9 Mon Sep 17 00:00:00 2001 From: Alexandre TORGUE Date: Tue, 20 Sep 2016 18:00:58 +0200 Subject: [PATCH 470/538] ARM/STM32: Select external interrupts controller Originally-from: Maxime Coquelin Signed-off-by: Alexandre TORGUE Cc: Mark Rutland Cc: devicetree@vger.kernel.org Cc: Daniel Thompson Cc: Jason Cooper Cc: arnd@arndb.de Cc: Marc Zyngier Cc: bruherrera@gmail.com Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Cc: Rob Herring Cc: lee.jones@linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1474387259-18926-4-git-send-email-alexandre.torgue@st.com Signed-off-by: Thomas Gleixner --- arch/arm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a9c4e48bb7ec..bc9d6df8bab4 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -878,6 +878,7 @@ config ARCH_STM32 select CLKSRC_STM32 select PINCTRL select RESET_CONTROLLER + select STM32_EXTI help Support for STMicroelectronics STM32 processors. 
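The three STM32 EXTI patches above add the devicetree binding, the irqchip driver and the Kconfig select, and the following patch wires the controller node into the stm32f429 device tree. As a rough, hypothetical illustration of how that domain is consumed (not part of this series; the demo_* names and compatible string are invented), a client node would reference the controller with the documented two-cell specifier, for example interrupt-parent = <&exti> plus interrupts = <line trigger>, and its driver then requests the mapped IRQ in the usual way; the trigger encoded in the second cell is applied through stm32_irq_set_type():

    #include <linux/interrupt.h>
    #include <linux/module.h>
    #include <linux/platform_device.h>

    static irqreturn_t demo_exti_handler(int irq, void *dev_id)
    {
            /* handle_edge_irq() has already acked the pending bit in EXTI_PR */
            return IRQ_HANDLED;
    }

    static int demo_probe(struct platform_device *pdev)
    {
            int irq = platform_get_irq(pdev, 0); /* mapped through the exti domain */

            if (irq < 0)
                    return irq;

            /* Flags 0: the trigger type comes from the second interrupts cell */
            return devm_request_irq(&pdev->dev, irq, demo_exti_handler, 0,
                                    "demo-exti", pdev);
    }

    static const struct of_device_id demo_of_match[] = {
            { .compatible = "demo,exti-consumer" },
            { }
    };

    static struct platform_driver demo_driver = {
            .probe  = demo_probe,
            .driver = {
                    .name           = "demo-exti-consumer",
                    .of_match_table = demo_of_match,
            },
    };
    module_platform_driver(demo_driver);
    MODULE_LICENSE("GPL");
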
From 5a79d596378b65e773d93d00edcb57a33f87ea94 Mon Sep 17 00:00:00 2001 From: Alexandre TORGUE Date: Tue, 20 Sep 2016 18:00:59 +0200 Subject: [PATCH 471/538] ARM/dts: Add EXTI controller node to stm32f429 Originally-from: Maxime Coquelin Signed-off-by: Alexandre TORGUE Cc: Mark Rutland Cc: devicetree@vger.kernel.org Cc: Daniel Thompson Cc: Jason Cooper Cc: arnd@arndb.de Cc: Marc Zyngier Cc: bruherrera@gmail.com Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Cc: Rob Herring Cc: lee.jones@linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1474387259-18926-5-git-send-email-alexandre.torgue@st.com Signed-off-by: Thomas Gleixner --- arch/arm/boot/dts/stm32f429.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm/boot/dts/stm32f429.dtsi b/arch/arm/boot/dts/stm32f429.dtsi index 35df462559ca..1a189d44ad38 100644 --- a/arch/arm/boot/dts/stm32f429.dtsi +++ b/arch/arm/boot/dts/stm32f429.dtsi @@ -176,6 +176,14 @@ reg = <0x40013800 0x400>; }; + exti: interrupt-controller@40013c00 { + compatible = "st,stm32-exti"; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x40013C00 0x400>; + interrupts = <1>, <2>, <3>, <6>, <7>, <8>, <9>, <10>, <23>, <40>, <41>, <42>, <62>, <76>; + }; + pin-controller { #address-cells = <1>; #size-cells = <1>; From 8c2103f224216a45c1a4d7aebbc13f3e007cde34 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 17 Sep 2016 23:39:25 +0200 Subject: [PATCH 472/538] x86/e820: Mark some static functions __init They are all called only from other __init functions in e820.c Signed-off-by: Denys Vlasenko Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Yinghai Lu Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160917213927.1787-1-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 871f1863457d..4d3dd9a713c7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -802,7 +802,7 @@ unsigned long __init e820_end_of_low_ram_pfn(void) return e820_end_pfn(1UL << (32-PAGE_SHIFT)); } -static void early_panic(char *msg) +static void __init early_panic(char *msg) { early_printk(msg); panic(msg); @@ -912,7 +912,7 @@ void __init finish_e820_parsing(void) } } -static const char *e820_type_to_string(int e820_type) +static const char *__init e820_type_to_string(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -926,7 +926,7 @@ static const char *e820_type_to_string(int e820_type) } } -static unsigned long e820_type_to_iomem_type(int e820_type) +static unsigned long __init e820_type_to_iomem_type(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -942,7 +942,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type) } } -static unsigned long e820_type_to_iores_desc(int e820_type) +static unsigned long __init e820_type_to_iores_desc(int e820_type) { switch (e820_type) { case E820_ACPI: @@ -961,7 +961,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type) } } -static bool do_mark_busy(u32 type, struct resource *res) +static bool __init do_mark_busy(u32 type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ if (res->start < (1ULL<<20)) @@ -1027,7 +1027,7 @@ void __init e820_reserve_resources(void) } /* How much should we pad RAM ending depending on where 
it is? */ -static unsigned long ram_alignment(resource_size_t pos) +static unsigned long __init ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; From 475339684ef19e46f4702e2d185a869a5c454688 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 17 Sep 2016 23:39:26 +0200 Subject: [PATCH 473/538] x86/e820: Prepare e280 code for switch to dynamic storage This patch turns e820 and e820_saved into pointers to e820 tables, of the same size as before. Signed-off-by: Denys Vlasenko Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Yinghai Lu Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160917213927.1787-2-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/e820.h | 6 +- arch/x86/kernel/e820.c | 125 ++++++++++++++++++------------ arch/x86/kernel/early-quirks.c | 2 +- arch/x86/kernel/kexec-bzimage64.c | 4 +- arch/x86/kernel/resource.c | 4 +- arch/x86/kernel/setup.c | 8 +- arch/x86/kernel/tboot.c | 8 +- arch/x86/mm/init.c | 2 + arch/x86/platform/efi/efi.c | 2 +- arch/x86/xen/setup.c | 2 +- 10 files changed, 98 insertions(+), 65 deletions(-) diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 3ab0537872fb..476b574de99e 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -10,8 +10,8 @@ #include #ifndef __ASSEMBLY__ /* see comment in arch/x86/kernel/e820.c */ -extern struct e820map e820; -extern struct e820map e820_saved; +extern struct e820map *e820; +extern struct e820map *e820_saved; extern unsigned long pci_mem_start; extern int e820_any_mapped(u64 start, u64 end, unsigned type); @@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); +extern void e820_reallocate_tables(void); + /* * Returns true iff the specified range [s,e) is completely contained inside * the ISA region. diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4d3dd9a713c7..585000c98d3e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,8 +40,10 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. 
*/ -struct e820map e820; -struct e820map e820_saved; +static struct e820map initial_e820; +static struct e820map initial_e820_saved; +struct e820map *e820 = &initial_e820; +struct e820map *e820_saved = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -58,8 +60,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -81,8 +83,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -128,7 +130,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(e820, start, size, type); } static void __init e820_print_type(u32 type) @@ -164,12 +166,12 @@ void __init e820_print_map(char *who) { int i; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820.map[i].addr, + (unsigned long long) e820->map[i].addr, (unsigned long long) - (e820.map[i].addr + e820.map[i].size - 1)); - e820_print_type(e820.map[i].type); + (e820->map[i].addr + e820->map[i].size - 1)); + e820_print_type(e820->map[i].type); printk(KERN_CONT "\n"); } } @@ -493,13 +495,13 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return __e820_update_range(&e820, start, size, old_type, new_type); + return __e820_update_range(e820, start, size, old_type, new_type); } static u64 __init e820_update_range_saved(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return __e820_update_range(&e820_saved, start, size, old_type, + return __e820_update_range(e820_saved, start, size, old_type, new_type); } @@ -521,8 +523,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, e820_print_type(old_type); printk(KERN_CONT "\n"); - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; u64 final_start, final_end; u64 ei_end; @@ -566,15 +568,15 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, void __init update_e820(void) { - if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map)) + if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map)) return; printk(KERN_INFO "e820: modified physical RAM map:\n"); e820_print_map("modified"); } static void __init update_e820_saved(void) { - sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), - &e820_saved.nr_map); + sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map), + &e820_saved->nr_map); } #define MAX_GAP_END 0x100000000ull /* @@ -584,14 +586,14 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, unsigned long start_addr, unsigned long long end_addr) { unsigned long long last; - int i = e820.nr_map; + int i = e820->nr_map; int found = 0; last = (end_addr && end_addr < MAX_GAP_END) ? 
end_addr : MAX_GAP_END; while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; + unsigned long long start = e820->map[i].addr; + unsigned long long end = start + e820->map[i].size; if (end < start_addr) continue; @@ -649,6 +651,33 @@ __init void e820_setup_gap(void) gapstart, gapstart + gapsize - 1); } +/* + * Called late during init, in free_initmem(). + * + * Initial e820 and e820_saved are largish __initdata arrays. + * Copy them to (usually much smaller) dynamically allocated area. + * This is done after all tweaks we ever do to them: + * all functions which modify them are __init functions, + * they won't exist after this point. + */ +__init void e820_reallocate_tables(void) +{ + struct e820map *n; + int size; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820, size); + e820 = n; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820_saved, size); + e820_saved = n; +} + /** * Because of the size limitation of struct boot_params, only first * 128 E820 memory entries are passed to kernel via @@ -665,7 +694,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len) entries = sdata->len / sizeof(struct e820entry); extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); early_memunmap(sdata, data_len); printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); @@ -686,8 +715,8 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) int i; unsigned long pfn = 0; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (pfn < PFN_UP(ei->addr)) register_nosave_region(pfn, PFN_UP(ei->addr)); @@ -712,8 +741,8 @@ static int __init e820_mark_nvs_memory(void) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (ei->type == E820_NVS) acpi_nvs_register(ei->addr, ei->size); @@ -760,8 +789,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn) unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; unsigned long start_pfn; unsigned long end_pfn; @@ -856,7 +885,7 @@ static int __init parse_memmap_one(char *p) */ saved_max_pfn = e820_end_of_ram_pfn(); #endif - e820.nr_map = 0; + e820->nr_map = 0; userdef = 1; return 0; } @@ -903,8 +932,8 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { - if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), - &e820.nr_map) < 0) + if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), + &e820->nr_map) < 0) early_panic("Invalid user supplied memory map"); printk(KERN_INFO "e820: user-defined physical RAM map:\n"); @@ -991,35 +1020,35 @@ void __init e820_reserve_resources(void) struct resource *res; u64 end; - res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); + res = alloc_bootmem(sizeof(struct resource) * e820->nr_map); e820_res = res; - for (i = 0; i < e820.nr_map; 
i++) { - end = e820.map[i].addr + e820.map[i].size - 1; + for (i = 0; i < e820->nr_map; i++) { + end = e820->map[i].addr + e820->map[i].size - 1; if (end != (resource_size_t)end) { res++; continue; } - res->name = e820_type_to_string(e820.map[i].type); - res->start = e820.map[i].addr; + res->name = e820_type_to_string(e820->map[i].type); + res->start = e820->map[i].addr; res->end = end; - res->flags = e820_type_to_iomem_type(e820.map[i].type); - res->desc = e820_type_to_iores_desc(e820.map[i].type); + res->flags = e820_type_to_iomem_type(e820->map[i].type); + res->desc = e820_type_to_iores_desc(e820->map[i].type); /* * don't register the region that could be conflicted with * pci device BAR resource and insert them later in * pcibios_resource_survey() */ - if (do_mark_busy(e820.map[i].type, res)) { + if (do_mark_busy(e820->map[i].type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } res++; } - for (i = 0; i < e820_saved.nr_map; i++) { - struct e820entry *entry = &e820_saved.map[i]; + for (i = 0; i < e820_saved->nr_map; i++) { + struct e820entry *entry = &e820_saved->map[i]; firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry->type)); @@ -1051,7 +1080,7 @@ void __init e820_reserve_resources_late(void) struct resource *res; res = e820_res; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { if (!res->parent && res->end) insert_resource_expand_to_fit(&iomem_resource, res); res++; @@ -1061,8 +1090,8 @@ void __init e820_reserve_resources_late(void) * Try to bump up RAM regions to reasonable boundaries to * avoid stolen RAM: */ - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *entry = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *entry = &e820->map[i]; u64 start, end; if (entry->type != E820_RAM) @@ -1110,7 +1139,7 @@ char *__init default_machine_specific_memory_setup(void) who = "BIOS-e801"; } - e820.nr_map = 0; + e820->nr_map = 0; e820_add_region(0, LOWMEMSIZE(), E820_RAM); e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); } @@ -1124,7 +1153,7 @@ void __init setup_memory_map(void) char *who; who = x86_init.resources.memory_setup(); - memcpy(&e820_saved, &e820, sizeof(struct e820map)); + memcpy(e820_saved, e820, sizeof(struct e820map)); printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); e820_print_map(who); } @@ -1141,8 +1170,8 @@ void __init memblock_x86_fill(void) */ memblock_allow_resize(); - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; end = ei->addr + ei->size; if (end != (resource_size_t)end) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index de7501edb21c..18bb3a639197 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -555,7 +555,7 @@ intel_graphics_stolen(int num, int slot, int func, /* Mark this space as reserved */ e820_add_region(base, size, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); } static void __init intel_graphics_quirks(int num, int slot, int func) diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index f2356bda2b05..3407b148c240 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -99,14 +99,14 @@ static int setup_e820_entries(struct boot_params *params) { unsigned int nr_e820_entries; - 
nr_e820_entries = e820_saved.nr_map; + nr_e820_entries = e820_saved->nr_map; /* TODO: Pass entries more than E820MAX in bootparams setup data */ if (nr_e820_entries > E820MAX) nr_e820_entries = E820MAX; params->e820_entries = nr_e820_entries; - memcpy(¶ms->e820_map, &e820_saved.map, + memcpy(¶ms->e820_map, &e820_saved->map, nr_e820_entries * sizeof(struct e820entry)); return 0; diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 80eab01c1a68..2408c1603438 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -27,8 +27,8 @@ static void remove_e820_regions(struct resource *avail) int i; struct e820entry *entry; - for (i = 0; i < e820.nr_map; i++) { - entry = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + entry = &e820->map[i]; resource_clip(avail, entry->addr, entry->addr + entry->size - 1); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa60f5f5a16..cc43d660c990 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -458,8 +458,8 @@ static void __init e820_reserve_setup_data(void) early_memunmap(data, sizeof(*data)); } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - memcpy(&e820_saved, &e820, sizeof(struct e820map)); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + memcpy(e820_saved, e820, sizeof(struct e820map)); printk(KERN_INFO "extended physical RAM map:\n"); e820_print_map("reserve setup_data"); } @@ -763,7 +763,7 @@ static void __init trim_bios_range(void) */ e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); } /* called before trim_bios_range() to spare extra sanitize */ @@ -1032,7 +1032,7 @@ void __init setup_arch(char **cmdline_p) if (ppro_with_ram_bug()) { e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); printk(KERN_INFO "fixed physical RAM map:\n"); e820_print_map("bad_ppro"); } diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 654f6c66fe45..8402907825b0 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -188,12 +188,12 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; - for (i = 0; i < e820.nr_map; i++) { - if ((e820.map[i].type != E820_RAM) - && (e820.map[i].type != E820_RESERVED_KERN)) + for (i = 0; i < e820->nr_map; i++) { + if ((e820->map[i].type != E820_RAM) + && (e820->map[i].type != E820_RESERVED_KERN)) continue; - add_mac_region(e820.map[i].addr, e820.map[i].size); + add_mac_region(e820->map[i].addr, e820->map[i].size); } tboot->acpi_sinfo.kernel_s3_resume_vector = diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d28a2d741f9e..167deae767cb 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -701,6 +701,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) void free_initmem(void) { + /* e820_reallocate_tables(); - disabled for now */ + free_init_pages("unused kernel", (unsigned long)(&__init_begin), (unsigned long)(&__init_end)); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1fbb408e2e72..2e3433444b65 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -166,7 +166,7 @@ static void __init do_add_efi_memmap(void) } e820_add_region(start, size, e820_type); } - 
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); } int __init efi_memblock_x86_reserve_range(void) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 176425233e4d..f8960fca0827 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -861,7 +861,7 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); /* * Check whether the kernel itself conflicts with the target E820 map. From 1827822902cf659d60d3413fd42c7e6cbd18df4d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 18 Sep 2016 20:21:25 +0200 Subject: [PATCH 474/538] x86/e820: Use much less memory for e820/e820_saved, save up to 120k The maximum size of e820 map array for EFI systems is defined as E820_X_MAX (E820MAX + 3 * MAX_NUMNODES). In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved are 6404 bytes each. With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and e820_saved are 64004 bytes each. Most of this space is wasted. Typical machines have some 20-30 e820 areas at most. After previous patch, e820 and e820_saved are pointers to e280 maps. Change them to initially point to maps which are __initdata. At the very end of kernel init, just before __init[data] sections are freed in free_initmem(), allocate smaller blocks, copy maps there, and change pointers. The late switch makes sure that all functions which can be used to change e820 maps are no longer accessible (they are all __init functions). Run-tested. Signed-off-by: Denys Vlasenko Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Yinghai Lu Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160918182125.21000-1-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 8 ++++---- arch/x86/mm/init.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 585000c98d3e..bb8c69079c78 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,10 +40,10 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. 
*/ -static struct e820map initial_e820; -static struct e820map initial_e820_saved; -struct e820map *e820 = &initial_e820; -struct e820map *e820_saved = &initial_e820_saved; +static struct e820map initial_e820 __initdata; +static struct e820map initial_e820_saved __initdata; +struct e820map *e820 __refdata = &initial_e820; +struct e820map *e820_saved __refdata = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 167deae767cb..22af912d66d2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -699,9 +699,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) } } -void free_initmem(void) +void __ref free_initmem(void) { - /* e820_reallocate_tables(); - disabled for now */ + e820_reallocate_tables(); free_init_pages("unused kernel", (unsigned long)(&__init_begin), From 371a015344b6e270e7e3632107d9554ec6d27a6b Mon Sep 17 00:00:00 2001 From: "Yadi.hu" Date: Sun, 18 Sep 2016 18:52:31 +0800 Subject: [PATCH 475/538] i2c-eg20t: fix race between i2c init and interrupt enable the eg20t driver call request_irq() function before the pch_base_address, base address of i2c controller's register, is assigned an effective value. there is one possible scenario that an interrupt which isn't inside eg20t arrives immediately after request_irq() is executed when i2c controller shares an interrupt number with others. since the interrupt handler pch_i2c_handler() has already active as shared action, it will be called and read its own register to determine if this interrupt is from itself. At that moment, since base address of i2c registers is not remapped in kernel space yet,so the INT handler will access an illegal address and then a error occurs. Signed-off-by: Yadi.hu Signed-off-by: Wolfram Sang Cc: stable@kernel.org --- drivers/i2c/busses/i2c-eg20t.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/i2c/busses/i2c-eg20t.c b/drivers/i2c/busses/i2c-eg20t.c index 137125b5eae7..5ce71ce7b6c4 100644 --- a/drivers/i2c/busses/i2c-eg20t.c +++ b/drivers/i2c/busses/i2c-eg20t.c @@ -773,13 +773,6 @@ static int pch_i2c_probe(struct pci_dev *pdev, /* Set the number of I2C channel instance */ adap_info->ch_num = id->driver_data; - ret = request_irq(pdev->irq, pch_i2c_handler, IRQF_SHARED, - KBUILD_MODNAME, adap_info); - if (ret) { - pch_pci_err(pdev, "request_irq FAILED\n"); - goto err_request_irq; - } - for (i = 0; i < adap_info->ch_num; i++) { pch_adap = &adap_info->pch_data[i].pch_adapter; adap_info->pch_i2c_suspended = false; @@ -797,6 +790,17 @@ static int pch_i2c_probe(struct pci_dev *pdev, pch_adap->dev.of_node = pdev->dev.of_node; pch_adap->dev.parent = &pdev->dev; + } + + ret = request_irq(pdev->irq, pch_i2c_handler, IRQF_SHARED, + KBUILD_MODNAME, adap_info); + if (ret) { + pch_pci_err(pdev, "request_irq FAILED\n"); + goto err_request_irq; + } + + for (i = 0; i < adap_info->ch_num; i++) { + pch_adap = &adap_info->pch_data[i].pch_adapter; pch_i2c_init(&adap_info->pch_data[i]); From 2532fc318db0e1fe68e01407ee27634c76916e44 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Thu, 25 Aug 2016 16:35:14 +0800 Subject: [PATCH 476/538] x86/numa: Online memory-less nodes at boot time For now, x86 does not support memory-less node. A node without memory will not be onlined, and the cpus on it will be mapped to the other online nodes with memory in init_cpu_to_node(). 
The reason of doing this is to ensure each cpu has mapped to a node with memory, so that it will be able to allocate local memory for that cpu. But we don't have to do it in this way. In this series of patches, we are going to construct cpu <-> node mapping for all possible cpus at boot time, which is a persistent mapping. It means that the cpu will be mapped to the node which it belongs to, and will never be changed. If a node has only cpus but no memory, the cpus on it will be mapped to a memory-less node. And the memory-less node should be onlined. Allocate pgdats for all memory-less nodes and online them at boot time. Then build zonelists for these nodes. As a result, when cpus on these memory-less nodes try to allocate memory from local node, it will automatically fall back to the proper zones in the zonelists. Signed-off-by: Zhu Guihua Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: Tang Chen Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-2-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/numa.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fb682108f4dc..3f35b48d1d9d 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -722,22 +722,19 @@ void __init x86_numa_init(void) numa_init(dummy_numa_init); } -static __init int find_near_online_node(int node) +static void __init init_memory_less_node(int nid) { - int n, val; - int min_val = INT_MAX; - int best_node = -1; + unsigned long zones_size[MAX_NR_ZONES] = {0}; + unsigned long zholes_size[MAX_NR_ZONES] = {0}; - for_each_online_node(n) { - val = node_distance(node, n); + /* Allocate and initialize node data. Memory-less node is now online.*/ + alloc_node_data(nid); + free_area_init_node(nid, zones_size, 0, zholes_size); - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - return best_node; + /* + * All zonelists will be built later in start_kernel() after per cpu + * areas are initialized. + */ } /* @@ -766,8 +763,10 @@ void __init init_cpu_to_node(void) if (node == NUMA_NO_NODE) continue; + if (!node_online(node)) - node = find_near_online_node(node); + init_memory_less_node(node); + numa_set_node(cpu, node); } } From f7c28833c252031bc68a29e26a18a661797cf3a3 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 25 Aug 2016 16:35:15 +0800 Subject: [PATCH 477/538] x86/acpi: Enable acpi to register all possible cpus at boot time cpuid <-> nodeid mapping is firstly established at boot time. And workqueue caches the mapping in wq_numa_possible_cpumask in wq_numa_init() at boot time. When doing node online/offline, cpuid <-> nodeid mapping is established/destroyed, which means, cpuid <-> nodeid mapping will change if node hotplug happens. But workqueue does not update wq_numa_possible_cpumask. 
So here is the problem: Assume we have the following cpuid <-> nodeid in the beginning: Node | CPU ------------------------ node 0 | 0-14, 60-74 node 1 | 15-29, 75-89 node 2 | 30-44, 90-104 node 3 | 45-59, 105-119 and we hot-remove node2 and node3, it becomes: Node | CPU ------------------------ node 0 | 0-14, 60-74 node 1 | 15-29, 75-89 and we hot-add node4 and node5, it becomes: Node | CPU ------------------------ node 0 | 0-14, 60-74 node 1 | 15-29, 75-89 node 4 | 30-59 node 5 | 90-119 But in wq_numa_possible_cpumask, cpu30 is still mapped to node2, and the like. When a pool workqueue is initialized, if its cpumask belongs to a node, its pool->node will be mapped to that node. And memory used by this workqueue will also be allocated on that node. static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs){ ... /* if cpumask is contained inside a NUMA node, we belong to that node */ if (wq_numa_enabled) { for_each_node(node) { if (cpumask_subset(pool->attrs->cpumask, wq_numa_possible_cpumask[node])) { pool->node = node; break; } } } Since wq_numa_possible_cpumask is not updated, it could be mapped to an offline node, which will lead to memory allocation failure: SLUB: Unable to allocate memory on node 2 (gfp=0x80d0) cache: kmalloc-192, object size: 192, buffer size: 192, default order: 1, min order: 0 node 0: slabs: 6172, objs: 259224, free: 245741 node 1: slabs: 3261, objs: 136962, free: 127656 It happens here: create_worker(struct worker_pool *pool) |--> worker = alloc_worker(pool->node); static struct worker *alloc_worker(int node) { struct worker *worker; worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); --> Here, useing the wrong node. ...... return worker; } [Solution] There are four mappings in the kernel: 1. nodeid (logical node id) <-> pxm 2. apicid (physical cpu id) <-> nodeid 3. cpuid (logical cpu id) <-> apicid 4. cpuid (logical cpu id) <-> nodeid 1. pxm (proximity domain) is provided by ACPI firmware in SRAT, and nodeid <-> pxm mapping is setup at boot time. This mapping is persistent, won't change. 2. apicid <-> nodeid mapping is setup using info in 1. The mapping is setup at boot time and CPU hotadd time, and cleared at CPU hotremove time. This mapping is also persistent. 3. cpuid <-> apicid mapping is setup at boot time and CPU hotadd time. cpuid is allocated, lower ids first, and released at CPU hotremove time, reused for other hotadded CPUs. So this mapping is not persistent. 4. cpuid <-> nodeid mapping is also setup at boot time and CPU hotadd time, and cleared at CPU hotremove time. As a result of 3, this mapping is not persistent. To fix this problem, we establish cpuid <-> nodeid mapping for all the possible cpus at boot time, and make it persistent. And according to init_cpu_to_node(), cpuid <-> nodeid mapping is based on apicid <-> nodeid mapping and cpuid <-> apicid mapping. So the key point is obtaining all cpus' apicid. apicid can be obtained by _MAT (Multiple APIC Table Entry) method or found in MADT (Multiple APIC Description Table). So we finish the job in the following steps: 1. Enable apic registeration flow to handle both enabled and disabled cpus. This is done by introducing an extra parameter to generic_processor_info to let the caller control if disabled cpus are ignored. 2. Introduce a new array storing all possible cpuid <-> apicid mapping. And also modify the way cpuid is calculated. Establish all possible cpuid <-> apicid mapping when registering local apic. Store the mapping in this array. 3. 
Enable _MAT and MADT relative apis to return non-present or disabled cpus' apicid. This is also done by introducing an extra parameter to these apis to let the caller control if disabled cpus are ignored. 4. Establish all possible cpuid <-> nodeid mapping. This is done via an additional acpi namespace walk for processors. This patch finished step 1. Signed-off-by: Gu Zheng Signed-off-by: Tang Chen Signed-off-by: Zhu Guihua Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-3-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 779dae5a852f..a8c94bb6b528 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2021,7 +2021,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -int generic_processor_info(int apicid, int version) +static int __generic_processor_info(int apicid, int version, bool enabled) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2087,7 +2087,6 @@ int generic_processor_info(int apicid, int version) return -EINVAL; } - num_processors++; if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed @@ -2110,6 +2109,7 @@ int generic_processor_info(int apicid, int version) pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", thiscpu, apicid); + disabled_cpus++; return -ENOSPC; } @@ -2128,7 +2128,6 @@ int generic_processor_info(int apicid, int version) boot_cpu_apic_version, cpu, version); } - physid_set(apicid, phys_cpu_present_map); if (apicid > max_physical_apicid) max_physical_apicid = apicid; @@ -2141,11 +2140,23 @@ int generic_processor_info(int apicid, int version) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - set_cpu_present(cpu, true); + + if (enabled) { + num_processors++; + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + } else { + disabled_cpus++; + } return cpu; } +int generic_processor_info(int apicid, int version) +{ + return __generic_processor_info(apicid, version, true); +} + int hard_smp_processor_id(void) { return read_apic_id(); From 8f54969dc8d6704632b42cbb5e47730cd75cc713 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 25 Aug 2016 16:35:16 +0800 Subject: [PATCH 478/538] x86/acpi: Introduce persistent storage for cpuid <-> apicid mapping The whole patch-set aims at making cpuid <-> nodeid mapping persistent. So that, when node online/offline happens, cache based on cpuid <-> nodeid mapping such as wq_numa_possible_cpumask will not cause any problem. It contains 4 steps: 1. Enable apic registeration flow to handle both enabled and disabled cpus. 2. Introduce a new array storing all possible cpuid <-> apicid mapping. 3. Enable _MAT and MADT relative apis to return non-present or disabled cpus' apicid. 4. Establish all possible cpuid <-> nodeid mapping. This patch finishes step 2. 
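Before going into details, a minimal sketch of the idea the rest of this message describes (the array, counter and helper names match the code added in the diff below; surrounding context is omitted):

	/* Simplified from allocate_logical_cpuid(): a hot-added CPU whose
	 * APIC ID was already seen at boot gets back the same logical cpuid
	 * it was assigned then, instead of whatever id happens to be free
	 * at hot-add time. */
	for (i = 0; i < nr_logical_cpuids; i++)
		if (cpuid_to_apicid[i] == apicid)
			return i;
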
In this patch, we introduce a new static array named cpuid_to_apicid[], which is large enough to store info for all possible cpus. And then, we modify the cpuid calculation. In generic_processor_info(), it simply finds the next unused cpuid. And it is also why the cpuid <-> nodeid mapping changes with node hotplug. After this patch, we find the next unused cpuid, map it to an apicid, and store the mapping in cpuid_to_apicid[], so that cpuid <-> apicid mapping will be persistent. And finally we will use this array to make cpuid <-> nodeid persistent. cpuid <-> apicid mapping is established at local apic registeration time. But non-present or disabled cpus are ignored. In this patch, we establish all possible cpuid <-> apicid mapping when registering local apic. Signed-off-by: Gu Zheng Signed-off-by: Tang Chen Signed-off-by: Zhu Guihua Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-4-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 1 + arch/x86/kernel/acpi/boot.c | 7 +--- arch/x86/kernel/apic/apic.c | 60 +++++++++++++++++++++++++++++++++-- 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index c2f94dcc92ce..32007041ef8c 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -86,6 +86,7 @@ static inline void early_reserve_e820_mpc_new(void) { } #endif int generic_processor_info(int apicid, int version); +int __generic_processor_info(int apicid, int version, bool enabled); #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0447e314e7f5..7d668d172fce 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -176,15 +176,10 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } - if (!enabled) { - ++disabled_cpus; - return -EINVAL; - } - if (boot_cpu_physical_apicid != -1U) ver = boot_cpu_apic_version; - cpu = generic_processor_info(id, ver); + cpu = __generic_processor_info(id, ver, enabled); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a8c94bb6b528..2dc01c38ad8e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2021,7 +2021,53 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -static int __generic_processor_info(int apicid, int version, bool enabled) +/* + * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated + * contiguously, it equals to current allocated max logical CPU ID plus 1. + * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of + * nr_logical_cpuids is nr_cpu_ids. + * + * NOTE: Reserve 0 for BSP. + */ +static int nr_logical_cpuids = 1; + +/* + * Used to store mapping between logical CPU IDs and APIC IDs. + */ +static int cpuid_to_apicid[] = { + [0 ... 
NR_CPUS - 1] = -1, +}; + +/* + * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids + * and cpuid_to_apicid[] synchronized. + */ +static int allocate_logical_cpuid(int apicid) +{ + int i; + + /* + * cpuid <-> apicid mapping is persistent, so when a cpu is up, + * check if the kernel has allocated a cpuid for it. + */ + for (i = 0; i < nr_logical_cpuids; i++) { + if (cpuid_to_apicid[i] == apicid) + return i; + } + + /* Allocate a new cpuid. */ + if (nr_logical_cpuids >= nr_cpu_ids) { + WARN_ONCE(1, "Only %d processors supported." + "Processor %d/0x%x and the rest are ignored.\n", + nr_cpu_ids - 1, nr_logical_cpuids, apicid); + return -1; + } + + cpuid_to_apicid[nr_logical_cpuids] = apicid; + return nr_logical_cpuids++; +} + +int __generic_processor_info(int apicid, int version, bool enabled) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2096,8 +2142,16 @@ static int __generic_processor_info(int apicid, int version, bool enabled) * for BSP. */ cpu = 0; - } else - cpu = cpumask_next_zero(-1, cpu_present_mask); + + /* Logical cpuid 0 is reserved for BSP. */ + cpuid_to_apicid[0] = apicid; + } else { + cpu = allocate_logical_cpuid(apicid); + if (cpu < 0) { + disabled_cpus++; + return -EINVAL; + } + } /* * This can happen on physical hotplug. The sanity check at boot time From 8ad893faf2eaedb710a3073afbb5d569df2c3e41 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 25 Aug 2016 16:35:17 +0800 Subject: [PATCH 479/538] x86/acpi: Enable MADT APIs to return disabled apicids The whole patch-set aims at making cpuid <-> nodeid mapping persistent. So that, when node online/offline happens, cache based on cpuid <-> nodeid mapping such as wq_numa_possible_cpumask will not cause any problem. It contains 4 steps: 1. Enable apic registeration flow to handle both enabled and disabled cpus. 2. Introduce a new array storing all possible cpuid <-> apicid mapping. 3. Enable _MAT and MADT relative apis to return non-present or disabled cpus' apicid. 4. Establish all possible cpuid <-> nodeid mapping. This patch finishes step 3. There are four mappings in the kernel: 1. nodeid (logical node id) <-> pxm (persistent) 2. apicid (physical cpu id) <-> nodeid (persistent) 3. cpuid (logical cpu id) <-> apicid (not persistent, now persistent by step 2) 4. cpuid (logical cpu id) <-> nodeid (not persistent) So, in order to setup persistent cpuid <-> nodeid mapping for all possible CPUs, we should: 1. Setup cpuid <-> apicid mapping for all possible CPUs, which has been done in step 1, 2. 2. Setup cpuid <-> nodeid mapping for all possible CPUs. But before that, we should obtain all apicids from MADT. All processors' apicids can be obtained by _MAT method or from MADT in ACPI. The current code ignores disabled processors and returns -ENODEV. After this patch, a new parameter will be added to MADT APIs so that caller is able to control if disabled processors are ignored. 
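A minimal sketch of the resulting calling convention (both calls are taken from the callers changed below and in the next patch; surrounding context is omitted):

	/* Early boot path keeps the old behaviour: skip disabled entries. */
	phys_id = map_madt_entry(madt, 1, acpi_id, true);

	/* The namespace walk that builds the persistent mapping passes
	 * false, so disabled processors are reported as well. */
	phys_id = __acpi_get_phys_id(handle, type, acpi_id, false);
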
Signed-off-by: Gu Zheng Signed-off-by: Tang Chen Signed-off-by: Zhu Guihua Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-5-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- drivers/acpi/acpi_processor.c | 5 ++- drivers/acpi/processor_core.c | 60 ++++++++++++++++++++++------------- 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index c7ba948d253c..02b84aa69fa4 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -300,8 +300,11 @@ static int acpi_processor_get_info(struct acpi_device *device) * Extra Processor objects may be enumerated on MP systems with * less than the max # of CPUs. They should be ignored _iff * they are physically not present. + * + * NOTE: Even if the processor has a cpuid, it may not be present + * because cpuid <-> apicid mapping is persistent now. */ - if (invalid_logical_cpuid(pr->id)) { + if (invalid_logical_cpuid(pr->id) || !cpu_present(pr->id)) { int ret = acpi_processor_hotadd_init(pr); if (ret) return ret; diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 9125d7d96372..fd59ae871db3 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -32,12 +32,12 @@ static struct acpi_table_madt *get_madt_table(void) } static int map_lapic_id(struct acpi_subtable_header *entry, - u32 acpi_id, phys_cpuid_t *apic_id) + u32 acpi_id, phys_cpuid_t *apic_id, bool ignore_disabled) { struct acpi_madt_local_apic *lapic = container_of(entry, struct acpi_madt_local_apic, header); - if (!(lapic->lapic_flags & ACPI_MADT_ENABLED)) + if (ignore_disabled && !(lapic->lapic_flags & ACPI_MADT_ENABLED)) return -ENODEV; if (lapic->processor_id != acpi_id) @@ -48,12 +48,13 @@ static int map_lapic_id(struct acpi_subtable_header *entry, } static int map_x2apic_id(struct acpi_subtable_header *entry, - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id) + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id, + bool ignore_disabled) { struct acpi_madt_local_x2apic *apic = container_of(entry, struct acpi_madt_local_x2apic, header); - if (!(apic->lapic_flags & ACPI_MADT_ENABLED)) + if (ignore_disabled && !(apic->lapic_flags & ACPI_MADT_ENABLED)) return -ENODEV; if (device_declaration && (apic->uid == acpi_id)) { @@ -65,12 +66,13 @@ static int map_x2apic_id(struct acpi_subtable_header *entry, } static int map_lsapic_id(struct acpi_subtable_header *entry, - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id) + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id, + bool ignore_disabled) { struct acpi_madt_local_sapic *lsapic = container_of(entry, struct acpi_madt_local_sapic, header); - if (!(lsapic->lapic_flags & ACPI_MADT_ENABLED)) + if (ignore_disabled && !(lsapic->lapic_flags & ACPI_MADT_ENABLED)) return -ENODEV; if (device_declaration) { @@ -87,12 +89,13 @@ static int map_lsapic_id(struct acpi_subtable_header *entry, * Retrieve the ARM CPU physical identifier (MPIDR) */ static int map_gicc_mpidr(struct acpi_subtable_header 
*entry, - int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr) + int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr, + bool ignore_disabled) { struct acpi_madt_generic_interrupt *gicc = container_of(entry, struct acpi_madt_generic_interrupt, header); - if (!(gicc->flags & ACPI_MADT_ENABLED)) + if (ignore_disabled && !(gicc->flags & ACPI_MADT_ENABLED)) return -ENODEV; /* device_declaration means Device object in DSDT, in the @@ -109,7 +112,7 @@ static int map_gicc_mpidr(struct acpi_subtable_header *entry, } static phys_cpuid_t map_madt_entry(struct acpi_table_madt *madt, - int type, u32 acpi_id) + int type, u32 acpi_id, bool ignore_disabled) { unsigned long madt_end, entry; phys_cpuid_t phys_id = PHYS_CPUID_INVALID; /* CPU hardware ID */ @@ -127,16 +130,20 @@ static phys_cpuid_t map_madt_entry(struct acpi_table_madt *madt, struct acpi_subtable_header *header = (struct acpi_subtable_header *)entry; if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) { - if (!map_lapic_id(header, acpi_id, &phys_id)) + if (!map_lapic_id(header, acpi_id, &phys_id, + ignore_disabled)) break; } else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) { - if (!map_x2apic_id(header, type, acpi_id, &phys_id)) + if (!map_x2apic_id(header, type, acpi_id, &phys_id, + ignore_disabled)) break; } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) { - if (!map_lsapic_id(header, type, acpi_id, &phys_id)) + if (!map_lsapic_id(header, type, acpi_id, &phys_id, + ignore_disabled)) break; } else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) { - if (!map_gicc_mpidr(header, type, acpi_id, &phys_id)) + if (!map_gicc_mpidr(header, type, acpi_id, &phys_id, + ignore_disabled)) break; } entry += header->length; @@ -156,14 +163,15 @@ phys_cpuid_t __init acpi_map_madt_entry(u32 acpi_id) if (!madt) return PHYS_CPUID_INVALID; - rv = map_madt_entry(madt, 1, acpi_id); + rv = map_madt_entry(madt, 1, acpi_id, true); early_acpi_os_unmap_memory(madt, tbl_size); return rv; } -static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id) +static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id, + bool ignore_disabled) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; @@ -184,30 +192,38 @@ static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id) header = (struct acpi_subtable_header *)obj->buffer.pointer; if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) - map_lapic_id(header, acpi_id, &phys_id); + map_lapic_id(header, acpi_id, &phys_id, ignore_disabled); else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) - map_lsapic_id(header, type, acpi_id, &phys_id); + map_lsapic_id(header, type, acpi_id, &phys_id, ignore_disabled); else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) - map_x2apic_id(header, type, acpi_id, &phys_id); + map_x2apic_id(header, type, acpi_id, &phys_id, ignore_disabled); else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) - map_gicc_mpidr(header, type, acpi_id, &phys_id); + map_gicc_mpidr(header, type, acpi_id, &phys_id, + ignore_disabled); exit: kfree(buffer.pointer); return phys_id; } -phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id) +static phys_cpuid_t __acpi_get_phys_id(acpi_handle handle, int type, + u32 acpi_id, bool ignore_disabled) { phys_cpuid_t phys_id; - phys_id = map_mat_entry(handle, type, acpi_id); + phys_id = map_mat_entry(handle, type, acpi_id, ignore_disabled); if (invalid_phys_cpuid(phys_id)) - phys_id = map_madt_entry(get_madt_table(), type, acpi_id); + phys_id = 
map_madt_entry(get_madt_table(), type, acpi_id, + ignore_disabled); return phys_id; } +phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id) +{ + return __acpi_get_phys_id(handle, type, acpi_id, true); +} + int acpi_map_cpuid(phys_cpuid_t phys_id, u32 acpi_id) { #ifdef CONFIG_SMP From dc6db24d2476cd09c0ecf2b8d80313539f737a89 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 25 Aug 2016 16:35:18 +0800 Subject: [PATCH 480/538] x86/acpi: Set persistent cpuid <-> nodeid mapping when booting The whole patch-set aims at making cpuid <-> nodeid mapping persistent. So that, when node online/offline happens, cache based on cpuid <-> nodeid mapping such as wq_numa_possible_cpumask will not cause any problem. It contains 4 steps: 1. Enable apic registeration flow to handle both enabled and disabled cpus. 2. Introduce a new array storing all possible cpuid <-> apicid mapping. 3. Enable _MAT and MADT relative apis to return non-present or disabled cpus' apicid. 4. Establish all possible cpuid <-> nodeid mapping. This patch finishes step 4. This patch set the persistent cpuid <-> nodeid mapping for all enabled/disabled processors at boot time via an additional acpi namespace walk for processors. [ tglx: Remove the unneeded exports ] Signed-off-by: Gu Zheng Signed-off-by: Tang Chen Signed-off-by: Zhu Guihua Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-6-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- arch/ia64/kernel/acpi.c | 2 +- arch/x86/kernel/acpi/boot.c | 3 +- drivers/acpi/acpi_processor.c | 5 +++ drivers/acpi/bus.c | 1 + drivers/acpi/processor_core.c | 68 +++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 3 ++ 6 files changed, 80 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 92b7bc956795..9273e034b730 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -796,7 +796,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) * ACPI based hotplug CPU support */ #ifdef CONFIG_ACPI_HOTPLUG_CPU -static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA /* diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7d668d172fce..fc8841016116 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -702,7 +702,7 @@ static void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -713,6 +713,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) numa_set_node(cpu, nid); } #endif + return 0; } int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 02b84aa69fa4..f9f23fdd96a1 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -182,6 +182,11 @@ int 
__weak arch_register_cpu(int cpu) void __weak arch_unregister_cpu(int cpu) {} +int __weak acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ + return -ENODEV; +} + static int acpi_processor_hotadd_init(struct acpi_processor *pr) { unsigned long long sta; diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 85b7d07fe5c8..a760dac656ea 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -1193,6 +1193,7 @@ static int __init acpi_init(void) acpi_wakeup_device_init(); acpi_debugger_init(); acpi_setup_sb_notify_handler(); + acpi_set_processor_mapping(); return 0; } diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index fd59ae871db3..88019766a59a 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -280,6 +280,74 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id) } EXPORT_SYMBOL_GPL(acpi_get_cpuid); +#ifdef CONFIG_ACPI_HOTPLUG_CPU +static bool __init +map_processor(acpi_handle handle, phys_cpuid_t *phys_id, int *cpuid) +{ + int type; + u32 acpi_id; + acpi_status status; + acpi_object_type acpi_type; + unsigned long long tmp; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + + status = acpi_get_type(handle, &acpi_type); + if (ACPI_FAILURE(status)) + return false; + + switch (acpi_type) { + case ACPI_TYPE_PROCESSOR: + status = acpi_evaluate_object(handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) + return false; + acpi_id = object.processor.proc_id; + break; + case ACPI_TYPE_DEVICE: + status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp); + if (ACPI_FAILURE(status)) + return false; + acpi_id = tmp; + break; + default: + return false; + } + + type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0; + + *phys_id = __acpi_get_phys_id(handle, type, acpi_id, false); + *cpuid = acpi_map_cpuid(*phys_id, acpi_id); + if (*cpuid == -1) + return false; + + return true; +} + +static acpi_status __init +set_processor_node_mapping(acpi_handle handle, u32 lvl, void *context, + void **rv) +{ + phys_cpuid_t phys_id; + int cpu_id; + + if (!map_processor(handle, &phys_id, &cpu_id)) + return AE_ERROR; + + acpi_map_cpu2node(handle, cpu_id, phys_id); + return AE_OK; +} + +void __init acpi_set_processor_mapping(void) +{ + /* Set persistent cpu <-> node mapping for all processors. 
*/ + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, set_processor_node_mapping, + NULL, NULL, NULL); +} +#else +void __init acpi_set_processor_mapping(void) {} +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC static int get_ioapic_id(struct acpi_subtable_header *entry, u32 gsi_base, u64 *phys_addr, int *ioapic_id) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c9a596b9535c..5b4f9accf96b 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -271,8 +271,11 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) /* Arch dependent functions for cpu hotplug support */ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu); int acpi_unmap_cpu(int cpu); +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ +void acpi_set_processor_mapping(void); + #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr); #endif From 8e089eaa1999def4bb954caa91941f29b0672b6a Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 25 Aug 2016 16:35:19 +0800 Subject: [PATCH 481/538] acpi: Provide mechanism to validate processors in the ACPI tables [Problem] When we set cpuid <-> nodeid mapping to be persistent, it will use the DSDT As we know, the ACPI tables are just like user's input in that respect, and we don't crash if user's input is unreasonable. Such as, the mapping of the proc_id and pxm in some machine's ACPI table is like this: proc_id | pxm -------------------- 0 <-> 0 1 <-> 0 2 <-> 1 3 <-> 1 89 <-> 0 89 <-> 0 89 <-> 0 89 <-> 1 89 <-> 1 89 <-> 2 89 <-> 3 ..... We can't be sure which one is correct to the proc_id 89. We may map a wrong node to a cpu. When pages are allocated, this may cause a kernal panic. So, we should provide mechanisms to validate the ACPI tables, just like we do validation to check user's input in web project. The mechanism is that the processor objects which have the duplicate IDs are not valid. [Solution] We add a validation function, like this: foreach Processor in DSDT proc_id = get_ACPI_Processor_number(Processor) if (proc_id exists ) mark both of them as being unreasonable; The function will record the unique or duplicate processor IDs. The duplicate processor IDs such as 89 are regarded as the unreasonable IDs which mean that the processor objects in question are not valid. 
[ tglx: Add __init[data] annotations ] Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-7-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- drivers/acpi/acpi_processor.c | 79 +++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index f9f23fdd96a1..f27c709186c1 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -581,8 +581,87 @@ static struct acpi_scan_handler processor_container_handler = { .attach = acpi_processor_container_attach, }; +/* The number of the unique processor IDs */ +static int nr_unique_ids __initdata; + +/* The number of the duplicate processor IDs */ +static int nr_duplicate_ids __initdata; + +/* Used to store the unique processor IDs */ +static int unique_processor_ids[] __initdata = { + [0 ... NR_CPUS - 1] = -1, +}; + +/* Used to store the duplicate processor IDs */ +static int duplicate_processor_ids[] __initdata = { + [0 ... NR_CPUS - 1] = -1, +}; + +static void __init processor_validated_ids_update(int proc_id) +{ + int i; + + if (nr_unique_ids == NR_CPUS||nr_duplicate_ids == NR_CPUS) + return; + + /* + * Firstly, compare the proc_id with duplicate IDs, if the proc_id is + * already in the IDs, do nothing. + */ + for (i = 0; i < nr_duplicate_ids; i++) { + if (duplicate_processor_ids[i] == proc_id) + return; + } + + /* + * Secondly, compare the proc_id with unique IDs, if the proc_id is in + * the IDs, put it in the duplicate IDs. + */ + for (i = 0; i < nr_unique_ids; i++) { + if (unique_processor_ids[i] == proc_id) { + duplicate_processor_ids[nr_duplicate_ids] = proc_id; + nr_duplicate_ids++; + return; + } + } + + /* + * Lastly, the proc_id is a unique ID, put it in the unique IDs. 
+ */ + unique_processor_ids[nr_unique_ids] = proc_id; + nr_unique_ids++; +} + +static acpi_status __init acpi_processor_ids_walk(acpi_handle handle, + u32 lvl, + void *context, + void **rv) +{ + acpi_status status; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + + status = acpi_evaluate_object(handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) + acpi_handle_info(handle, "Not get the processor object\n"); + else + processor_validated_ids_update(object.processor.proc_id); + + return AE_OK; +} + +static void __init acpi_processor_check_duplicates(void) +{ + /* Search all processor nodes in ACPI namespace */ + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_processor_ids_walk, + NULL, NULL, NULL); +} + void __init acpi_processor_init(void) { + acpi_processor_check_duplicates(); acpi_scan_add_handler_with_hotplug(&processor_handler, "processor"); acpi_scan_add_handler(&processor_container_handler); } From fd74da217df7d4bd25e95411da64e0b92762842e Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 25 Aug 2016 16:35:20 +0800 Subject: [PATCH 482/538] acpi: Validate processor id when mapping the processor When we want to identify whether the proc_id is unreasonable or not, we can call the "acpi_processor_validate_proc_id" function. It will search in the duplicate IDs. If we find the proc_id in the IDs, we return true to the call function. Conversely, the false represents available. When we establish all possible cpuid <-> nodeid mapping to handle the cpu hotplugs, we will use the proc_id from ACPI table. We do validation when we get the proc_id. If the result is true, we will stop the mapping. [ tglx: Mark the new function __init ] Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-8-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- drivers/acpi/acpi_processor.c | 15 +++++++++++++++ drivers/acpi/processor_core.c | 4 ++++ include/linux/acpi.h | 3 +++ 3 files changed, 22 insertions(+) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index f27c709186c1..3de3b6b8f0f1 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -659,6 +659,21 @@ static void __init acpi_processor_check_duplicates(void) NULL, NULL, NULL); } +bool __init acpi_processor_validate_proc_id(int proc_id) +{ + int i; + + /* + * compare the proc_id with duplicate IDs, if the proc_id is already + * in the duplicate IDs, return true, otherwise, return false. 
+ */ + for (i = 0; i < nr_duplicate_ids; i++) { + if (duplicate_processor_ids[i] == proc_id) + return true; + } + return false; +} + void __init acpi_processor_init(void) { acpi_processor_check_duplicates(); diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 88019766a59a..9ac265f235b7 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -302,6 +302,10 @@ map_processor(acpi_handle handle, phys_cpuid_t *phys_id, int *cpuid) if (ACPI_FAILURE(status)) return false; acpi_id = object.processor.proc_id; + + /* validate the acpi_id */ + if(acpi_processor_validate_proc_id(acpi_id)) + return false; break; case ACPI_TYPE_DEVICE: status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5b4f9accf96b..7f307f3bd12c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -267,6 +267,9 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) return phys_id == PHYS_CPUID_INVALID; } +/* Validate the processor object's proc_id */ +bool acpi_processor_validate_proc_id(int proc_id); + #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu); From 463e8f845cbf1c01e4cc8aeef1703212991d8e1e Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Wed, 14 Sep 2016 15:24:12 +0200 Subject: [PATCH 483/538] i2c: mux: pca954x: retry updating the mux selection on failure The cached value of the last selected channel prevents retries on the next call, even on failure to update the selected channel. Fix that. Signed-off-by: Peter Rosin Signed-off-by: Wolfram Sang Cc: stable@kernel.org --- drivers/i2c/muxes/i2c-mux-pca954x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c index 528e755c468f..3278ebf1cc5c 100644 --- a/drivers/i2c/muxes/i2c-mux-pca954x.c +++ b/drivers/i2c/muxes/i2c-mux-pca954x.c @@ -164,7 +164,7 @@ static int pca954x_select_chan(struct i2c_mux_core *muxc, u32 chan) /* Only select the channel if its different from the last channel */ if (data->last_chan != regval) { ret = pca954x_reg_write(muxc->parent, client, regval); - data->last_chan = regval; + data->last_chan = ret ? 0 : regval; } return ret; From 1e5ec2e709bd8c5588fdbdda909945e4e2be8d23 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 15 Sep 2016 14:57:48 -0400 Subject: [PATCH 484/538] Btrfs: handle quota reserve failure properly btrfs/022 was spitting a warning for the case that we exceed the quota. If we fail to make our quota reservation we need to clean up our data space reservation. Thanks, Signed-off-by: Josef Bacik Tested-by: Jeff Mahoney Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d09cf7aa083b..db76cc18c562 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4271,13 +4271,10 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) if (ret < 0) return ret; - /* - * Use new btrfs_qgroup_reserve_data to reserve precious data space - * - * TODO: Find a good method to avoid reserve data space for NOCOW - * range, but don't impact performance on quota disable case. - */ + /* Use new btrfs_qgroup_reserve_data to reserve precious data space. 
*/ ret = btrfs_qgroup_reserve_data(inode, start, len); + if (ret) + btrfs_free_reserved_data_space_noquota(inode, start, len); return ret; } From 325c50e3cebb9208009083e841550f98a863bfa0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 21 Sep 2016 08:31:29 -0400 Subject: [PATCH 485/538] btrfs: ensure that file descriptor used with subvol ioctls is a dir If the subvol/snapshot create/destroy ioctls are passed a regular file with execute permissions set, we'll eventually Oops while trying to do inode->i_op->lookup via lookup_one_len. This patch ensures that the file descriptor refers to a directory. Fixes: cb8e70901d (Btrfs: Fix subvolume creation locking rules) Fixes: 76dda93c6a (Btrfs: add snapshot/subvolume destroy ioctl) Cc: #v2.6.29+ Signed-off-by: Jeff Mahoney Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b2a2da5893af..7fd939bfbd99 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1634,6 +1634,9 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, int namelen; int ret = 0; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + ret = mnt_want_write_file(file); if (ret) goto out; @@ -1691,6 +1694,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, struct btrfs_ioctl_vol_args *vol_args; int ret; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -1714,6 +1720,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, bool readonly = false; struct btrfs_qgroup_inherit *inherit = NULL; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -2357,6 +2366,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int ret; int err = 0; + if (!S_ISDIR(dir->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); From fba1296624bf95fc07057da1e26beee8a733180c Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 20 Sep 2016 14:55:31 +0300 Subject: [PATCH 486/538] net/mlx4_core: Fix to clean devlink resources This patch cleans devlink resources by calling devlink_port_unregister() to avoid the following issues: - Kernel panic when triggering reset flow. - Memory leak due to unfreed resources in mlx4_init_port_info(). Fixes: 09d4d087cd48 ("mlx4: Implement devlink interface") Signed-off-by: Kamal Heib Signed-off-by: Tariq Toukan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 75dd2e3d3059..7183ac4135d2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2970,6 +2970,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) mlx4_err(dev, "Failed to create mtu file for port %d\n", port); device_remove_file(&info->dev->persist->pdev->dev, &info->port_attr); + devlink_port_unregister(&info->devlink_port); info->port = -1; } @@ -2984,6 +2985,8 @@ static void mlx4_cleanup_port_info(struct mlx4_port_info *info) device_remove_file(&info->dev->persist->pdev->dev, &info->port_attr); device_remove_file(&info->dev->persist->pdev->dev, &info->port_mtu_attr); + devlink_port_unregister(&info->devlink_port); + #ifdef CONFIG_RFS_ACCEL free_irq_cpu_rmap(info->rmap); info->rmap = NULL; From adb03115f4590baa280ddc440a8eff08a6be0cb7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Sep 2016 18:06:17 -0700 Subject: [PATCH 487/538] net: get rid of an signed integer overflow in ip_idents_reserve() Jiri Pirko reported an UBSAN warning happening in ip_idents_reserve() [] UBSAN: Undefined behaviour in ./arch/x86/include/asm/atomic.h:156:11 [] signed integer overflow: [] -2117905507 + -695755206 cannot be represented in type 'int' Since we do not have uatomic_add_return() yet, use atomic_cmpxchg() so that the arithmetics can be done using unsigned int. Fixes: 04ca6973f7c1 ("ip: make IP identifiers less predictable") Signed-off-by: Eric Dumazet Reported-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/route.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a1f2830d8110..b5b47a26d4ec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -476,12 +476,18 @@ u32 ip_idents_reserve(u32 hash, int segs) atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; u32 old = ACCESS_ONCE(*p_tstamp); u32 now = (u32)jiffies; - u32 delta = 0; + u32 new, delta = 0; if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - return atomic_add_return(segs + delta, p_id) - segs; + /* Do not use atomic_add_return() as it makes UBSAN unhappy */ + do { + old = (u32)atomic_read(p_id); + new = old + delta + segs; + } while (atomic_cmpxchg(p_id, old, new) != old); + + return new - segs; } EXPORT_SYMBOL(ip_idents_reserve); From 75c9510b8f745f75280029a8a9f96567f55f401e Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 20 Sep 2016 23:33:15 -0400 Subject: [PATCH 488/538] MAINTAINERS: Update b44 maintainer. Taking over as maintainer since Gary Zambrano is no longer working for Broadcom. Signed-off-by: Michael Chan Acked-by: Florian Fainelli Signed-off-by: David S. 
Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 247b418959fa..3df4be3c4e0d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2500,7 +2500,7 @@ S: Supported F: kernel/bpf/ BROADCOM B44 10/100 ETHERNET DRIVER -M: Gary Zambrano +M: Michael Chan L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/broadcom/b44.* From de1d657816c6fbb70f07b01d50ec669dff0d4e60 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 21 Sep 2016 16:16:14 -0700 Subject: [PATCH 489/538] tcp: fix under-accounting retransmit SNMP counters This patch fixes these under-accounting SNMP rtx stats LINUX_MIB_TCPFORWARDRETRANS LINUX_MIB_TCPFASTRETRANS LINUX_MIB_TCPSLOWSTARTRETRANS when retransmitting TSO packets Fixes: 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time") Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f53d0cca5fa4..e15ec82a6319 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2831,7 +2831,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_retransmit_skb(sk, skb, segs)) return; - NET_INC_STATS(sock_net(sk), mib_idx); + NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); From 7e32b44361abc77fbc01f2b97b045c405b2583e5 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 21 Sep 2016 16:16:15 -0700 Subject: [PATCH 490/538] tcp: properly account Fast Open SYN-ACK retrans Since the TFO socket is accepted right off SYN-data, the socket owner can call getsockopt(TCP_INFO) to collect ongoing SYN-ACK retransmission or timeout stats (i.e., tcpi_total_retrans, tcpi_retransmits). Currently those stats are only updated upon handshake completes. This patch fixes it. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 2 ++ net/ipv4/tcp_timer.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ebf45b38bc3..08323bd95f2a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5885,7 +5885,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * so release it. */ if (req) { - tp->total_retrans = req->num_retrans; + inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); } else { /* Make sure socket is routed, for correct metrics. 
*/ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e15ec82a6319..5288cec4a2b2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3568,6 +3568,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) if (!res) { __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + if (unlikely(tcp_passive_fastopen(sk))) + tcp_sk(sk)->total_retrans++; } return res; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d84930b2dd95..f712b411f6ed 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -384,6 +384,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) */ inet_rtx_syn_ack(sk, req); req->num_timeout++; + icsk->icsk_retransmits++; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); } From 005d675aa1909ad70456dec8c5b0ba9b60b52d24 Mon Sep 17 00:00:00 2001 From: Jaehoon Chung Date: Thu, 22 Sep 2016 14:12:00 +0900 Subject: [PATCH 491/538] mmc: dw_mmc: fix the spamming log message When there is no card in a slot that uses "broken-cd", the clock information is printed continuously because the driver keeps polling for card detection. This patch fixes that. Fixes: 65257a0deed5 ("mmc: dw_mmc: remove UBSAN warning in dw_mci_setup_bus()") Reported-by: Tobias Jakobi Signed-off-by: Jaehoon Chung Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 14 +++++++++----- drivers/mmc/host/dw_mmc.h | 3 +++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 32380d5d4f6b..767af2026f8b 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -1112,11 +1112,12 @@ static void dw_mci_setup_bus(struct dw_mci_slot *slot, bool force_clkinit) div = (host->bus_hz != clock) ? DIV_ROUND_UP(div, 2) : 0; - dev_info(&slot->mmc->class_dev, - "Bus speed (slot %d) = %dHz (slot req %dHz, actual %dHZ div = %d)\n", - slot->id, host->bus_hz, clock, - div ? ((host->bus_hz / div) >> 1) : - host->bus_hz, div); + if (clock != slot->__clk_old || force_clkinit) + dev_info(&slot->mmc->class_dev, + "Bus speed (slot %d) = %dHz (slot req %dHz, actual %dHZ div = %d)\n", + slot->id, host->bus_hz, clock, + div ? ((host->bus_hz / div) >> 1) : + host->bus_hz, div); /* disable clock */ mci_writel(host, CLKENA, 0); @@ -1139,6 +1140,9 @@ static void dw_mci_setup_bus(struct dw_mci_slot *slot, bool force_clkinit) /* inform CIU */ mci_send_cmd(slot, sdmmc_cmd_bits, 0); + + /* keep the last clock value that was requested from core */ + slot->__clk_old = clock; } host->current_speed = clock; diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h index 9e740bc232a8..e8cd2dec3263 100644 --- a/drivers/mmc/host/dw_mmc.h +++ b/drivers/mmc/host/dw_mmc.h @@ -249,6 +249,8 @@ extern int dw_mci_resume(struct dw_mci *host); * @queue_node: List node for placing this node in the @queue list of * &struct dw_mci. * @clock: Clock rate configured by set_ios(). Protected by host->lock. + * @__clk_old: The last clock value that was requested from core. + * Keeping track of this helps us to avoid spamming the console. * @flags: Random state bits associated with the slot. * @id: Number of this slot. * @sdio_id: Number of this slot in the SDIO interrupt registers.
@@ -263,6 +265,7 @@ struct dw_mci_slot { struct list_head queue_node; unsigned int clock; + unsigned int __clk_old; unsigned long flags; #define DW_MMC_CARD_PRESENT 0 From 67492c86b33db0a8a056c72293d4802b37ac8ac6 Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Wed, 21 Sep 2016 11:09:12 -0500 Subject: [PATCH 492/538] x86/platform/uv/BAU: Clean up vertical alignment Fix whitespace on blocks of code to be vertically aligned. Signed-off-by: Andrew Banman Acked-by: Thomas Gleixner Acked-by: Mike Travis Acked-by: Dimitri Sivanich Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akpm@linux-foundation.org Cc: rja@sgi.com Link: http://lkml.kernel.org/r/1474474161-265604-2-git-send-email-abanman@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 64 +++++++++++++++++------------------ 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index fdb4d42b4ce5..b84c2a22424d 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -55,16 +55,16 @@ static int congested_reps = CONGESTED_REPS; static int disabled_period = DISABLED_PERIOD; static struct tunables tunables[] = { - {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ - {&plugged_delay, PLUGGED_DELAY}, - {&plugsb4reset, PLUGSB4RESET}, - {&timeoutsb4reset, TIMEOUTSB4RESET}, - {&ipi_reset_limit, IPI_RESET_LIMIT}, - {&complete_threshold, COMPLETE_THRESHOLD}, - {&congested_respns_us, CONGESTED_RESPONSE_US}, - {&congested_reps, CONGESTED_REPS}, - {&disabled_period, DISABLED_PERIOD}, - {&giveup_limit, GIVEUP_LIMIT} + {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ + {&plugged_delay, PLUGGED_DELAY}, + {&plugsb4reset, PLUGSB4RESET}, + {&timeoutsb4reset, TIMEOUTSB4RESET}, + {&ipi_reset_limit, IPI_RESET_LIMIT}, + {&complete_threshold, COMPLETE_THRESHOLD}, + {&congested_respns_us, CONGESTED_RESPONSE_US}, + {&congested_reps, CONGESTED_REPS}, + {&disabled_period, DISABLED_PERIOD}, + {&giveup_limit, GIVEUP_LIMIT} }; static struct dentry *tunables_dir; @@ -1619,17 +1619,17 @@ static ssize_t tunables_write(struct file *file, const char __user *user, for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); - bcp->max_concurr = max_concurr; - bcp->max_concurr_const = max_concurr; - bcp->plugged_delay = plugged_delay; - bcp->plugsb4reset = plugsb4reset; - bcp->timeoutsb4reset = timeoutsb4reset; - bcp->ipi_reset_limit = ipi_reset_limit; - bcp->complete_threshold = complete_threshold; - bcp->cong_response_us = congested_respns_us; - bcp->cong_reps = congested_reps; - bcp->disabled_period = sec_2_cycles(disabled_period); - bcp->giveup_limit = giveup_limit; + bcp->max_concurr = max_concurr; + bcp->max_concurr_const = max_concurr; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->cong_response_us = congested_respns_us; + bcp->cong_reps = congested_reps; + bcp->disabled_period = sec_2_cycles(disabled_period); + bcp->giveup_limit = giveup_limit; } return count; } @@ -1740,7 +1740,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) memset(bd2, 0, sizeof(struct bau_desc)); if (uv1) { uv1_hdr = &bd2->header.uv1_hdr; - uv1_hdr->swack_flag = 1; + uv1_hdr->swack_flag = 1; /* * The base_dest_nasid set in the message header * is the nasid of the first uvhub in the partition. 
@@ -1749,10 +1749,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) * if nasid striding is being used. */ uv1_hdr->base_dest_nasid = - UV_PNODE_TO_NASID(base_pnode); - uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID; - uv1_hdr->command = UV_NET_ENDPOINT_INTD; - uv1_hdr->int_both = 1; + UV_PNODE_TO_NASID(base_pnode); + uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv1_hdr->command = UV_NET_ENDPOINT_INTD; + uv1_hdr->int_both = 1; /* * all others need to be set to zero: * fairness chaining multilevel count replied_to @@ -1763,11 +1763,11 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) * uses native mode for selective broadcasts. */ uv2_3_hdr = &bd2->header.uv2_3_hdr; - uv2_3_hdr->swack_flag = 1; + uv2_3_hdr->swack_flag = 1; uv2_3_hdr->base_dest_nasid = - UV_PNODE_TO_NASID(base_pnode); - uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID; - uv2_3_hdr->command = UV_NET_ENDPOINT_INTD; + UV_PNODE_TO_NASID(base_pnode); + uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv2_3_hdr->command = UV_NET_ENDPOINT_INTD; } } for_each_present_cpu(cpu) { @@ -1914,8 +1914,8 @@ static void __init init_per_cpu_tunables(void) bcp->complete_threshold = complete_threshold; bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; - bcp->disabled_period = sec_2_cycles(disabled_period); - bcp->giveup_limit = giveup_limit; + bcp->disabled_period = sec_2_cycles(disabled_period); + bcp->giveup_limit = giveup_limit; spin_lock_init(&bcp->queue_lock); spin_lock_init(&bcp->uvhub_lock); spin_lock_init(&bcp->disable_lock); From efa59ab3e7526650265f0fd9696ef8be8d88ec13 Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Wed, 21 Sep 2016 11:09:13 -0500 Subject: [PATCH 493/538] x86/platform/uv/BAU: Clean up and update printks Replace all uses of printk with the appropriate pr_*() function. 
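As a side note on the conversion, here is a minimal userspace sketch (the "uv_bau: " prefix and the macros below are stand-ins, not code from this driver) of why the pr_*() helpers are preferred over raw printk(): in the kernel they expand through a per-file pr_fmt() prefix, so every message carries a consistent tag without repeating it at each call site.

#include <stdio.h>

/* Userspace stand-in for the kernel's pr_fmt()/pr_err() pairing:
 * define the prefix once and every helper picks it up automatically. */
#define pr_fmt(fmt)		"uv_bau: " fmt
#define pr_err(fmt, ...)	fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_err("cpu %d pnode %d beyond %d; BAU disabled\n", 3, 1, 256);
	return 0;
}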
Signed-off-by: Andrew Banman Acked-by: Thomas Gleixner Acked-by: Mike Travis Acked-by: Dimitri Sivanich Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akpm@linux-foundation.org Cc: rja@sgi.com Link: http://lkml.kernel.org/r/1474474161-265604-3-git-send-email-abanman@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index b84c2a22424d..8462fd1583ed 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1497,16 +1497,16 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user, } if (kstrtol(optstr, 10, &input_arg) < 0) { - printk(KERN_DEBUG "%s is invalid\n", optstr); + pr_debug("%s is invalid\n", optstr); return -EINVAL; } if (input_arg == 0) { elements = ARRAY_SIZE(stat_description); - printk(KERN_DEBUG "# cpu: cpu number\n"); - printk(KERN_DEBUG "Sender statistics:\n"); + pr_debug("# cpu: cpu number\n"); + pr_debug("Sender statistics:\n"); for (i = 0; i < elements; i++) - printk(KERN_DEBUG "%s\n", stat_description[i]); + pr_debug("%s\n", stat_description[i]); } else if (input_arg == -1) { for_each_present_cpu(cpu) { stat = &per_cpu(ptcstats, cpu); @@ -1554,7 +1554,7 @@ static int parse_tunables_write(struct bau_control *bcp, char *instr, break; } if (cnt != e) { - printk(KERN_INFO "bau tunable error: should be %d values\n", e); + pr_info("bau tunable error: should be %d values\n", e); return -EINVAL; } @@ -1571,7 +1571,7 @@ static int parse_tunables_write(struct bau_control *bcp, char *instr, continue; } if (val < 1 || val > bcp->cpus_in_uvhub) { - printk(KERN_DEBUG + pr_debug( "Error: BAU max concurrent %d is invalid\n", val); return -EINVAL; @@ -1676,21 +1676,21 @@ static int __init uv_ptc_init(void) proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, &proc_uv_ptc_operations); if (!proc_uv_ptc) { - printk(KERN_ERR "unable to create %s proc entry\n", + pr_err("unable to create %s proc entry\n", UV_PTC_BASENAME); return -EINVAL; } tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); if (!tunables_dir) { - printk(KERN_ERR "unable to create debugfs directory %s\n", + pr_err("unable to create debugfs directory %s\n", UV_BAU_TUNABLES_DIR); return -EINVAL; } tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, tunables_dir, NULL, &tunables_fops); if (!tunables_file) { - printk(KERN_ERR "unable to create debugfs file %s\n", + pr_err("unable to create debugfs file %s\n", UV_BAU_TUNABLES_FILE); return -EINVAL; } @@ -1944,7 +1944,7 @@ static int __init get_cpu_topology(int base_pnode, pnode = uv_cpu_hub_info(cpu)->pnode; if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) { - printk(KERN_EMERG + pr_emerg( "cpu %d pnode %d-%d beyond %d; BAU disabled\n", cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE); return 1; @@ -1969,7 +1969,7 @@ static int __init get_cpu_topology(int base_pnode, sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) { - printk(KERN_EMERG "%d cpus per socket invalid\n", + pr_emerg("%d cpus per socket invalid\n", sdp->num_cpus); return 1; } @@ -2036,14 +2036,14 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, else if (is_uv3_hub()) bcp->uvhub_version = 3; else { - printk(KERN_EMERG "uvhub version not 1, 2 or 3\n"); + pr_emerg("uvhub version not 1, 2, or 3\n"); return 1; } bcp->uvhub_master = *hmasterp; bcp->uvhub_cpu = uv_cpu_blade_processor_id(cpu); if 
(bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { - printk(KERN_EMERG "%d cpus per uvhub invalid\n", + pr_emerg("%d cpus per uvhub invalid\n", bcp->uvhub_cpu); return 1; } From d2a57afa53f3fdf9f68d1f4240ace85a7d20ca20 Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Wed, 21 Sep 2016 11:09:14 -0500 Subject: [PATCH 494/538] x86/platform/uv/BAU: Clean up pq_init() The payload queue first MMR requires the physical memory address and hub GNODE of where the payload queue resides in memory, but the associated variables are named as if the PNODE were used. Rename gnode-related variables and clarify the definitions of the payload queue head, last, and tail pointers. Signed-off-by: Andrew Banman Acked-by: Thomas Gleixner Acked-by: Mike Travis Acked-by: Dimitri Sivanich Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akpm@linux-foundation.org Cc: rja@sgi.com Link: http://lkml.kernel.org/r/1474474161-265604-4-git-send-email-abanman@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 2 +- arch/x86/platform/uv/tlb_uv.c | 19 ++++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index cc44d926c17e..cc058c6b1fc4 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -49,7 +49,7 @@ #define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? \ UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD) #define UV_DESC_PSHIFT 49 -#define UV_PAYLOADQ_PNODE_SHIFT 49 +#define UV_PAYLOADQ_GNODE_SHIFT 49 #define UV_PTC_BASENAME "sgi_uv/ptc_statistics" #define UV_BAU_BASENAME "sgi_uv/bau_tunables" #define UV_BAU_TUNABLES_DIR "sgi_uv" diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 8462fd1583ed..f6bc43b7e2a0 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1790,10 +1790,7 @@ static void pq_init(int node, int pnode) size_t plsize; char *cp; void *vp; - unsigned long pn; - unsigned long first; - unsigned long pn_first; - unsigned long last; + unsigned long gnode, first, last, tail; struct bau_pq_entry *pqp; struct bau_control *bcp; @@ -1814,16 +1811,16 @@ static void pq_init(int node, int pnode) bcp->bau_msg_head = pqp; bcp->queue_last = pqp + (DEST_Q_SIZE - 1); } - /* - * need the gnode of where the memory was really allocated - */ - pn = uv_gpa_to_gnode(uv_gpa(pqp)); + first = uv_physnodeaddr(pqp); - pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first; last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)); - write_mmr_payload_first(pnode, pn_first); - write_mmr_payload_tail(pnode, first); + tail = first; + gnode = uv_gpa_to_gnode(uv_gpa(pqp)); + first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail; + + write_mmr_payload_first(pnode, first); write_mmr_payload_last(pnode, last); + write_mmr_payload_tail(pnode, tail); write_gmmr_sw_ack(pnode, 0xffffUL); /* in effect, all msg_type's are set to MSG_NOOP */ From 60e1c842c7ea3dd6a65660864554565cc737dd86 Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Wed, 21 Sep 2016 11:09:15 -0500 Subject: [PATCH 495/538] x86/platform/uv/BAU: Convert uv_physnodeaddr() use to uv_gpa_to_offset() The BAU driver should use the functions provided by uv_hub.h rather than its own implementations. uv_physnodeaddr converts vaddrs to paddrs for BAU MMR fields, but this is done better by uv_gpa_to_offset. 
Signed-off-by: Andrew Banman Acked-by: Thomas Gleixner Acked-by: Mike Travis Acked-by: Dimitri Sivanich Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akpm@linux-foundation.org Cc: rja@sgi.com Link: http://lkml.kernel.org/r/1474474161-265604-5-git-send-email-abanman@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 2 -- arch/x86/platform/uv/tlb_uv.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index cc058c6b1fc4..a46f270e2789 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -55,8 +55,6 @@ #define UV_BAU_TUNABLES_DIR "sgi_uv" #define UV_BAU_TUNABLES_FILE "bau_tunables" #define WHITESPACE " \t\n" -#define uv_mmask ((1UL << uv_hub_info->m_val) - 1) -#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) #define cpubit_isset(cpu, bau_local_cpumask) \ test_bit((cpu), (bau_local_cpumask).bits) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index f6bc43b7e2a0..34b2a48143d5 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1812,8 +1812,8 @@ static void pq_init(int node, int pnode) bcp->queue_last = pqp + (DEST_Q_SIZE - 1); } - first = uv_physnodeaddr(pqp); - last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)); + first = uv_gpa_to_offset(uv_gpa(pqp)); + last = uv_gpa_to_offset(uv_gpa(pqp + (DEST_Q_SIZE - 1))); tail = first; gnode = uv_gpa_to_gnode(uv_gpa(pqp)); first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail; From 5e4f96fe2a61c759d5d47f8112813618805c85a0 Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Wed, 21 Sep 2016 11:09:16 -0500 Subject: [PATCH 496/538] x86/platform/uv/BAU: Add generic function pointers Many BAU functions have different implementations depending on the UV version. Rather than switching on the uvhub_version throughout the driver, we can define a set of operations for each version. This is especially beneficial for UV4, which will require many new MMR read/write functions. Currently, the set of abstracted functions are the same for UV1, UV2, and UV3. The functions were chosen because each one will have a different implementation for UV4. Other functions will be added as needed to handle new implementations or to cleanup the existing differences between UV1, UV2, and UV3, i.e. read_status and wait_completion. 
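To make the shape of this change concrete, a small self-contained C sketch of the dispatch pattern follows (the names bau_ops, uv2_ops, uv4_ops and hw_version are illustrative, not the driver's actual symbols): the operations struct is filled in once at init time based on the detected hardware generation, and every later call goes through ops.* with no version checks at the call sites.

#include <stdio.h>

/* Illustrative operations table: one function pointer per hw-specific access. */
struct bau_ops {
	unsigned long (*read_sw_ack)(void);
	void (*write_payload_first)(int pnode, unsigned long val);
};

/* Two hypothetical hardware generations with different register layouts. */
static unsigned long uv2_read_sw_ack(void) { return 0x2UL; }
static void uv2_write_payload_first(int p, unsigned long v) { printf("uv2: pnode %d <- %#lx\n", p, v); }

static unsigned long uv4_read_sw_ack(void) { return 0x4UL; }
static void uv4_write_payload_first(int p, unsigned long v) { printf("uv4: pnode %d <- %#lx\n", p, v); }

static const struct bau_ops uv2_ops = { uv2_read_sw_ack, uv2_write_payload_first };
static const struct bau_ops uv4_ops = { uv4_read_sw_ack, uv4_write_payload_first };

/* Chosen once at init, like the driver selecting uv123_bau_ops vs uv4_bau_ops. */
static struct bau_ops ops;

static void bau_init(int hw_version)
{
	ops = (hw_version >= 4) ? uv4_ops : uv2_ops;
}

int main(void)
{
	bau_init(4);
	/* Callers never test the version again; they just go through ops. */
	printf("sw_ack = %#lx\n", ops.read_sw_ack());
	ops.write_payload_first(0, 0xffffUL);
	return 0;
}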
Signed-off-by: Andrew Banman Acked-by: Thomas Gleixner Acked-by: Mike Travis Acked-by: Dimitri Sivanich Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akpm@linux-foundation.org Cc: rja@sgi.com Link: http://lkml.kernel.org/r/1474474161-265604-6-git-send-email-abanman@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 11 +++++++++++ arch/x86/platform/uv/tlb_uv.c | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index a46f270e2789..a7a93a5beb00 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -385,6 +385,17 @@ struct uv2_3_bau_msg_header { /* bits 127:120 */ }; +/* Abstracted BAU functions */ +struct bau_operations { + unsigned long (*read_l_sw_ack)(void); + unsigned long (*read_g_sw_ack)(int pnode); + unsigned long (*bau_gpa_to_offset)(unsigned long vaddr); + void (*write_l_sw_ack)(unsigned long mmr); + void (*write_g_sw_ack)(int pnode, unsigned long mmr); + void (*write_payload_first)(int pnode, unsigned long mmr); + void (*write_payload_last)(int pnode, unsigned long mmr); +}; + /* * The activation descriptor: * The format of the message to send, plus all accompanying control diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 34b2a48143d5..a33a43358e4e 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -24,6 +24,18 @@ #include #include +static struct bau_operations ops; + +static struct bau_operations uv123_bau_ops = { + .bau_gpa_to_offset = uv_gpa_to_offset, + .read_l_sw_ack = read_mmr_sw_ack, + .read_g_sw_ack = read_gmmr_sw_ack, + .write_l_sw_ack = write_mmr_sw_ack, + .write_g_sw_ack = write_gmmr_sw_ack, + .write_payload_first = write_mmr_payload_first, + .write_payload_last = write_mmr_payload_last, +}; + /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ static int timeout_base_ns[] = { 20, @@ -2135,6 +2147,13 @@ static int __init uv_bau_init(void) if (!is_uv_system()) return 0; + if (is_uv3_hub()) + ops = uv123_bau_ops; + else if (is_uv2_hub()) + ops = uv123_bau_ops; + else if (is_uv1_hub()) + ops = uv123_bau_ops; + for_each_possible_cpu(cur_cpu) { mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); From 21e3f12fc0e12181102ad0400bcb50bc7a027106 Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Wed, 21 Sep 2016 11:09:17 -0500 Subject: [PATCH 497/538] x86/platform/uv/BAU: Use generic function pointers Convert the use of UV version-specific functions to their abstracted counterparts. 
Signed-off-by: Andrew Banman Acked-by: Thomas Gleixner Acked-by: Mike Travis Acked-by: Dimitri Sivanich Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akpm@linux-foundation.org Cc: rja@sgi.com Link: http://lkml.kernel.org/r/1474474161-265604-7-git-send-email-abanman@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index a33a43358e4e..030d452c0b68 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -228,7 +228,7 @@ static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp, msg = mdp->msg; if (!msg->canceled && do_acknowledge) { dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; - write_mmr_sw_ack(dw); + ops.write_l_sw_ack(dw); } msg->replied_to = 1; msg->swack_vec = 0; @@ -264,7 +264,7 @@ static void bau_process_retry_msg(struct msg_desc *mdp, msg->swack_vec) == 0) && (msg2->sending_cpu == msg->sending_cpu) && (msg2->msg_type != MSG_NOOP)) { - mmr = read_mmr_sw_ack(); + mmr = ops.read_l_sw_ack(); msg_res = msg2->swack_vec; /* * This is a message retry; clear the resources held @@ -282,7 +282,7 @@ static void bau_process_retry_msg(struct msg_desc *mdp, stat->d_canceled++; cancel_count++; mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res; - write_mmr_sw_ack(mr); + ops.write_l_sw_ack(mr); } } } @@ -415,12 +415,12 @@ static void do_reset(void *ptr) /* * only reset the resource if it is still pending */ - mmr = read_mmr_sw_ack(); + mmr = ops.read_l_sw_ack(); msg_res = msg->swack_vec; mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res; if (mmr & msg_res) { stat->d_rcanceled++; - write_mmr_sw_ack(mr); + ops.write_l_sw_ack(mr); } } } @@ -1214,7 +1214,7 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) struct bau_pq_entry *msg = mdp->msg; struct bau_pq_entry *other_msg; - mmr_image = read_mmr_sw_ack(); + mmr_image = ops.read_l_sw_ack(); swack_vec = msg->swack_vec; if ((swack_vec & mmr_image) == 0) { @@ -1443,7 +1443,7 @@ static int ptc_seq_show(struct seq_file *file, void *data) /* destination side statistics */ seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", - read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)), + ops.read_g_sw_ack(uv_cpu_to_pnode(cpu)), stat->d_requestee, cycles_2_us(stat->d_time), stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, stat->d_nomsg, stat->d_retries, stat->d_canceled, @@ -1737,7 +1737,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) gpa = uv_gpa(bau_desc); n = uv_gpa_to_gnode(gpa); - m = uv_gpa_to_offset(gpa); + m = ops.bau_gpa_to_offset(gpa); if (is_uv1_hub()) uv1 = 1; @@ -1824,16 +1824,16 @@ static void pq_init(int node, int pnode) bcp->queue_last = pqp + (DEST_Q_SIZE - 1); } - first = uv_gpa_to_offset(uv_gpa(pqp)); - last = uv_gpa_to_offset(uv_gpa(pqp + (DEST_Q_SIZE - 1))); + first = ops.bau_gpa_to_offset(uv_gpa(pqp)); + last = ops.bau_gpa_to_offset(uv_gpa(pqp + (DEST_Q_SIZE - 1))); tail = first; gnode = uv_gpa_to_gnode(uv_gpa(pqp)); first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail; - write_mmr_payload_first(pnode, first); - write_mmr_payload_last(pnode, last); write_mmr_payload_tail(pnode, tail); - write_gmmr_sw_ack(pnode, 0xffffUL); + ops.write_payload_first(pnode, first); + ops.write_payload_last(pnode, last); + ops.write_g_sw_ack(pnode, 0xffffUL); /* in effect, all msg_type's are set to MSG_NOOP */ memset(pqp, 0, sizeof(struct bau_pq_entry) * 
DEST_Q_SIZE); From 58d4ab46f21e7e800a7597f271a23ec602796247 Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Wed, 21 Sep 2016 11:09:18 -0500 Subject: [PATCH 498/538] x86/platform/uv/BAU: Populate ->uvhub_version with UV4 version information Signed-off-by: Andrew Banman Acked-by: Thomas Gleixner Acked-by: Mike Travis Acked-by: Dimitri Sivanich Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akpm@linux-foundation.org Cc: rja@sgi.com Link: http://lkml.kernel.org/r/1474474161-265604-8-git-send-email-abanman@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 030d452c0b68..ddbeb1679b1a 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -2044,8 +2044,10 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, bcp->uvhub_version = 2; else if (is_uv3_hub()) bcp->uvhub_version = 3; + else if (is_uv4_hub()) + bcp->uvhub_version = 4; else { - pr_emerg("uvhub version not 1, 2, or 3\n"); + pr_emerg("uvhub version not 1, 2, 3, or 4\n"); return 1; } bcp->uvhub_master = *hmasterp; From e879c1124a6c5c3367f20a254909605e7ee938c1 Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Wed, 21 Sep 2016 11:09:19 -0500 Subject: [PATCH 499/538] x86/platform/uv/BAU: Disable software timeout on UV4 hardware Software timeouts are not currently supported on BAU for UV4. Instead, the BAU will rely on hardware-level fairness protocols to determine broadcast timeouts. Do not call enable_timeouts or calculate_destination_timeout on UV4. These functions write to pre-UV4 MMRs so they generate error messages on UV4. Signed-off-by: Andrew Banman Acked-by: Thomas Gleixner Acked-by: Mike Travis Acked-by: Dimitri Sivanich Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akpm@linux-foundation.org Cc: rja@sgi.com Link: http://lkml.kernel.org/r/1474474161-265604-9-git-send-email-abanman@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index ddbeb1679b1a..72a5de7cf2e0 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -2109,7 +2109,8 @@ static int __init init_per_cpu(int nuvhubs, int base_part_pnode) void *vp; struct uvhub_desc *uvhub_descs; - timeout_us = calculate_destination_timeout(); + if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub()) + timeout_us = calculate_destination_timeout(); vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); uvhub_descs = (struct uvhub_desc *)vp; @@ -2171,7 +2172,9 @@ static int __init uv_bau_init(void) uv_base_pnode = uv_blade_to_pnode(uvhub); } - enable_timeouts(); + /* software timeouts are not supported on UV4 */ + if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub()) + enable_timeouts(); if (init_per_cpu(nuvhubs, uv_base_pnode)) { set_bau_off(); From 6d78059bbc0ace5461938aaea8cda95eb6719898 Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Wed, 21 Sep 2016 11:09:20 -0500 Subject: [PATCH 500/538] x86/platform/uv/BAU: Fix payload queue setup on UV4 hardware The BAU on UV4 does not need to maintain the payload queue tail pointer. Do not initialize the tail pointer MMR on UV4. Note that write_payload_tail is not an abstracted BAU function since it is an operation specific to pre-UV4 versions. 
Then we must switch on the UV version to control its usage, for which we use uvhub_version rather than is_uv*_hub because it is quicker/more concise. Signed-off-by: Andrew Banman Acked-by: Thomas Gleixner Acked-by: Mike Travis Acked-by: Dimitri Sivanich Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akpm@linux-foundation.org Cc: rja@sgi.com Link: http://lkml.kernel.org/r/1474474161-265604-10-git-send-email-abanman@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 72a5de7cf2e0..7ca0e5c31477 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1826,11 +1826,19 @@ static void pq_init(int node, int pnode) first = ops.bau_gpa_to_offset(uv_gpa(pqp)); last = ops.bau_gpa_to_offset(uv_gpa(pqp + (DEST_Q_SIZE - 1))); - tail = first; - gnode = uv_gpa_to_gnode(uv_gpa(pqp)); - first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail; - write_mmr_payload_tail(pnode, tail); + /* + * Pre UV4, the gnode is required to locate the payload queue + * and the payload queue tail must be maintained by the kernel. + */ + bcp = &per_cpu(bau_control, smp_processor_id()); + if (bcp->uvhub_version <= 3) { + tail = first; + gnode = uv_gpa_to_gnode(uv_gpa(pqp)); + first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail; + write_mmr_payload_tail(pnode, tail); + } + ops.write_payload_first(pnode, first); ops.write_payload_last(pnode, last); ops.write_g_sw_ack(pnode, 0xffffUL); From 4f059d514f7119a4fdd9934189ff31f2c26b2647 Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Wed, 21 Sep 2016 11:09:21 -0500 Subject: [PATCH 501/538] x86/platform/uv/BAU: Add UV4-specific functions Add the UV4-specific function definitions and define an operations struct to implement them in the BAU driver. Many BAU MMRs, although functionally the same, have new addresses on UV4 due to hardware changes. Each MMR requires new read/write functions, but their implementation in the driver does not change. Thus, it is enough to enumerate them in the operations struct for the changes to take effect. 
Signed-off-by: Andrew Banman Acked-by: Thomas Gleixner Acked-by: Mike Travis Acked-by: Dimitri Sivanich Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akpm@linux-foundation.org Cc: rja@sgi.com Link: http://lkml.kernel.org/r/1474474161-265604-11-git-send-email-abanman@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 30 ++++++++++++++++++++++++++++++ arch/x86/platform/uv/tlb_uv.c | 15 ++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index a7a93a5beb00..57ab86d94d64 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -664,6 +664,16 @@ static inline void write_gmmr_activation(int pnode, unsigned long mmr_image) write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image); } +static inline void write_mmr_proc_payload_first(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UV4H_LB_PROC_INTD_QUEUE_FIRST, mmr_image); +} + +static inline void write_mmr_proc_payload_last(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UV4H_LB_PROC_INTD_QUEUE_LAST, mmr_image); +} + static inline void write_mmr_payload_first(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image); @@ -709,6 +719,26 @@ static inline unsigned long read_gmmr_sw_ack(int pnode) return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); } +static inline void write_mmr_proc_sw_ack(unsigned long mr) +{ + uv_write_local_mmr(UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR, mr); +} + +static inline void write_gmmr_proc_sw_ack(int pnode, unsigned long mr) +{ + write_gmmr(pnode, UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR, mr); +} + +static inline unsigned long read_mmr_proc_sw_ack(void) +{ + return read_lmmr(UV4H_LB_PROC_INTD_SOFT_ACK_PENDING); +} + +static inline unsigned long read_gmmr_proc_sw_ack(int pnode) +{ + return read_gmmr(pnode, UV4H_LB_PROC_INTD_SOFT_ACK_PENDING); +} + static inline void write_mmr_data_config(int pnode, unsigned long mr) { uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 7ca0e5c31477..56c5a3a3884a 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -36,6 +36,17 @@ static struct bau_operations uv123_bau_ops = { .write_payload_last = write_mmr_payload_last, }; +static struct bau_operations uv4_bau_ops = { + .bau_gpa_to_offset = uv_gpa_to_soc_phys_ram, + .read_l_sw_ack = read_mmr_proc_sw_ack, + .read_g_sw_ack = read_gmmr_proc_sw_ack, + .write_l_sw_ack = write_mmr_proc_sw_ack, + .write_g_sw_ack = write_gmmr_proc_sw_ack, + .write_payload_first = write_mmr_proc_payload_first, + .write_payload_last = write_mmr_proc_payload_last, +}; + + /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ static int timeout_base_ns[] = { 20, @@ -2158,7 +2169,9 @@ static int __init uv_bau_init(void) if (!is_uv_system()) return 0; - if (is_uv3_hub()) + if (is_uv4_hub()) + ops = uv4_bau_ops; + else if (is_uv3_hub()) ops = uv123_bau_ops; else if (is_uv2_hub()) ops = uv123_bau_ops; From 456bee986e0a372ad4beed5d3cedb3622633d9df Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 20 Sep 2016 20:35:55 +0800 Subject: [PATCH 502/538] KEYS: Fix skcipher IV clobbering The IV must not be modified by the skcipher operation so we need to duplicate it. 
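To illustrate the failure mode outside the kernel, a small C sketch follows (toy_crypt() and the payload struct are invented stand-ins for the skcipher walk, which advances the IV buffer in place as it runs): if the caller passes its only stored copy of the IV, that copy is clobbered by the operation, so the fix is to hand the API a throwaway copy on the stack.

#include <stdio.h>
#include <string.h>

#define BLOCK 16

/* Stand-in for a block-chaining cipher: XORs each block with the IV and
 * then advances the IV in place, the way a CBC walk updates its IV buffer. */
static void toy_crypt(unsigned char *buf, size_t len, unsigned char *iv)
{
	for (size_t off = 0; off < len; off += BLOCK) {
		for (int i = 0; i < BLOCK; i++) {
			buf[off + i] ^= iv[i];
			iv[i] = buf[off + i];	/* the IV is modified! */
		}
	}
}

struct payload {
	unsigned char iv[BLOCK];	/* must survive for later decryption */
	unsigned char data[2 * BLOCK];
};

int main(void)
{
	struct payload p = { .iv = "0123456789abcde", .data = "hello, keys" };
	unsigned char tmp_iv[BLOCK];

	/* Broken pattern: toy_crypt(p.data, sizeof(p.data), p.iv) would
	 * overwrite p.iv, so a later decrypt would use the wrong IV. */

	/* Fixed pattern (what the patch does): work on a stack copy. */
	memcpy(tmp_iv, p.iv, sizeof(tmp_iv));
	toy_crypt(p.data, sizeof(p.data), tmp_iv);

	printf("stored IV still intact: %.15s\n", (const char *)p.iv);
	return 0;
}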
Fixes: c3917fd9dfbc ("KEYS: Use skcipher") Cc: stable@vger.kernel.org Reported-by: Mimi Zohar Signed-off-by: Herbert Xu --- security/keys/encrypted-keys/encrypted.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 5adbfc32242f..17a06105ccb6 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -478,6 +479,7 @@ static int derived_key_encrypt(struct encrypted_key_payload *epayload, struct crypto_skcipher *tfm; struct skcipher_request *req; unsigned int encrypted_datalen; + u8 iv[AES_BLOCK_SIZE]; unsigned int padlen; char pad[16]; int ret; @@ -500,8 +502,8 @@ static int derived_key_encrypt(struct encrypted_key_payload *epayload, sg_init_table(sg_out, 1); sg_set_buf(sg_out, epayload->encrypted_data, encrypted_datalen); - skcipher_request_set_crypt(req, sg_in, sg_out, encrypted_datalen, - epayload->iv); + memcpy(iv, epayload->iv, sizeof(iv)); + skcipher_request_set_crypt(req, sg_in, sg_out, encrypted_datalen, iv); ret = crypto_skcipher_encrypt(req); tfm = crypto_skcipher_reqtfm(req); skcipher_request_free(req); @@ -581,6 +583,7 @@ static int derived_key_decrypt(struct encrypted_key_payload *epayload, struct crypto_skcipher *tfm; struct skcipher_request *req; unsigned int encrypted_datalen; + u8 iv[AES_BLOCK_SIZE]; char pad[16]; int ret; @@ -599,8 +602,8 @@ static int derived_key_decrypt(struct encrypted_key_payload *epayload, epayload->decrypted_datalen); sg_set_buf(&sg_out[1], pad, sizeof pad); - skcipher_request_set_crypt(req, sg_in, sg_out, encrypted_datalen, - epayload->iv); + memcpy(iv, epayload->iv, sizeof(iv)); + skcipher_request_set_crypt(req, sg_in, sg_out, encrypted_datalen, iv); ret = crypto_skcipher_decrypt(req); tfm = crypto_skcipher_reqtfm(req); skcipher_request_free(req); From 0cf43f509f72128196e23f5ade7e512a72152cc6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Sep 2016 17:04:57 +0800 Subject: [PATCH 503/538] crypto: rsa-pkcs1pad - Handle leading zero for decryption As the software RSA implementation now produces fixed-length output, we need to eliminate leading zeros in the calling code instead. This patch does just that for pkcs1pad decryption while signature verification was fixed in an earlier patch. 
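As a companion to the diff, a hedged userspace sketch of the unpadding flow it implements (pkcs1_v15_unpad() and the toy buffer are illustrative, and this version skips the constant-time care a real implementation needs): with fixed-length RSA output the result may still begin with the 0x00 byte the RSA layer used to strip, so it is skipped before checking the 0x02 block type, requiring at least eight non-zero padding bytes, and locating the 0x00 separator in front of the message.

#include <stdio.h>
#include <stddef.h>

/*
 * Toy PKCS#1 v1.5 "type 2" unpad: buf holds the fixed-length RSA output
 * (possibly with the leading 0x00 still present). Returns the message
 * length and sets *msg, or -1 on malformed padding.
 */
static int pkcs1_v15_unpad(const unsigned char *buf, size_t len,
			   size_t key_size, const unsigned char **msg)
{
	size_t pos;

	if (len < key_size - 1)
		return -1;
	if (len == key_size) {		/* leading zero not yet stripped */
		if (buf[0] != 0x00)
			return -1;
		buf++;
		len--;
	}
	if (buf[0] != 0x02)		/* block type 2 = encryption */
		return -1;
	for (pos = 1; pos < len; pos++)	/* find the 0x00 separator */
		if (buf[pos] == 0x00)
			break;
	if (pos < 9 || pos == len)	/* need >= 8 padding bytes + separator */
		return -1;
	pos++;
	*msg = buf + pos;
	return (int)(len - pos);
}

int main(void)
{
	/* 0x00 0x02 <8 nonzero pad bytes> 0x00 "hi" : key_size = 13 */
	const unsigned char em[13] = { 0x00, 0x02, 1, 2, 3, 4, 5, 6, 7, 8,
				       0x00, 'h', 'i' };
	const unsigned char *msg;
	int n = pkcs1_v15_unpad(em, sizeof(em), sizeof(em), &msg);

	if (n >= 0)
		printf("recovered %d byte(s): %.*s\n", n, n, (const char *)msg);
	return 0;
}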
Fixes: 9b45b7bba3d2 ("crypto: rsa - Generate fixed-length output") Reported-by: Mat Martineau Signed-off-by: Herbert Xu --- crypto/rsa-pkcs1pad.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c index 877019a6d3ea..8baab4307f7b 100644 --- a/crypto/rsa-pkcs1pad.c +++ b/crypto/rsa-pkcs1pad.c @@ -298,41 +298,48 @@ static int pkcs1pad_decrypt_complete(struct akcipher_request *req, int err) struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm); struct pkcs1pad_request *req_ctx = akcipher_request_ctx(req); + unsigned int dst_len; unsigned int pos; - - if (err == -EOVERFLOW) - /* Decrypted value had no leading 0 byte */ - err = -EINVAL; + u8 *out_buf; if (err) goto done; - if (req_ctx->child_req.dst_len != ctx->key_size - 1) { - err = -EINVAL; + err = -EINVAL; + dst_len = req_ctx->child_req.dst_len; + if (dst_len < ctx->key_size - 1) goto done; + + out_buf = req_ctx->out_buf; + if (dst_len == ctx->key_size) { + if (out_buf[0] != 0x00) + /* Decrypted value had no leading 0 byte */ + goto done; + + dst_len--; + out_buf++; } - if (req_ctx->out_buf[0] != 0x02) { - err = -EINVAL; + if (out_buf[0] != 0x02) goto done; - } - for (pos = 1; pos < req_ctx->child_req.dst_len; pos++) - if (req_ctx->out_buf[pos] == 0x00) + + for (pos = 1; pos < dst_len; pos++) + if (out_buf[pos] == 0x00) break; - if (pos < 9 || pos == req_ctx->child_req.dst_len) { - err = -EINVAL; + if (pos < 9 || pos == dst_len) goto done; - } pos++; - if (req->dst_len < req_ctx->child_req.dst_len - pos) + err = 0; + + if (req->dst_len < dst_len - pos) err = -EOVERFLOW; - req->dst_len = req_ctx->child_req.dst_len - pos; + req->dst_len = dst_len - pos; if (!err) sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, req->dst_len), - req_ctx->out_buf + pos, req->dst_len); + out_buf + pos, req->dst_len); done: kzfree(req_ctx->out_buf); From f0aa1ce6259eb65f53f969b3250c1d0aac84f30b Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Thu, 22 Sep 2016 12:02:25 +0300 Subject: [PATCH 504/538] regmap: fix deadlock on _regmap_raw_write() error path Commit 815806e39bf6 ("regmap: drop cache if the bus transfer error") added a call to regcache_drop_region() to error path in _regmap_raw_write(). However that path runs with regmap lock taken, and regcache_drop_region() tries to re-take it, causing a deadlock. Fix that by calling map->cache_ops->drop() directly. 
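A compact pthread sketch of the deadlock shape being avoided (cache_drop(), __cache_drop_locked() and map_write() are made-up names standing in for regcache_drop_region(), the cache_ops->drop callback and _regmap_raw_write()): the public entry point takes the map lock itself, so an error path that already holds that non-recursive lock has to call the unlocked internal helper instead of re-entering through the public API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

/* Internal helper: caller must already hold map_lock. */
static void __cache_drop_locked(int reg)
{
	printf("dropping cached value for reg %d\n", reg);
}

/* Public API: takes the lock itself, like regcache_drop_region(). */
static void cache_drop(int reg)
{
	pthread_mutex_lock(&map_lock);
	__cache_drop_locked(reg);
	pthread_mutex_unlock(&map_lock);
}

/* Write path: runs with map_lock held, like _regmap_raw_write(). */
static int map_write(int reg, int val)
{
	int ret;

	(void)val;
	pthread_mutex_lock(&map_lock);
	ret = -5;			/* pretend the bus transfer failed */
	if (ret != 0) {
		/* BUG: cache_drop(reg) here would re-take map_lock -> deadlock */
		/* FIX: call the unlocked helper directly */
		__cache_drop_locked(reg);
	}
	pthread_mutex_unlock(&map_lock);
	return ret;
}

int main(void)
{
	map_write(1, 0x42);
	return 0;
}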
Signed-off-by: Nikita Yushchenko Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 25d26bb18970..e964d068874d 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -1475,7 +1475,11 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg, kfree(buf); } else if (ret != 0 && !map->cache_bypass && map->format.parse_val) { - regcache_drop_region(map, reg, reg + 1); + /* regcache_drop_region() takes lock that we already have, + * thus call map->cache_ops->drop() directly + */ + if (map->cache_ops && map->cache_ops->drop) + map->cache_ops->drop(map, reg, reg + 1); } trace_regmap_hw_write_done(map, reg, val_len / map->format.val_bytes); From 917db484dc6a69969d317b3e57add4208a8d9d42 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 21 Sep 2016 12:50:45 -0700 Subject: [PATCH 505/538] x86/boot: Fix kdump, cleanup aborted E820_PRAM max_pfn manipulation In commit: ec776ef6bbe1 ("x86/mm: Add support for the non-standard protected e820 type") Christoph references the original patch I wrote implementing pmem support. The intent of the 'max_pfn' changes in that commit were to enable persistent memory ranges to be covered by the struct page memmap by default. However, that approach was abandoned when Christoph ported the patches [1], and that functionality has since been replaced by devm_memremap_pages(). In the meantime, this max_pfn manipulation is confusing kdump [2] that assumes that everything covered by the max_pfn is "System RAM". This results in kdump hanging or crashing. [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-March/000348.html [2]: https://bugzilla.redhat.com/show_bug.cgi?id=1351098 So fix it. Reported-by: Zhang Yi Reported-by: Jeff Moyer Tested-by: Zhang Yi Signed-off-by: Dan Williams Reviewed-by: Jeff Moyer Cc: # v4.1 and later kernels Cc: Andrew Morton Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ross Zwisler Cc: Thomas Gleixner Cc: linux-nvdimm@lists.01.org Fixes: ec776ef6bbe1 ("x86/mm: Add support for the non-standard protected e820 type") Link: http://lkml.kernel.org/r/147448744538.34910.11287693517367139607.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index bb8c69079c78..b85fe5f91c3f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -350,7 +350,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, * continue building up new bios map based on this * information */ - if (current_type != last_type || current_type == E820_PRAM) { + if (current_type != last_type) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; @@ -783,7 +783,7 @@ u64 __init early_reserve_e820(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) { int i; unsigned long last_pfn = 0; @@ -794,11 +794,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn) unsigned long start_pfn; unsigned long end_pfn; - /* - * Persistent memory is accounted as ram for purposes of - * establishing max_pfn and mem_map. 
- */ - if (ei->type != E820_RAM && ei->type != E820_PRAM) + if (ei->type != type) continue; start_pfn = ei->addr >> PAGE_SHIFT; @@ -823,12 +819,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn) } unsigned long __init e820_end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN); + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); } unsigned long __init e820_end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL << (32-PAGE_SHIFT)); + return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM); } static void __init early_panic(char *msg) From 4fa5cd5245b627db88c9ca08ae442373b02596b4 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 13 Sep 2016 16:27:05 +1000 Subject: [PATCH 506/538] sched/core: Do not use smp_processor_id() with preempt enabled in smpboot_thread_fn() We should not be using smp_processor_id() with preempt enabled. Bug identified and fix provided by Alfred Chen. Reported-by: Alfred Chen Signed-off-by: Con Kolivas Cc: Alfred Chen Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/2042051.3vvUWIM0vs@hex Signed-off-by: Ingo Molnar --- kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 13bc43d1fb22..fc0d8270f69e 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data) if (kthread_should_park()) { __set_current_state(TASK_RUNNING); - preempt_enable(); if (ht->park && td->status == HP_THREAD_ACTIVE) { BUG_ON(td->cpu != smp_processor_id()); ht->park(td->cpu); td->status = HP_THREAD_PARKED; } + preempt_enable(); kthread_parkme(); /* We might have been woken for stop */ continue; From 9bf6ffdabdd6e70a0b69d032a0aff091afe1773e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Aug 2016 15:06:04 +0200 Subject: [PATCH 507/538] locking/atomic, arch/sh: Fix ATOMIC_FETCH_OP() We cannot use the "z" constraint twice, since its a single register (r0). Change the one not used by movli.l/movco.l to "r". Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Signed-off-by: Ingo Molnar --- arch/sh/include/asm/atomic-llsc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/include/asm/atomic-llsc.h b/arch/sh/include/asm/atomic-llsc.h index caea2c45f6c2..1d159ce50f5a 100644 --- a/arch/sh/include/asm/atomic-llsc.h +++ b/arch/sh/include/asm/atomic-llsc.h @@ -60,7 +60,7 @@ static inline int atomic_fetch_##op(int i, atomic_t *v) \ " movco.l %0, @%3 \n" \ " bf 1b \n" \ " synco \n" \ - : "=&z" (temp), "=&z" (res) \ + : "=&z" (temp), "=&r" (res) \ : "r" (i), "r" (&v->counter) \ : "t"); \ \ From 8db549491c4a3ce9e1d509b75f78516e497f48ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 11 Sep 2016 10:36:26 +0200 Subject: [PATCH 508/538] smp: Allocate smp_call_on_cpu() workqueue on stack too The SMP IPI struct descriptor is allocated on the stack except for the workqueue and lockdep complains: INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 0 PID: 110 Comm: kworker/0:1 Not tainted 4.8.0-rc5+ #14 Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A13 05/11/2014 Workqueue: events smp_call_on_cpu_callback ... Call Trace: dump_stack register_lock_class ? __lock_acquire __lock_acquire ? __lock_acquire lock_acquire ? process_one_work process_one_work ? process_one_work worker_thread ? process_one_work ? 
process_one_work kthread ? kthread_create_on_node ret_from_fork So allocate it on the stack too. Signed-off-by: Peter Zijlstra (Intel) [ Test and write commit message. ] Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160911084323.jhtnpb4b37t5tlno@pd.tnic Signed-off-by: Ingo Molnar --- kernel/smp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/smp.c b/kernel/smp.c index f4f6137941cb..bba3b201668d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -759,13 +759,14 @@ static void smp_call_on_cpu_callback(struct work_struct *work) int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { struct smp_call_on_cpu_struct sscs = { - .work = __WORK_INITIALIZER(sscs.work, smp_call_on_cpu_callback), .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), .func = func, .data = par, .cpu = phys ? cpu : -1, }; + INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; From 0b8473570ce1af3e80da05b59f9321f30253de4d Mon Sep 17 00:00:00 2001 From: Cheng Chao Date: Wed, 14 Sep 2016 10:18:56 +0800 Subject: [PATCH 509/538] sched/core: Remove unnecessary initialization in sched_init() init_idle() is called immediately after: current->sched_class = &fair_sched_class; init_idle() sets: current->sched_class = &idle_sched_class; First assignment is superfluous. Signed-off-by: Cheng Chao Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1473819536-7398-1-git-send-email-cs.os.kernel@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 860070fba814..c5f020c601b2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7557,11 +7557,6 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, From bf89a304722f6904009499a31dc68ab9a5c9742e Mon Sep 17 00:00:00 2001 From: Cheng Chao Date: Wed, 14 Sep 2016 10:01:50 +0800 Subject: [PATCH 510/538] stop_machine: Avoid a sleep and wakeup in stop_one_cpu() In case @cpu == smp_proccessor_id(), we can avoid a sleep+wakeup cycle by doing a preemption. Callers such as sched_exec() can benefit from this change. Signed-off-by: Cheng Chao Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: chris@chris-wilson.co.uk Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1473818510-6779-1-git-send-email-cs.os.kernel@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++++-- kernel/stop_machine.c | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c5f020c601b2..ff4e3c066dc2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1063,8 +1063,12 @@ static int migration_cpu_stop(void *data) * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. 
*/ - if (task_rq(p) == rq && task_on_rq_queued(p)) - rq = __migrate_task(rq, p, arg->dest_cpu); + if (task_rq(p) == rq) { + if (task_on_rq_queued(p)) + rq = __migrate_task(rq, p, arg->dest_cpu); + else + p->wake_cpu = arg->dest_cpu; + } raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4a1ca5f6da7e..082e71f17a58 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -126,6 +126,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) cpu_stop_init_done(&done, 1); if (!cpu_stop_queue_work(cpu, &work)) return -ENOENT; + /* + * In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup + * cycle by doing a preemption: + */ + cond_resched(); wait_for_completion(&done.completion); return done.ret; } From 9af6528ee9b682df7f29dbee86fbba0b67eab944 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Sep 2016 18:37:29 +0200 Subject: [PATCH 511/538] sched/core: Optimize __schedule() Oleg noted that by making do_exit() use __schedule() for the TASK_DEAD context switch, we can avoid the TASK_DEAD special case currently in __schedule() because that avoids the extra preempt_disable() from schedule(). In order to facilitate this, create a do_task_dead() helper which we place in the scheduler code, such that it can access __schedule(). Also add some __noreturn annotations to the functions, there's no coming back from do_exit(). Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Cheng Chao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: chris@chris-wilson.co.uk Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20160913163729.GB5012@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 9 +++------ include/linux/sched.h | 2 ++ kernel/exit.c | 26 ++------------------------ kernel/sched/core.c | 38 +++++++++++++++++++++++++++----------- 4 files changed, 34 insertions(+), 41 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d96a6118d26a..74fd6f05bc5b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -259,17 +259,14 @@ static inline void might_fault(void) { } extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); __printf(1, 2) -void panic(const char *fmt, ...) - __noreturn __cold; +void panic(const char *fmt, ...) __noreturn __cold; void nmi_panic(struct pt_regs *regs, const char *msg); extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); extern int oops_may_print(void); -void do_exit(long error_code) - __noreturn; -void complete_and_exit(struct completion *, long) - __noreturn; +void do_exit(long error_code) __noreturn; +void complete_and_exit(struct completion *, long) __noreturn; /* Internal, do not use. 
*/ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); diff --git a/include/linux/sched.h b/include/linux/sched.h index d75024053e9b..f00ee8e90a29 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -448,6 +448,8 @@ static inline void io_schedule(void) io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); } +void __noreturn do_task_dead(void); + struct nsproxy; struct user_namespace; diff --git a/kernel/exit.c b/kernel/exit.c index 091a78be3b09..1e1d913914c0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -725,7 +725,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -void do_exit(long code) +void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -882,29 +882,7 @@ void do_exit(long code) exit_rcu(); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - smp_mb(); - raw_spin_unlock_wait(&tsk->pi_lock); - - /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = TASK_DEAD; - tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) - cpu_relax(); /* For when BUG is null */ + do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ff4e3c066dc2..b2ec53c1a974 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3331,17 +3331,6 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - /* - * do_exit() calls schedule() with preemption disabled as an exception; - * however we must fix that up, otherwise the next task will see an - * inconsistent (higher) preempt count. - * - * It also avoids the below schedule_debug() test from complaining - * about this. - */ - if (unlikely(prev->state == TASK_DEAD)) - preempt_enable_no_resched_notrace(); - schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3409,6 +3398,33 @@ static void __sched notrace __schedule(bool preempt) } STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ +void __noreturn do_task_dead(void) +{ + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(¤t->pi_lock); + + /* causes final put_task_struct in finish_task_switch(). */ + __set_current_state(TASK_DEAD); + current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ + __schedule(false); + BUG(); + /* Avoid "noreturn function does return". 
*/ + for (;;) + cpu_relax(); /* For when BUG is null */ +} + static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) From 35a773a07926a22bf19d77ee00024522279c4e68 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Sep 2016 12:57:53 +0200 Subject: [PATCH 512/538] sched/core: Avoid _cond_resched() for PREEMPT=y On fully preemptible kernels _cond_resched() is pointless, so avoid emitting any code for it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mikulas Patocka Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/sched/core.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index f00ee8e90a29..b99fcd1b341e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3209,7 +3209,11 @@ static inline int signal_pending_state(long state, struct task_struct *p) * cond_resched_lock() will drop the spinlock before scheduling, * cond_resched_softirq() will enable bhs before scheduling. */ +#ifndef CONFIG_PREEMPT extern int _cond_resched(void); +#else +static inline int _cond_resched(void) { return 0; } +#endif #define cond_resched() ({ \ ___might_sleep(__FILE__, __LINE__, 0); \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b2ec53c1a974..d7babcc7cb76 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4883,6 +4883,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +#ifndef CONFIG_PREEMPT int __sched _cond_resched(void) { if (should_resched(0)) { @@ -4892,6 +4893,7 @@ int __sched _cond_resched(void) return 0; } EXPORT_SYMBOL(_cond_resched); +#endif /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, From 8bf46a39be910937d4c9e8d999a7438a7ae1a75b Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 16 Sep 2016 18:28:51 -0700 Subject: [PATCH 513/538] sched/fair: Fix SCHED_HRTICK bug leading to late preemption of tasks SCHED_HRTICK feature is useful to preempt SCHED_FAIR tasks on-the-dot (just when they would have exceeded their ideal_runtime). It makes use of a per-CPU hrtimer resource and hence arming that hrtimer should be based on total SCHED_FAIR tasks a CPU has across its various cfs_rqs, rather than being based on number of tasks in a particular cfs_rq (as implemented currently). As a result, with current code, its possible for a running task (which is the sole task in its cfs_rq) to be preempted much after its ideal_runtime has elapsed, resulting in increased latency for tasks in other cfs_rq on same CPU. Fix this by arming sched hrtimer based on total number of SCHED_FAIR tasks a CPU has across its various cfs_rqs. 
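For context, this is how the check in hrtick_start_fair() reads after the change (a sketch of the surrounding function body; only the changed condition appears in the hunk below). With group scheduling, a lone task in its own cfs_rq sees cfs_rq->nr_running == 1 even though other SCHED_FAIR tasks are queued on the CPU, so the hrtimer was never armed; rq->cfs.h_nr_running is the hierarchical count of all fair tasks on the runqueue:

	if (rq->cfs.h_nr_running > 1) {
		u64 slice = sched_slice(cfs_rq, se);
		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
		s64 delta = slice - ran;

		if (delta < 0) {
			if (rq->curr == p)
				resched_curr(rq);
			return;
		}
		hrtick_start(rq, delta);
	}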
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Joonwoo Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1474075731-11550-1-git-send-email-joonwoop@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 986c10c25176..8fb4d1942c14 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4469,7 +4469,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) WARN_ON(task_rq(p) != rq); - if (cfs_rq->nr_running > 1) { + if (rq->cfs.h_nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; From a18a579e5f84daa74f64b1f1b652b4a6a8d6f8b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Sep 2016 11:05:31 +0200 Subject: [PATCH 514/538] sched/debug: Hide printk() by default Dietmar accidentally added an unconditional sched domain printk. Hide it behind the normal sched_debug flag. Reported-by: Christian Borntraeger Signed-off-by: Peter Zijlstra (Intel) Acked-by: Christian Borntraeger Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: cd92bfd3b8cb ("sched/core: Store maximum per-CPU capacity in root domain") [ Fixed !SCHED_DEBUG build failure. ] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d7babcc7cb76..8bae0cd09e9e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5739,6 +5739,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } } #else /* !CONFIG_SCHED_DEBUG */ + +# define sched_debug_enabled 0 # define sched_domain_debug(sd, cpu) do { } while (0) static inline bool sched_debug(void) { @@ -7006,7 +7008,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, } rcu_read_unlock(); - if (rq) { + if (rq && sched_debug_enabled) { pr_info("span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); } From b193049375b04df3ada8c3347b7083db95918bc3 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Mon, 19 Sep 2016 05:23:52 -0400 Subject: [PATCH 515/538] locking/pv-qspinlock: Use cmpxchg_release() in __pv_queued_spin_unlock() cmpxchg_release() is more lightweight than cmpxchg() on some archs (e.g. PPC); moreover, in __pv_queued_spin_unlock() we only need a RELEASE in the fast path (pairing with *_try_lock() or *_lock()). And the slow path has smp_store_release too. So it's safe to use cmpxchg_release here.
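For readers less familiar with the relaxed/acquire/release atomics, a minimal self-contained sketch of the pairing this change relies on; the flag and the sketch_*() names are illustrative, not the qspinlock code:

	static atomic_t flag = ATOMIC_INIT(0);

	static void sketch_lock(void)
	{
		/* ACQUIRE: nothing in the critical section can be
		 * reordered before the flag is observed taken. */
		while (atomic_cmpxchg_acquire(&flag, 0, 1) != 0)
			cpu_relax();
	}

	static void sketch_unlock(void)
	{
		/* RELEASE: every store in the critical section is ordered
		 * before the store clearing the flag; no full barrier is
		 * needed on the unlock side, which is what makes the
		 * release variant cheaper than a full cmpxchg() on PPC. */
		atomic_cmpxchg_release(&flag, 1, 0);
	}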
Suggested-by: Boqun Feng Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: benh@kernel.crashing.org Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: paulmck@linux.vnet.ibm.com Cc: paulus@samba.org Cc: virtualization@lists.linux-foundation.org Cc: waiman.long@hpe.com Link: http://lkml.kernel.org/r/1474277037-15200-2-git-send-email-xinhui.pan@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 3acf16d79cf4..e3b5520005db 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -540,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); if (likely(locked == _Q_LOCKED_VAL)) return; From 11d9684ca638aad99f740ef3abcba2aa4c9290bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2015 14:16:31 +0200 Subject: [PATCH 516/538] locking/percpu-rwsem: Add DEFINE_STATIC_PERCPU_RWSEMand percpu_rwsem_assert_held() Provide a static init and a standard locking assertion method. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: der.herr@hofr.at Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: riel@redhat.com Cc: tj@kernel.org Cc: viro@ZenIV.linux.org.uk Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/percpu-rwsem.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 146efefde2a1..d402d3924a91 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -16,6 +16,15 @@ struct percpu_rw_semaphore { int readers_block; }; +#define DEFINE_STATIC_PERCPU_RWSEM(name) \ +static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \ +static struct percpu_rw_semaphore name = { \ + .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \ + .read_count = &__percpu_rwsem_rc_##name, \ + .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ + .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \ +} + extern int __percpu_down_read(struct percpu_rw_semaphore *, int); extern void __percpu_up_read(struct percpu_rw_semaphore *); @@ -102,6 +111,9 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *); #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) +#define percpu_rwsem_assert_held(sem) \ + lockdep_assert_held(&(sem)->rw_sem) + static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, bool read, unsigned long ip) { From aba37660738325d48c913f3a952a7116d6e6a74b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2015 14:16:33 +0200 Subject: [PATCH 517/538] fs/locks: Replace lg_global with a percpu-rwsem Replace the global part of the lglock with a percpu-rwsem. Since fcl_lock is a spinlock and itself nests under i_lock, which too is a spinlock we cannot acquire sleeping locks at locks_{insert,remove}_global_locks(). We can however wrap all fcl_lock acquisitions with percpu_down_read such that all invocations of locks_{insert,remove}_global_locks() have that read lock held. 
This allows us to replace the lg_global part of the lglock with the write side of the rwsem. In the absense of writers, percpu_{down,up}_read() are free of atomic instructions. This further avoids the very long preempt-disable regions caused by lglock on larger machines. Signed-off-by: Peter Zijlstra (Intel) Cc: Al Viro Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: der.herr@hofr.at Cc: paulmck@linux.vnet.ibm.com Cc: riel@redhat.com Cc: tj@kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/locks.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/locks.c b/fs/locks.c index ee1b15f6fc13..8f609ec03364 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -164,6 +164,7 @@ int lease_break_time = 45; */ DEFINE_STATIC_LGLOCK(file_lock_lglock); static DEFINE_PER_CPU(struct hlist_head, file_lock_list); +DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. @@ -587,6 +588,8 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) /* Must be called with the flc_lock held! */ static void locks_insert_global_locks(struct file_lock *fl) { + percpu_rwsem_assert_held(&file_rwsem); + lg_local_lock(&file_lock_lglock); fl->fl_link_cpu = smp_processor_id(); hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list)); @@ -596,6 +599,8 @@ static void locks_insert_global_locks(struct file_lock *fl) /* Must be called with the flc_lock held! */ static void locks_delete_global_locks(struct file_lock *fl) { + percpu_rwsem_assert_held(&file_rwsem); + /* * Avoid taking lock if already unhashed. This is safe since this check * is done while holding the flc_lock, and new insertions into the list @@ -915,6 +920,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) return -ENOMEM; } + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -955,6 +961,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) out: spin_unlock(&ctx->flc_lock); + percpu_up_read(&file_rwsem); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); @@ -991,6 +998,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, new_fl2 = locks_alloc_lock(); } + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If @@ -1162,6 +1170,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, } out: spin_unlock(&ctx->flc_lock); + percpu_up_read(&file_rwsem); /* * Free any unused locks. 
*/ @@ -1436,6 +1445,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) return error; } + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1487,9 +1497,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) locks_insert_block(fl, new_fl); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); + percpu_up_read(&file_rwsem); + locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); + + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); @@ -1506,6 +1520,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) } out: spin_unlock(&ctx->flc_lock); + percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; @@ -1660,6 +1675,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr return -EINVAL; } + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); error = check_conflicting_open(dentry, arg, lease->fl_flags); @@ -1730,6 +1746,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr lease->fl_lmops->lm_setup(lease, priv); out: spin_unlock(&ctx->flc_lock); + percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); if (is_deleg) inode_unlock(inode); @@ -1752,6 +1769,7 @@ static int generic_delete_lease(struct file *filp, void *owner) return error; } + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file == filp && @@ -1764,6 +1782,7 @@ static int generic_delete_lease(struct file *filp, void *owner) if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); + percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); return error; } @@ -2703,6 +2722,7 @@ static void *locks_start(struct seq_file *f, loff_t *pos) struct locks_iterator *iter = f->private; iter->li_pos = *pos + 1; + percpu_down_write(&file_rwsem); lg_global_lock(&file_lock_lglock); spin_lock(&blocked_lock_lock); return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos); @@ -2721,6 +2741,7 @@ static void locks_stop(struct seq_file *f, void *v) { spin_unlock(&blocked_lock_lock); lg_global_unlock(&file_lock_lglock); + percpu_up_write(&file_rwsem); } static const struct seq_operations locks_seq_operations = { From 7c3f654d8e18942295eeda42f7d75494443980e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2015 14:16:34 +0200 Subject: [PATCH 518/538] fs/locks: Replace lg_local with a per-cpu spinlock As Oleg suggested, replace file_lock_list with a structure containing the hlist head and a spinlock. This completely removes the lglock from fs/locks. 
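Reduced to its essentials, the per-CPU list structure is sketched below; the example_*() names are illustrative, the real structure and accessors are in the hunks that follow:

	struct example_pcpu_list {
		spinlock_t lock;
		struct hlist_head hlist;
	};
	static DEFINE_PER_CPU(struct example_pcpu_list, example_list);

	static void example_insert(struct hlist_node *node)
	{
		struct example_pcpu_list *el = get_cpu_ptr(&example_list);

		spin_lock(&el->lock);	/* only this CPU's lock, no global lock */
		hlist_add_head(node, &el->hlist);
		spin_unlock(&el->lock);
		put_cpu_ptr(&example_list);
	}

(fs/locks itself relies on flc_lock and file_rwsem for the preemption protection that get_cpu_ptr()/put_cpu_ptr() provide in this sketch.)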
Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Al Viro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: der.herr@hofr.at Cc: paulmck@linux.vnet.ibm.com Cc: riel@redhat.com Cc: tj@kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/Kconfig | 1 + fs/locks.c | 47 +++++++++++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 2bc7ad775842..3ef62bad8f2b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -79,6 +79,7 @@ config EXPORTFS_BLOCK_OPS config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT default y + select PERCPU_RWSEM help This option enables standard file locking support, required for filesystems like NFS and for the flock() system diff --git a/fs/locks.c b/fs/locks.c index 8f609ec03364..c33aa77fada2 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -127,7 +127,6 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include @@ -158,12 +157,17 @@ int lease_break_time = 45; /* * The global file_lock_list is only used for displaying /proc/locks, so we - * keep a list on each CPU, with each list protected by its own spinlock via - * the file_lock_lglock. Note that alterations to the list also require that - * the relevant flc_lock is held. + * keep a list on each CPU, with each list protected by its own spinlock. + * Global serialization is done using file_rwsem. + * + * Note that alterations to the list also require that the relevant flc_lock is + * held. */ -DEFINE_STATIC_LGLOCK(file_lock_lglock); -static DEFINE_PER_CPU(struct hlist_head, file_lock_list); +struct file_lock_list_struct { + spinlock_t lock; + struct hlist_head hlist; +}; +static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list); DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); /* @@ -588,17 +592,21 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) /* Must be called with the flc_lock held! */ static void locks_insert_global_locks(struct file_lock *fl) { + struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list); + percpu_rwsem_assert_held(&file_rwsem); - lg_local_lock(&file_lock_lglock); + spin_lock(&fll->lock); fl->fl_link_cpu = smp_processor_id(); - hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list)); - lg_local_unlock(&file_lock_lglock); + hlist_add_head(&fl->fl_link, &fll->hlist); + spin_unlock(&fll->lock); } /* Must be called with the flc_lock held! 
*/ static void locks_delete_global_locks(struct file_lock *fl) { + struct file_lock_list_struct *fll; + percpu_rwsem_assert_held(&file_rwsem); /* @@ -608,9 +616,11 @@ static void locks_delete_global_locks(struct file_lock *fl) */ if (hlist_unhashed(&fl->fl_link)) return; - lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu); + + fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu); + spin_lock(&fll->lock); hlist_del_init(&fl->fl_link); - lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu); + spin_unlock(&fll->lock); } static unsigned long @@ -2723,9 +2733,8 @@ static void *locks_start(struct seq_file *f, loff_t *pos) iter->li_pos = *pos + 1; percpu_down_write(&file_rwsem); - lg_global_lock(&file_lock_lglock); spin_lock(&blocked_lock_lock); - return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos); + return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) @@ -2733,14 +2742,13 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos) struct locks_iterator *iter = f->private; ++iter->li_pos; - return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos); + return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos); } static void locks_stop(struct seq_file *f, void *v) __releases(&blocked_lock_lock) { spin_unlock(&blocked_lock_lock); - lg_global_unlock(&file_lock_lglock); percpu_up_write(&file_rwsem); } @@ -2782,10 +2790,13 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); - lg_lock_init(&file_lock_lglock, "file_lock_lglock"); - for_each_possible_cpu(i) - INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i)); + for_each_possible_cpu(i) { + struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i); + + spin_lock_init(&fll->lock); + INIT_HLIST_HEAD(&fll->hlist); + } return 0; } From 259d69b7f056bc9a543c7d184e791ef6c2775081 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Nov 2015 15:23:55 +0100 Subject: [PATCH 519/538] locking/percpu-rwsem: Add down_read_preempt_disable() Provide a down_read()/up_read() variant that keeps preemption disabled over the whole thing, when possible. This avoids a needless preemption point for constructs such as: percpu_down_read(&global_rwsem); spin_lock(&lock); ... spin_unlock(&lock); percpu_up_read(&global_rwsem); Which perturbs timings. 
In particular it was found to cure a performance regression in a follow up patch in fs/locks.c Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/percpu-rwsem.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index d402d3924a91..5b2e6159b744 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -28,7 +28,7 @@ static struct percpu_rw_semaphore name = { \ extern int __percpu_down_read(struct percpu_rw_semaphore *, int); extern void __percpu_up_read(struct percpu_rw_semaphore *); -static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem) { might_sleep(); @@ -46,13 +46,19 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem) __this_cpu_inc(*sem->read_count); if (unlikely(!rcu_sync_is_idle(&sem->rss))) __percpu_down_read(sem, false); /* Unconditional memory barrier */ - preempt_enable(); + barrier(); /* - * The barrier() from preempt_enable() prevents the compiler from + * The barrier() prevents the compiler from * bleeding the critical section out. */ } +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +{ + percpu_down_read_preempt_disable(sem); + preempt_enable(); +} + static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { int ret = 1; @@ -76,13 +82,13 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) return ret; } -static inline void percpu_up_read(struct percpu_rw_semaphore *sem) +static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem) { /* - * The barrier() in preempt_disable() prevents the compiler from + * The barrier() prevents the compiler from * bleeding the critical section out. */ - preempt_disable(); + barrier(); /* * Same as in percpu_down_read(). */ @@ -95,6 +101,12 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem) rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); } +static inline void percpu_up_read(struct percpu_rw_semaphore *sem) +{ + preempt_disable(); + percpu_up_read_preempt_enable(sem); +} + extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); From 87709e28dc7c669af1126aa7352ff6f7b035412d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 May 2016 16:48:35 +0200 Subject: [PATCH 520/538] fs/locks: Use percpu_down_read_preempt_disable() Avoid spurious preemption. 
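The read side in fs/locks.c then takes the shape sketched below (file_rwsem and flc_lock are the locks used in this series; the body is elided), with no preemption point between the rwsem and the spinlock:

	percpu_down_read_preempt_disable(&file_rwsem);
	spin_lock(&ctx->flc_lock);

	/* ... walk or modify the inode's lock lists ... */

	spin_unlock(&ctx->flc_lock);
	percpu_up_read_preempt_enable(&file_rwsem);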
Signed-off-by: Peter Zijlstra (Intel) Cc: Al Viro Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: der.herr@hofr.at Cc: paulmck@linux.vnet.ibm.com Cc: riel@redhat.com Cc: tj@kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/locks.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index c33aa77fada2..133fb2543d21 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -930,7 +930,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) return -ENOMEM; } - percpu_down_read(&file_rwsem); + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -971,7 +971,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) out: spin_unlock(&ctx->flc_lock); - percpu_up_read(&file_rwsem); + percpu_up_read_preempt_enable(&file_rwsem); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); @@ -1008,7 +1008,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, new_fl2 = locks_alloc_lock(); } - percpu_down_read(&file_rwsem); + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If @@ -1180,7 +1180,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, } out: spin_unlock(&ctx->flc_lock); - percpu_up_read(&file_rwsem); + percpu_up_read_preempt_enable(&file_rwsem); /* * Free any unused locks. */ @@ -1455,7 +1455,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) return error; } - percpu_down_read(&file_rwsem); + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1507,13 +1507,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) locks_insert_block(fl, new_fl); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); - percpu_up_read(&file_rwsem); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); - percpu_down_read(&file_rwsem); + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); @@ -1530,7 +1530,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) } out: spin_unlock(&ctx->flc_lock); - percpu_up_read(&file_rwsem); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; @@ -1685,7 +1685,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr return -EINVAL; } - percpu_down_read(&file_rwsem); + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); error = check_conflicting_open(dentry, arg, lease->fl_flags); @@ -1756,7 +1756,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr lease->fl_lmops->lm_setup(lease, priv); out: spin_unlock(&ctx->flc_lock); - percpu_up_read(&file_rwsem); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); if (is_deleg) inode_unlock(inode); @@ -1779,7 +1779,7 @@ static int generic_delete_lease(struct file *filp, void *owner) return error; } - percpu_down_read(&file_rwsem); + percpu_down_read_preempt_disable(&file_rwsem); 
spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file == filp && @@ -1792,7 +1792,7 @@ static int generic_delete_lease(struct file *filp, void *owner) if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); - percpu_up_read(&file_rwsem); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); return error; } From e6253970413d99f416f7de8bd516e5f1834d8216 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 21 Nov 2015 19:11:48 +0100 Subject: [PATCH 521/538] stop_machine: Remove stop_cpus_lock and lg_double_lock/unlock() stop_two_cpus() and stop_cpus() use stop_cpus_lock to avoid the deadlock, we need to ensure that the stopper functions can't be queued "backwards" from one another. This doesn't look nice; if we use lglock then we do not really need stopper->lock, cpu_stop_queue_work() could use lg_local_lock() under local_irq_save(). OTOH it would be even better to avoid lglock in stop_machine.c and remove lg_double_lock(). This patch adds "bool stop_cpus_in_progress" set/cleared by queue_stop_cpus_work(), and changes cpu_stop_queue_two_works() to busy wait until it is cleared. queue_stop_cpus_work() sets stop_cpus_in_progress = T lockless, but after it queues a work on CPU1 it must be visible to stop_two_cpus(CPU1, CPU2) which checks it under the same lock. And since stop_two_cpus() holds the 2nd lock too, queue_stop_cpus_work() can not clear stop_cpus_in_progress if it is also going to queue a work on CPU2, it needs to take that 2nd lock to do this. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151121181148.GA433@redhat.com Signed-off-by: Ingo Molnar --- include/linux/lglock.h | 5 ----- kernel/locking/lglock.c | 22 --------------------- kernel/stop_machine.c | 42 +++++++++++++++++++++++++---------------- 3 files changed, 26 insertions(+), 43 deletions(-) diff --git a/include/linux/lglock.h b/include/linux/lglock.h index c92ebd100d9b..0081f000e34b 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -52,15 +52,10 @@ struct lglock { static struct lglock name = { .lock = &name ## _lock } void lg_lock_init(struct lglock *lg, char *name); - void lg_local_lock(struct lglock *lg); void lg_local_unlock(struct lglock *lg); void lg_local_lock_cpu(struct lglock *lg, int cpu); void lg_local_unlock_cpu(struct lglock *lg, int cpu); - -void lg_double_lock(struct lglock *lg, int cpu1, int cpu2); -void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2); - void lg_global_lock(struct lglock *lg); void lg_global_unlock(struct lglock *lg); diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c index 951cfcd10b4a..86ae2aebf004 100644 --- a/kernel/locking/lglock.c +++ b/kernel/locking/lglock.c @@ -60,28 +60,6 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) } EXPORT_SYMBOL(lg_local_unlock_cpu); -void lg_double_lock(struct lglock *lg, int cpu1, int cpu2) -{ - BUG_ON(cpu1 == cpu2); - - /* lock in cpu order, just like lg_global_lock */ - if (cpu2 < cpu1) - swap(cpu1, cpu2); - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu1)); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu2)); -} - -void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2) -{ - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1)); - 
arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2)); - preempt_enable(); -} - void lg_global_lock(struct lglock *lg) { int i; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4a1ca5f6da7e..ae6f41fb9cba 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -20,7 +20,6 @@ #include #include #include -#include #include /* @@ -47,13 +46,9 @@ struct cpu_stopper { static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static bool stop_machine_initialized = false; -/* - * Avoids a race between stop_two_cpus and global stop_cpus, where - * the stoppers could get queued up in reverse order, leading to - * system deadlock. Using an lglock means stop_two_cpus remains - * relatively cheap. - */ -DEFINE_STATIC_LGLOCK(stop_cpus_lock); +/* static data for stop_cpus */ +static DEFINE_MUTEX(stop_cpus_mutex); +static bool stop_cpus_in_progress; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { @@ -230,14 +225,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); int err; - - lg_double_lock(&stop_cpus_lock, cpu1, cpu2); +retry: spin_lock_irq(&stopper1->lock); spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); err = -ENOENT; if (!stopper1->enabled || !stopper2->enabled) goto unlock; + /* + * Ensure that if we race with __stop_cpus() the stoppers won't get + * queued up in reverse order leading to system deadlock. + * + * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has + * queued a work on cpu1 but not on cpu2, we hold both locks. + * + * It can be falsely true but it is safe to spin until it is cleared, + * queue_stop_cpus_work() does everything under preempt_disable(). + */ + err = -EDEADLK; + if (unlikely(stop_cpus_in_progress)) + goto unlock; err = 0; __cpu_stop_queue_work(stopper1, work1); @@ -245,8 +252,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, unlock: spin_unlock(&stopper2->lock); spin_unlock_irq(&stopper1->lock); - lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); + if (unlikely(err == -EDEADLK)) { + while (stop_cpus_in_progress) + cpu_relax(); + goto retry; + } return err; } /** @@ -316,9 +327,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, return cpu_stop_queue_work(cpu, work_buf); } -/* static data for stop_cpus */ -static DEFINE_MUTEX(stop_cpus_mutex); - static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) @@ -332,7 +340,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ - lg_global_lock(&stop_cpus_lock); + preempt_disable(); + stop_cpus_in_progress = true; for_each_cpu(cpu, cpumask) { work = &per_cpu(cpu_stopper.stop_work, cpu); work->fn = fn; @@ -341,7 +350,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, if (cpu_stop_queue_work(cpu, work)) queued = true; } - lg_global_unlock(&stop_cpus_lock); + stop_cpus_in_progress = false; + preempt_enable(); return queued; } From d32cdbfb0ba319e44f75437afde868f7cafdc467 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Nov 2015 18:36:16 +0100 Subject: [PATCH 522/538] locking/lglock: Remove lglock implementation It is now unused, remove it before someone else thinks its a good idea to use this. 
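For reference, former lglock users now follow the percpu-rwsem pattern introduced earlier in this series; a sketch with an illustrative example_rwsem:

	DEFINE_STATIC_PERCPU_RWSEM(example_rwsem);

	/* mostly-local fast path, formerly lg_local_lock()/lg_local_unlock() */
	percpu_down_read(&example_rwsem);
	/* ... touch this CPU's share of the data ... */
	percpu_up_read(&example_rwsem);

	/* rare global path, formerly lg_global_lock()/lg_global_unlock() */
	percpu_down_write(&example_rwsem);
	/* ... touch every CPU's share of the data ... */
	percpu_up_write(&example_rwsem);

The per-CPU data itself still needs its own per-CPU spinlock on the read side, as fs/locks now does.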
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- Documentation/locking/lglock.txt | 166 ------------------------------- include/linux/lglock.h | 76 -------------- kernel/locking/Makefile | 1 - kernel/locking/lglock.c | 89 ----------------- 4 files changed, 332 deletions(-) delete mode 100644 Documentation/locking/lglock.txt delete mode 100644 include/linux/lglock.h delete mode 100644 kernel/locking/lglock.c diff --git a/Documentation/locking/lglock.txt b/Documentation/locking/lglock.txt deleted file mode 100644 index a6971e34fabe..000000000000 --- a/Documentation/locking/lglock.txt +++ /dev/null @@ -1,166 +0,0 @@ -lglock - local/global locks for mostly local access patterns ------------------------------------------------------------- - -Origin: Nick Piggin's VFS scalability series introduced during - 2.6.35++ [1] [2] -Location: kernel/locking/lglock.c - include/linux/lglock.h -Users: currently only the VFS and stop_machine related code - -Design Goal: ------------- - -Improve scalability of globally used large data sets that are -distributed over all CPUs as per_cpu elements. - -To manage global data structures that are partitioned over all CPUs -as per_cpu elements but can be mostly handled by CPU local actions -lglock will be used where the majority of accesses are cpu local -reading and occasional cpu local writing with very infrequent -global write access. - - -* deal with things locally whenever possible - - very fast access to the local per_cpu data - - reasonably fast access to specific per_cpu data on a different - CPU -* while making global action possible when needed - - by expensive access to all CPUs locks - effectively - resulting in a globally visible critical section. - -Design: -------- - -Basically it is an array of per_cpu spinlocks with the -lg_local_lock/unlock accessing the local CPUs lock object and the -lg_local_lock_cpu/unlock_cpu accessing a remote CPUs lock object -the lg_local_lock has to disable preemption as migration protection so -that the reference to the local CPUs lock does not go out of scope. -Due to the lg_local_lock/unlock only touching cpu-local resources it -is fast. Taking the local lock on a different CPU will be more -expensive but still relatively cheap. - -One can relax the migration constraints by acquiring the current -CPUs lock with lg_local_lock_cpu, remember the cpu, and release that -lock at the end of the critical section even if migrated. This should -give most of the performance benefits without inhibiting migration -though needs careful considerations for nesting of lglocks and -consideration of deadlocks with lg_global_lock. - -The lg_global_lock/unlock locks all underlying spinlocks of all -possible CPUs (including those off-line). The preemption disable/enable -are needed in the non-RT kernels to prevent deadlocks like: - - on cpu 1 - - task A task B - lg_global_lock - got cpu 0 lock - <<<< preempt <<<< - lg_local_lock_cpu for cpu 0 - spin on cpu 0 lock - -On -RT this deadlock scenario is resolved by the arch_spin_locks in the -lglocks being replaced by rt_mutexes which resolve the above deadlock -by boosting the lock-holder. - - -Implementation: ---------------- - -The initial lglock implementation from Nick Piggin used some complex -macros to generate the lglock/brlock in lglock.h - they were later -turned into a set of functions by Andi Kleen [7]. 
The change to functions -was motivated by the presence of multiple lock users and also by them -being easier to maintain than the generating macros. This change to -functions is also the basis to eliminated the restriction of not -being initializeable in kernel modules (the remaining problem is that -locks are not explicitly initialized - see lockdep-design.txt) - -Declaration and initialization: -------------------------------- - - #include - - DEFINE_LGLOCK(name) - or: - DEFINE_STATIC_LGLOCK(name); - - lg_lock_init(&name, "lockdep_name_string"); - - on UP this is mapped to DEFINE_SPINLOCK(name) in both cases, note - also that as of 3.18-rc6 all declaration in use are of the _STATIC_ - variant (and it seems that the non-static was never in use). - lg_lock_init is initializing the lockdep map only. - -Usage: ------- - -From the locking semantics it is a spinlock. It could be called a -locality aware spinlock. lg_local_* behaves like a per_cpu -spinlock and lg_global_* like a global spinlock. -No surprises in the API. - - lg_local_lock(*lglock); - access to protected per_cpu object on this CPU - lg_local_unlock(*lglock); - - lg_local_lock_cpu(*lglock, cpu); - access to protected per_cpu object on other CPU cpu - lg_local_unlock_cpu(*lglock, cpu); - - lg_global_lock(*lglock); - access all protected per_cpu objects on all CPUs - lg_global_unlock(*lglock); - - There are no _trylock variants of the lglocks. - -Note that the lg_global_lock/unlock has to iterate over all possible -CPUs rather than the actually present CPUs or a CPU could go off-line -with a held lock [4] and that makes it very expensive. A discussion on -these issues can be found at [5] - -Constraints: ------------- - - * currently the declaration of lglocks in kernel modules is not - possible, though this should be doable with little change. - * lglocks are not recursive. - * suitable for code that can do most operations on the CPU local - data and will very rarely need the global lock - * lg_global_lock/unlock is *very* expensive and does not scale - * on UP systems all lg_* primitives are simply spinlocks - * in PREEMPT_RT the spinlock becomes an rt-mutex and can sleep but - does not change the tasks state while sleeping [6]. - * in PREEMPT_RT the preempt_disable/enable in lg_local_lock/unlock - is downgraded to a migrate_disable/enable, the other - preempt_disable/enable are downgraded to barriers [6]. - The deadlock noted for non-RT above is resolved due to rt_mutexes - boosting the lock-holder in this case which arch_spin_locks do - not do. - -lglocks were designed for very specific problems in the VFS and probably -only are the right answer in these corner cases. Any new user that looks -at lglocks probably wants to look at the seqlock and RCU alternatives as -her first choice. There are also efforts to resolve the RCU issues that -currently prevent using RCU in place of view remaining lglocks. - -Note on brlock history: ------------------------ - -The 'Big Reader' read-write spinlocks were originally introduced by -Ingo Molnar in 2000 (2.4/2.5 kernel series) and removed in 2003. They -later were introduced by the VFS scalability patch set in 2.6 series -again as the "big reader lock" brlock [2] variant of lglock which has -been replaced by seqlock primitives or by RCU based primitives in the -3.13 kernel series as was suggested in [3] in 2003. The brlock was -entirely removed in the 3.13 kernel series. 
- -Link: 1 http://lkml.org/lkml/2010/8/2/81 -Link: 2 http://lwn.net/Articles/401738/ -Link: 3 http://lkml.org/lkml/2003/3/9/205 -Link: 4 https://lkml.org/lkml/2011/8/24/185 -Link: 5 http://lkml.org/lkml/2011/12/18/189 -Link: 6 https://www.kernel.org/pub/linux/kernel/projects/rt/ - patch series - lglocks-rt.patch.patch -Link: 7 http://lkml.org/lkml/2012/3/5/26 diff --git a/include/linux/lglock.h b/include/linux/lglock.h deleted file mode 100644 index 0081f000e34b..000000000000 --- a/include/linux/lglock.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Specialised local-global spinlock. Can only be declared as global variables - * to avoid overhead and keep things simple (and we don't want to start using - * these inside dynamically allocated structures). - * - * "local/global locks" (lglocks) can be used to: - * - * - Provide fast exclusive access to per-CPU data, with exclusive access to - * another CPU's data allowed but possibly subject to contention, and to - * provide very slow exclusive access to all per-CPU data. - * - Or to provide very fast and scalable read serialisation, and to provide - * very slow exclusive serialisation of data (not necessarily per-CPU data). - * - * Brlocks are also implemented as a short-hand notation for the latter use - * case. - * - * Copyright 2009, 2010, Nick Piggin, Novell Inc. - */ -#ifndef __LINUX_LGLOCK_H -#define __LINUX_LGLOCK_H - -#include -#include -#include -#include -#include - -#ifdef CONFIG_SMP - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -#define LOCKDEP_INIT_MAP lockdep_init_map -#else -#define LOCKDEP_INIT_MAP(a, b, c, d) -#endif - -struct lglock { - arch_spinlock_t __percpu *lock; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lock_class_key lock_key; - struct lockdep_map lock_dep_map; -#endif -}; - -#define DEFINE_LGLOCK(name) \ - static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ - = __ARCH_SPIN_LOCK_UNLOCKED; \ - struct lglock name = { .lock = &name ## _lock } - -#define DEFINE_STATIC_LGLOCK(name) \ - static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ - = __ARCH_SPIN_LOCK_UNLOCKED; \ - static struct lglock name = { .lock = &name ## _lock } - -void lg_lock_init(struct lglock *lg, char *name); -void lg_local_lock(struct lglock *lg); -void lg_local_unlock(struct lglock *lg); -void lg_local_lock_cpu(struct lglock *lg, int cpu); -void lg_local_unlock_cpu(struct lglock *lg, int cpu); -void lg_global_lock(struct lglock *lg); -void lg_global_unlock(struct lglock *lg); - -#else -/* When !CONFIG_SMP, map lglock to spinlock */ -#define lglock spinlock -#define DEFINE_LGLOCK(name) DEFINE_SPINLOCK(name) -#define DEFINE_STATIC_LGLOCK(name) static DEFINE_SPINLOCK(name) -#define lg_lock_init(lg, name) spin_lock_init(lg) -#define lg_local_lock spin_lock -#define lg_local_unlock spin_unlock -#define lg_local_lock_cpu(lg, cpu) spin_lock(lg) -#define lg_local_unlock_cpu(lg, cpu) spin_unlock(lg) -#define lg_global_lock spin_lock -#define lg_global_unlock spin_unlock -#endif - -#endif diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 31322a4275cd..6f88e352cd4f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o -obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c deleted file mode 100644 index 86ae2aebf004..000000000000 --- 
a/kernel/locking/lglock.c +++ /dev/null @@ -1,89 +0,0 @@ -/* See include/linux/lglock.h for description */ -#include -#include -#include -#include - -/* - * Note there is no uninit, so lglocks cannot be defined in - * modules (but it's fine to use them from there) - * Could be added though, just undo lg_lock_init - */ - -void lg_lock_init(struct lglock *lg, char *name) -{ - LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); -} -EXPORT_SYMBOL(lg_lock_init); - -void lg_local_lock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock); - -void lg_local_unlock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock); - -void lg_local_lock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock_cpu); - -void lg_local_unlock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock_cpu); - -void lg_global_lock(struct lglock *lg) -{ - int i; - - preempt_disable(); - lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_lock(lock); - } -} -EXPORT_SYMBOL(lg_global_lock); - -void lg_global_unlock(struct lglock *lg) -{ - int i; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_unlock(lock); - } - preempt_enable(); -} -EXPORT_SYMBOL(lg_global_unlock); From 3aa601492babdf3acdec89e5aa9c44e1a357a4d8 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 22 Sep 2016 15:56:21 +0800 Subject: [PATCH 523/538] clocksource/drivers/ti-32k: Prevent ftrace recursion Currently ti-32k can be used as a scheduler clock. We properly marked omap_32k_read_sched_clock() as notrace but we then call another function ti_32k_read_cycles() that _wasn't_ notrace. Having a traceable function in the sched_clock() path leads to a recursion within ftrace and a kernel crash. Fix this by adding notrace attribute to the ti_32k_read_cycles() function. 
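The rule being applied, as a sketch; the example_*() names and the register offset are illustrative, not the timer-ti-32k code:

	#define EXAMPLE_COUNTER_REG	0x10		/* illustrative offset */
	static void __iomem *example_base;		/* mapped at probe time */

	/* Everything reachable from the sched_clock() read hook must be
	 * notrace, otherwise ftrace recurses back into sched_clock(). */
	static u64 notrace example_read_counter(void)
	{
		return readl_relaxed(example_base + EXAMPLE_COUNTER_REG);
	}

	static u64 notrace example_sched_clock_read(void)
	{
		return example_read_counter();
	}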
Signed-off-by: Jisheng Zhang Cc: daniel.lezcano@linaro.org Cc: linux-arm-kernel@lists.infradead.org Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20160922075621.3725-1-jszhang@marvell.com Signed-off-by: Thomas Gleixner --- drivers/clocksource/timer-ti-32k.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-ti-32k.c b/drivers/clocksource/timer-ti-32k.c index 92b7e390f6c8..cf5b14e442e4 100644 --- a/drivers/clocksource/timer-ti-32k.c +++ b/drivers/clocksource/timer-ti-32k.c @@ -65,7 +65,7 @@ static inline struct ti_32k *to_ti_32k(struct clocksource *cs) return container_of(cs, struct ti_32k, cs); } -static cycle_t ti_32k_read_cycles(struct clocksource *cs) +static cycle_t notrace ti_32k_read_cycles(struct clocksource *cs) { struct ti_32k *ti = to_ti_32k(cs); From b536fd587044af02183b3c02690431b93154f0fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Sep 2016 07:48:17 -0700 Subject: [PATCH 524/538] timekeeping: Include the correct header for errno definitions asm-generic headers are only defaults for architectures. We need to get the proper definition, which goes through and . Signed-off-by: Christoph Hellwig Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/1474555697-8206-1-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- include/linux/timekeeping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 816b7543f81b..09168c52ab64 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -1,7 +1,7 @@ #ifndef _LINUX_TIMEKEEPING_H #define _LINUX_TIMEKEEPING_H -#include +#include /* Included from linux/ktime.h */ From 331dcf421c34d227784d07943eb01e4023a42b0a Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 25 Aug 2016 12:23:39 +0100 Subject: [PATCH 525/538] i2c: qup: skip qup_i2c_suspend if the device is already runtime suspended If the i2c device is already runtime suspended and qup_i2c_suspend is executed during suspend-to-idle or suspend-to-ram, it will result in the following splat: WARNING: CPU: 3 PID: 1593 at drivers/clk/clk.c:476 clk_core_unprepare+0x80/0x90 Modules linked in: CPU: 3 PID: 1593 Comm: bash Tainted: G W 4.8.0-rc3 #14 Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT) PC is at clk_core_unprepare+0x80/0x90 LR is at clk_unprepare+0x28/0x40 pc : [] lr : [] pstate: 60000145 Call trace: clk_core_unprepare+0x80/0x90 qup_i2c_disable_clocks+0x2c/0x68 qup_i2c_suspend+0x10/0x20 platform_pm_suspend+0x24/0x68 ... This patch fixes the issue by executing qup_i2c_pm_suspend_runtime conditionally in qup_i2c_suspend.
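The guard follows the usual pattern for drivers whose system-suspend path reuses the runtime-suspend callback; a sketch, with example_*() standing in for the qup_i2c_*() functions in the hunk below:

	static int example_suspend(struct device *device)
	{
		/*
		 * If runtime PM has already suspended the device its clocks
		 * are gated; unpreparing them again trips the
		 * clk_core_unprepare() warning shown above.
		 */
		if (!pm_runtime_suspended(device))
			return example_suspend_runtime(device);
		return 0;
	}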
Signed-off-by: Sudeep Holla Reviewed-by: Andy Gross Signed-off-by: Wolfram Sang Cc: stable@kernel.org --- drivers/i2c/busses/i2c-qup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-qup.c b/drivers/i2c/busses/i2c-qup.c index 501bd15cb78e..a8497cfdae6f 100644 --- a/drivers/i2c/busses/i2c-qup.c +++ b/drivers/i2c/busses/i2c-qup.c @@ -1599,7 +1599,8 @@ static int qup_i2c_pm_resume_runtime(struct device *device) #ifdef CONFIG_PM_SLEEP static int qup_i2c_suspend(struct device *device) { - qup_i2c_pm_suspend_runtime(device); + if (!pm_runtime_suspended(device)) + return qup_i2c_pm_suspend_runtime(device); return 0; } From 58cbbee2391ce3876e6eee80a4f2a7f025859c52 Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Thu, 22 Sep 2016 21:13:42 +0000 Subject: [PATCH 526/538] x86/platform/mellanox: Introduce support for Mellanox systems platform Enable system support for the Mellanox Technologies platform, which provides support for the next Mellanox basic systems: "msx6710", "msx6720", "msb7700", "msn2700", "msx1410", "msn2410", "msb7800", "msn2740", "msn2100" and also various number of derivative systems from the above basic types. The Kconfig controlling compilation of this code is: MLX_PLATFORM Signed-off-by: Vadim Pasternak Cc: jiri@resnulli.us Cc: gregkh@linuxfoundation.org Cc: platform-driver-x86@vger.kernel.org Cc: geert@linux-m68k.org Cc: linux@roeck-us.net Cc: akpm@linux-foundation.org Cc: mchehab@kernel.org Cc: davem@davemloft.net Cc: kvalo@codeaurora.org Link: http://lkml.kernel.org/r/1474578822-33805-1-git-send-email-vadimp@mellanox.com Signed-off-by: Thomas Gleixner --- MAINTAINERS | 6 + arch/x86/Kconfig | 12 + arch/x86/platform/Makefile | 1 + arch/x86/platform/mellanox/Makefile | 1 + arch/x86/platform/mellanox/mlx-platform.c | 266 ++++++++++++++++++++++ 5 files changed, 286 insertions(+) create mode 100644 arch/x86/platform/mellanox/Makefile create mode 100644 arch/x86/platform/mellanox/mlx-platform.c diff --git a/MAINTAINERS b/MAINTAINERS index a0ce40f4c66c..2adc6ef635ce 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7668,6 +7668,12 @@ W: http://www.mellanox.com Q: http://patchwork.ozlabs.org/project/netdev/list/ F: drivers/net/ethernet/mellanox/mlxsw/ +MELLANOX PLATFORM DRIVER +M: Vadim Pasternak +L: platform-driver-x86@vger.kernel.org +S: Supported +F: arch/x86/platform/mellanox/mlx-platform.c + SOFT-ROCE DRIVER (rxe) M: Moni Shoua L: linux-rdma@vger.kernel.org diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2a1f0ce7c59a..a72b01b225f2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -549,6 +549,18 @@ config X86_INTEL_QUARK Say Y here if you have a Quark based system such as the Arduino compatible Intel Galileo. +config MLX_PLATFORM + tristate "Mellanox Technologies platform support" + depends on X86_64 + depends on X86_EXTENDED_PLATFORM + ---help--- + This option enables system support for the Mellanox Technologies + platform. + + Say Y here if you are building a kernel for Mellanox system. + + Otherwise, say N. 
+ config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" depends on X86 && ACPI diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 184842ef332e..3c3c19ea94df 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -8,6 +8,7 @@ obj-y += iris/ obj-y += intel/ obj-y += intel-mid/ obj-y += intel-quark/ +obj-y += mellanox/ obj-y += olpc/ obj-y += scx200/ obj-y += sfi/ diff --git a/arch/x86/platform/mellanox/Makefile b/arch/x86/platform/mellanox/Makefile new file mode 100644 index 000000000000..f43c93188a1d --- /dev/null +++ b/arch/x86/platform/mellanox/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_MLX_PLATFORM) += mlx-platform.o diff --git a/arch/x86/platform/mellanox/mlx-platform.c b/arch/x86/platform/mellanox/mlx-platform.c new file mode 100644 index 000000000000..e1dc1526b5f8 --- /dev/null +++ b/arch/x86/platform/mellanox/mlx-platform.c @@ -0,0 +1,266 @@ +/* + * arch/x86/platform/mellanox/mlx-platform.c + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Vadim Pasternak + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define MLX_PLAT_DEVICE_NAME "mlxplat" + +/* LPC bus IO offsets */ +#define MLXPLAT_CPLD_LPC_I2C_BASE_ADRR 0x2000 +#define MLXPLAT_CPLD_LPC_REG_BASE_ADRR 0x2500 +#define MLXPLAT_CPLD_LPC_IO_RANGE 0x100 +#define MLXPLAT_CPLD_LPC_I2C_CH1_OFF 0xdb +#define MLXPLAT_CPLD_LPC_I2C_CH2_OFF 0xda +#define MLXPLAT_CPLD_LPC_PIO_OFFSET 0x10000UL +#define MLXPLAT_CPLD_LPC_REG1 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \ + MLXPLAT_CPLD_LPC_I2C_CH1_OFF) | \ + MLXPLAT_CPLD_LPC_PIO_OFFSET) +#define MLXPLAT_CPLD_LPC_REG2 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \ + MLXPLAT_CPLD_LPC_I2C_CH2_OFF) | \ + MLXPLAT_CPLD_LPC_PIO_OFFSET) + +/* Start channel numbers */ +#define MLXPLAT_CPLD_CH1 2 +#define MLXPLAT_CPLD_CH2 10 + +/* Number of LPC attached MUX platform devices */ +#define MLXPLAT_CPLD_LPC_MUX_DEVS 2 + +/* mlxplat_priv - platform private data + * @pdev_i2c - i2c controller platform device + * @pdev_mux - array of mux platform devices + */ +struct mlxplat_priv { + struct platform_device *pdev_i2c; + struct platform_device *pdev_mux[MLXPLAT_CPLD_LPC_MUX_DEVS]; +}; + +/* Regions for LPC I2C controller and LPC base register space */ +static const struct resource mlxplat_lpc_resources[] = { + [0] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_I2C_BASE_ADRR, + MLXPLAT_CPLD_LPC_IO_RANGE, + "mlxplat_cpld_lpc_i2c_ctrl", IORESOURCE_IO), + [1] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_REG_BASE_ADRR, + MLXPLAT_CPLD_LPC_IO_RANGE, + "mlxplat_cpld_lpc_regs", + IORESOURCE_IO), +}; + +/* Platform default channels */ +static const int mlxplat_default_channels[][8] = { + { + MLXPLAT_CPLD_CH1, MLXPLAT_CPLD_CH1 + 1, MLXPLAT_CPLD_CH1 + 2, + MLXPLAT_CPLD_CH1 + 3, MLXPLAT_CPLD_CH1 + 4, MLXPLAT_CPLD_CH1 + + 5, MLXPLAT_CPLD_CH1 + 6, MLXPLAT_CPLD_CH1 + 7 + }, + { + MLXPLAT_CPLD_CH2, MLXPLAT_CPLD_CH2 + 1, MLXPLAT_CPLD_CH2 + 2, + MLXPLAT_CPLD_CH2 + 3, MLXPLAT_CPLD_CH2 + 4, MLXPLAT_CPLD_CH2 + + 5, MLXPLAT_CPLD_CH2 + 6, MLXPLAT_CPLD_CH2 + 7 + }, +}; + +/* Platform channels for MSN21xx system family */ +static const int mlxplat_msn21xx_channels[] = { 1, 2, 3, 4, 5, 6, 7, 8 }; + +/* Platform mux data */ +static struct i2c_mux_reg_platform_data mlxplat_mux_data[] = { + { + .parent = 1, + .base_nr = MLXPLAT_CPLD_CH1, + .write_only = 1, + .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG1, + .reg_size = 1, + .idle_in_use = 1, + }, + { + .parent = 1, + .base_nr = MLXPLAT_CPLD_CH2, + .write_only = 1, + .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG2, + .reg_size = 1, + .idle_in_use = 1, + }, + +}; + +static struct platform_device *mlxplat_dev; + +static int __init mlxplat_dmi_default_matched(const struct dmi_system_id *dmi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { + mlxplat_mux_data[i].values = mlxplat_default_channels[i]; + mlxplat_mux_data[i].n_values = + ARRAY_SIZE(mlxplat_default_channels[i]); + } + + return 1; +}; + +static int __init mlxplat_dmi_msn21xx_matched(const struct dmi_system_id *dmi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { + mlxplat_mux_data[i].values = mlxplat_msn21xx_channels; + mlxplat_mux_data[i].n_values = + ARRAY_SIZE(mlxplat_msn21xx_channels); + } + + return 1; +}; + +static struct dmi_system_id mlxplat_dmi_table[] __initdata = { + { + .callback = mlxplat_dmi_default_matched, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), + DMI_MATCH(DMI_PRODUCT_NAME, "MSN24"), + }, + }, + { + .callback = mlxplat_dmi_default_matched, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), + 
DMI_MATCH(DMI_PRODUCT_NAME, "MSN27"), + }, + }, + { + .callback = mlxplat_dmi_default_matched, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), + DMI_MATCH(DMI_PRODUCT_NAME, "MSB"), + }, + }, + { + .callback = mlxplat_dmi_default_matched, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), + DMI_MATCH(DMI_PRODUCT_NAME, "MSX"), + }, + }, + { + .callback = mlxplat_dmi_msn21xx_matched, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), + DMI_MATCH(DMI_PRODUCT_NAME, "MSN21"), + }, + }, + { } +}; + +static int __init mlxplat_init(void) +{ + struct mlxplat_priv *priv; + int i, err; + + if (!dmi_check_system(mlxplat_dmi_table)) + return -ENODEV; + + mlxplat_dev = platform_device_register_simple(MLX_PLAT_DEVICE_NAME, -1, + mlxplat_lpc_resources, + ARRAY_SIZE(mlxplat_lpc_resources)); + + if (!mlxplat_dev) + return -ENOMEM; + + priv = devm_kzalloc(&mlxplat_dev->dev, sizeof(struct mlxplat_priv), + GFP_KERNEL); + if (!priv) { + err = -ENOMEM; + goto fail_alloc; + } + platform_set_drvdata(mlxplat_dev, priv); + + priv->pdev_i2c = platform_device_register_simple("i2c_mlxcpld", -1, + NULL, 0); + if (IS_ERR(priv->pdev_i2c)) { + err = PTR_ERR(priv->pdev_i2c); + goto fail_alloc; + }; + + for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { + priv->pdev_mux[i] = platform_device_register_resndata( + &mlxplat_dev->dev, + "i2c-mux-reg", i, NULL, + 0, &mlxplat_mux_data[i], + sizeof(mlxplat_mux_data[i])); + if (IS_ERR(priv->pdev_mux[i])) { + err = PTR_ERR(priv->pdev_mux[i]); + goto fail_platform_mux_register; + } + } + + return 0; + +fail_platform_mux_register: + for (i--; i > 0 ; i--) + platform_device_unregister(priv->pdev_mux[i]); + platform_device_unregister(priv->pdev_i2c); +fail_alloc: + platform_device_unregister(mlxplat_dev); + + return err; +} +module_init(mlxplat_init); + +static void __exit mlxplat_exit(void) +{ + struct mlxplat_priv *priv = platform_get_drvdata(mlxplat_dev); + int i; + + for (i = ARRAY_SIZE(mlxplat_mux_data) - 1; i >= 0 ; i--) + platform_device_unregister(priv->pdev_mux[i]); + + platform_device_unregister(priv->pdev_i2c); + platform_device_unregister(mlxplat_dev); +} +module_exit(mlxplat_exit); + +MODULE_AUTHOR("Vadim Pasternak (vadimp@mellanox.com)"); +MODULE_DESCRIPTION("Mellanox platform driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("dmi:*:*Mellanox*:MSN24*:"); +MODULE_ALIAS("dmi:*:*Mellanox*:MSN27*:"); +MODULE_ALIAS("dmi:*:*Mellanox*:MSB*:"); +MODULE_ALIAS("dmi:*:*Mellanox*:MSX*:"); +MODULE_ALIAS("dmi:*:*Mellanox*:MSN21*:"); From 3b4ac78610690bd83fb33762ef97e8b8a89285ae Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 22 Sep 2016 19:58:17 -0600 Subject: [PATCH 527/538] nvme-rdma: only clear queue flags after successful connect Otherwise, nvme_rdma_stop_and_clear_queue() will incorrectly try to stop/free rdma qps/cm_ids that are already freed. 
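For illustration, a rough sketch of the ordering this change establishes (simplified stand-ins such as connect_queue() and the Q_* bits, not the driver's actual code): the DELETING bit stays set until the connect succeeds, so a later teardown pass still sees it and skips a queue whose qp/cm_id were never (re)created.

    #include <linux/bitops.h>
    #include <linux/errno.h>

    #define Q_DELETING	0
    #define Q_CONNECTED	1

    /* Stand-in for the rdma queue creation/connect step. */
    static int connect_queue(void)
    {
            return 0;
    }

    static int example_init_queue(unsigned long *flags)
    {
            /* Deliberately no "*flags = 0" here: that would clear DELETING
             * before the queue owns a valid qp/cm_id again. */
            if (connect_queue())
                    return -EIO;            /* DELETING still set: teardown skips us */

            clear_bit(Q_DELETING, flags);   /* only now is the queue live */
            set_bit(Q_CONNECTED, flags);
            return 0;
    }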
Fixes: e89ca58f9c90 ("nvme-rdma: add DELETING queue flag") Reported-by: Steve Wise Tested-by: Steve Wise Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index c2c2c28e6eb5..fbdb2267e460 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -561,7 +561,6 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, queue = &ctrl->queues[idx]; queue->ctrl = ctrl; - queue->flags = 0; init_completion(&queue->cm_done); if (idx > 0) @@ -595,6 +594,7 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, goto out_destroy_cm_id; } + clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags); return 0; From c1fad9ef7ed14aad464972e6444e7a3bd5670f26 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 22 Sep 2016 16:21:25 -0500 Subject: [PATCH 528/538] objtool: Add do_task_dead() to global noreturn list objtool reports the following new warning: kernel/exit.o: warning: objtool: do_exit() falls through to next function complete_and_exit() The warning is caused by do_exit()'s new call to do_task_dead(), which is a new "noreturn" function which objtool doesn't know about yet, introduced by: 9af6528ee9b6 ("sched/core: Optimize __schedule()") ( objtool has to know all the global noreturn functions so it can follow the control flow of any functions which call them. Unfortunately they need to be hard-coded because there's no automated way to detect them. ) Reported-by: kbuild test robot Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kbuild-all@01.org Cc: tipbuild@zytor.com Link: http://lkml.kernel.org/r/20160922212125.zbuewckqll4yur25@treble Signed-off-by: Ingo Molnar --- tools/objtool/builtin-check.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index bd09d0effef8..143b6cdd7f06 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -175,6 +175,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func, "__stack_chk_fail", "panic", "do_exit", + "do_task_dead", "__module_put_and_exit", "complete_and_exit", "kvm_spurious_fault", From 96b03ab86d843524ec4aed7fe0ceef412c684c68 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 22 Sep 2016 16:55:13 -0400 Subject: [PATCH 529/538] locking/hung_task: Fix typo in CONFIG_DETECT_HUNG_TASK help text Fix the indefinitiley -> indefinitely typo in Kconfig.debug. Signed-off-by: Vivien Didelot Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160922205513.17821-1-vivien.didelot@savoirfairelinux.com Signed-off-by: Ingo Molnar --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2e2cca509231..cab7405f48d2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -821,7 +821,7 @@ config DETECT_HUNG_TASK help Say Y here to enable the kernel to detect "hung tasks", which are bugs that cause the task to be stuck in - uninterruptible "D" state indefinitiley. + uninterruptible "D" state indefinitely. 
When a hung task is detected, the kernel will print the current stack trace (which you should report), but the From c18df0adabf8400c1825b90382d06df5edc303fa Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 20 Sep 2016 11:46:35 -0700 Subject: [PATCH 530/538] arm64: Call numa_store_cpu_info() earlier. The wq_numa_init() function makes a private CPU to node map by calling cpu_to_node() early in the boot process, before the non-boot CPUs are brought online. Since the default implementation of cpu_to_node() returns zero for CPUs that have never been brought online, the workqueue system's view is that *all* CPUs are on node zero. When the unbound workqueue for a non-zero node is created, the tsk_cpus_allowed() for the worker threads is the empty set because there are, in the view of the workqueue system, no CPUs on non-zero nodes. The code in try_to_wake_up() using this empty cpumask ends up using the cpumask empty set value of NR_CPUS as an index into the per-CPU area pointer array, and gets garbage as it is one past the end of the array. This results in: [ 0.881970] Unable to handle kernel paging request at virtual address fffffb1008b926a4 [ 1.970095] pgd = fffffc00094b0000 [ 1.973530] [fffffb1008b926a4] *pgd=0000000000000000, *pud=0000000000000000, *pmd=0000000000000000 [ 1.982610] Internal error: Oops: 96000004 [#1] SMP [ 1.987541] Modules linked in: [ 1.990631] CPU: 48 PID: 295 Comm: cpuhp/48 Tainted: G W 4.8.0-rc6-preempt-vol+ #9 [ 1.999435] Hardware name: Cavium ThunderX CN88XX board (DT) [ 2.005159] task: fffffe0fe89cc300 task.stack: fffffe0fe8b8c000 [ 2.011158] PC is at try_to_wake_up+0x194/0x34c [ 2.015737] LR is at try_to_wake_up+0x150/0x34c [ 2.020318] pc : [] lr : [] pstate: 600000c5 [ 2.027803] sp : fffffe0fe8b8fb10 [ 2.031149] x29: fffffe0fe8b8fb10 x28: 0000000000000000 [ 2.036522] x27: fffffc0008c63bc8 x26: 0000000000001000 [ 2.041896] x25: fffffc0008c63c80 x24: fffffc0008bfb200 [ 2.047270] x23: 00000000000000c0 x22: 0000000000000004 [ 2.052642] x21: fffffe0fe89d25bc x20: 0000000000001000 [ 2.058014] x19: fffffe0fe89d1d00 x18: 0000000000000000 [ 2.063386] x17: 0000000000000000 x16: 0000000000000000 [ 2.068760] x15: 0000000000000018 x14: 0000000000000000 [ 2.074133] x13: 0000000000000000 x12: 0000000000000000 [ 2.079505] x11: 0000000000000000 x10: 0000000000000000 [ 2.084879] x9 : 0000000000000000 x8 : 0000000000000000 [ 2.090251] x7 : 0000000000000040 x6 : 0000000000000000 [ 2.095621] x5 : ffffffffffffffff x4 : 0000000000000000 [ 2.100991] x3 : 0000000000000000 x2 : 0000000000000000 [ 2.106364] x1 : fffffc0008be4c24 x0 : ffffff0ffffada80 [ 2.111737] [ 2.113236] Process cpuhp/48 (pid: 295, stack limit = 0xfffffe0fe8b8c020) [ 2.120102] Stack: (0xfffffe0fe8b8fb10 to 0xfffffe0fe8b90000) [ 2.125914] fb00: fffffe0fe8b8fb80 fffffc00080e7648 . . . 
[ 2.442859] Call trace: [ 2.445327] Exception stack(0xfffffe0fe8b8f940 to 0xfffffe0fe8b8fa70) [ 2.451843] f940: fffffe0fe89d1d00 0000040000000000 fffffe0fe8b8fb10 fffffc00080e7468 [ 2.459767] f960: fffffe0fe8b8f980 fffffc00080e4958 ffffff0ff91ab200 fffffc00080e4b64 [ 2.467690] f980: fffffe0fe8b8f9d0 fffffc00080e515c fffffe0fe8b8fa80 0000000000000000 [ 2.475614] f9a0: fffffe0fe8b8f9d0 fffffc00080e58e4 fffffe0fe8b8fa80 0000000000000000 [ 2.483540] f9c0: fffffe0fe8d10000 0000000000000040 fffffe0fe8b8fa50 fffffc00080e5ac4 [ 2.491465] f9e0: ffffff0ffffada80 fffffc0008be4c24 0000000000000000 0000000000000000 [ 2.499387] fa00: 0000000000000000 ffffffffffffffff 0000000000000000 0000000000000040 [ 2.507309] fa20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2.515233] fa40: 0000000000000000 0000000000000000 0000000000000000 0000000000000018 [ 2.523156] fa60: 0000000000000000 0000000000000000 [ 2.528089] [] try_to_wake_up+0x194/0x34c [ 2.533723] [] wake_up_process+0x28/0x34 [ 2.539275] [] create_worker+0x110/0x19c [ 2.544824] [] alloc_unbound_pwq+0x3cc/0x4b0 [ 2.550724] [] wq_update_unbound_numa+0x10c/0x1e4 [ 2.557066] [] workqueue_online_cpu+0x220/0x28c [ 2.563234] [] cpuhp_invoke_callback+0x6c/0x168 [ 2.569398] [] cpuhp_up_callbacks+0x44/0xe4 [ 2.575210] [] cpuhp_thread_fun+0x13c/0x148 [ 2.581027] [] smpboot_thread_fn+0x19c/0x1a8 [ 2.586929] [] kthread+0xdc/0xf0 [ 2.591776] [] ret_from_fork+0x10/0x50 [ 2.597147] Code: b00057e1 91304021 91005021 b8626822 (b8606821) [ 2.603464] ---[ end trace 58c0cd36b88802bc ]--- [ 2.608138] Kernel panic - not syncing: Fatal exception Fix by moving call to numa_store_cpu_info() for all CPUs into smp_prepare_cpus(), which happens before wq_numa_init(). Since smp_store_cpu_info() now contains only a single function call, simplify by removing the function and out-lining its contents. Suggested-by: Robert Richter Fixes: 1a2db300348b ("arm64, numa: Add NUMA support for arm64 platforms.") Cc: # 4.7.x- Signed-off-by: David Daney Reviewed-by: Robert Richter Tested-by: Yisheng Xie Signed-off-by: Catalin Marinas --- arch/arm64/kernel/smp.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index d93d43352504..3ff173e92582 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -201,12 +201,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) return ret; } -static void smp_store_cpu_info(unsigned int cpuid) -{ - store_cpu_topology(cpuid); - numa_store_cpu_info(cpuid); -} - /* * This is the secondary CPU boot entry. We're using this CPUs * idle thread stack, but a set of temporary page tables. @@ -254,7 +248,7 @@ asmlinkage void secondary_start_kernel(void) */ notify_cpu_starting(cpu); - smp_store_cpu_info(cpu); + store_cpu_topology(cpu); /* * OK, now it's safe to let the boot CPU continue. 
Wait for @@ -689,10 +683,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus) { int err; unsigned int cpu; + unsigned int this_cpu; init_cpu_topology(); - smp_store_cpu_info(smp_processor_id()); + this_cpu = smp_processor_id(); + store_cpu_topology(this_cpu); + numa_store_cpu_info(this_cpu); /* * If UP is mandated by "nosmp" (which implies "maxcpus=0"), don't set @@ -719,6 +716,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) continue; set_cpu_present(cpu, true); + numa_store_cpu_info(cpu); } } From 67787b68ec48c239d5ec12f9bf5adaf5c459517a Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 23 Sep 2016 16:42:08 +0900 Subject: [PATCH 531/538] arm64: kgdb: handle read-only text / modules Handle read-only cases when CONFIG_DEBUG_RODATA (4.0) or CONFIG_DEBUG_SET_MODULE_RONX (3.18) are enabled by using aarch64_insn_write() instead of probe_kernel_write() as introduced by commit 2f896d586610 ("arm64: use fixmap for text patching") in 4.0. Fixes: 11d91a770f1f ("arm64: Add CONFIG_DEBUG_SET_MODULE_RONX support") Signed-off-by: AKASHI Takahiro Reviewed-by: Mark Rutland Cc: Will Deacon Cc: Jason Wessel Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/debug-monitors.h | 2 -- arch/arm64/kernel/kgdb.c | 36 ++++++++++++++++--------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 4b6b3f72a215..b71420a12f26 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -61,8 +61,6 @@ #define AARCH64_BREAK_KGDB_DYN_DBG \ (AARCH64_BREAK_MON | (KGDB_DYN_DBG_BRK_IMM << 5)) -#define KGDB_DYN_BRK_INS_BYTE(x) \ - ((AARCH64_BREAK_KGDB_DYN_DBG >> (8 * (x))) & 0xff) #define CACHE_FLUSH_IS_SAFE 1 diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 8c57f6496e56..e017a9493b92 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -19,10 +19,13 @@ * along with this program. If not, see . */ +#include #include #include #include #include +#include +#include #include struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -338,15 +341,24 @@ void kgdb_arch_exit(void) unregister_die_notifier(&kgdb_notifier); } -/* - * ARM instructions are always in LE. - * Break instruction is encoded in LE format - */ -struct kgdb_arch arch_kgdb_ops = { - .gdb_bpt_instr = { - KGDB_DYN_BRK_INS_BYTE(0), - KGDB_DYN_BRK_INS_BYTE(1), - KGDB_DYN_BRK_INS_BYTE(2), - KGDB_DYN_BRK_INS_BYTE(3), - } -}; +struct kgdb_arch arch_kgdb_ops; + +int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + + BUILD_BUG_ON(AARCH64_INSN_SIZE != BREAK_INSTR_SIZE); + + err = aarch64_insn_read((void *)bpt->bpt_addr, (u32 *)bpt->saved_instr); + if (err) + return err; + + return aarch64_insn_write((void *)bpt->bpt_addr, + (u32)AARCH64_BREAK_KGDB_DYN_DBG); +} + +int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ + return aarch64_insn_write((void *)bpt->bpt_addr, + *(u32 *)bpt->saved_instr); +} From c183a603e8d8a5a189729b77d0c623a3d5950e5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 Sep 2016 17:08:04 +0200 Subject: [PATCH 532/538] acpi: Fix broken error check in map_processor() map_processor() checks the cpuid value returned by acpi_map_cpuid() for -1 but acpi_map_cpuid() returns -EINVAL in case of error. As a consequence the error is ignored and the following access into percpu data with that negative cpuid results in a boot crash. 
This happens always when NR_CPUS/nr_cpu_ids is smaller than the number of processors listed in the ACPI tables. Use a proper error check for id < 0 so the function returns instead of trying to map CPU#(-EINVAL). Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Dou Liyang Cc: Gu Zheng Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tang Chen Cc: Zhu Guihua Cc: akpm@linux-foundation.org Cc: chen.tang@easystack.cn Cc: cl@linux.com Cc: gongzhaogang@inspur.com Cc: isimatu.yasuaki@jp.fujitsu.com Cc: izumi.taku@jp.fujitsu.com Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: len.brown@intel.com Cc: lenb@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-mm@kvack.org Cc: mika.j.penttila@gmail.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: tj@kernel.org Cc: yasu.isimatu@gmail.com Fixes: dc6db24d2476 ("x86/acpi: Set persistent cpuid <-> nodeid mapping when booting") Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1609231705570.5640@nanos Signed-off-by: Ingo Molnar --- drivers/acpi/processor_core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 9ac265f235b7..5c78ee1860b0 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -284,7 +284,7 @@ EXPORT_SYMBOL_GPL(acpi_get_cpuid); static bool __init map_processor(acpi_handle handle, phys_cpuid_t *phys_id, int *cpuid) { - int type; + int type, id; u32 acpi_id; acpi_status status; acpi_object_type acpi_type; @@ -320,10 +320,11 @@ map_processor(acpi_handle handle, phys_cpuid_t *phys_id, int *cpuid) type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0; *phys_id = __acpi_get_phys_id(handle, type, acpi_id, false); - *cpuid = acpi_map_cpuid(*phys_id, acpi_id); - if (*cpuid == -1) - return false; + id = acpi_map_cpuid(*phys_id, acpi_id); + if (id < 0) + return false; + *cpuid = id; return true; } From c8712c6a674e3382fe4d26d108251ccfa55d08e0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2016 10:25:48 -0600 Subject: [PATCH 533/538] blk-mq: skip unmapped queues in blk_mq_alloc_request_hctx This provides the caller a feedback that a given hctx is not mapped and thus no command can be sent on it. Signed-off-by: Christoph Hellwig Tested-by: Steve Wise Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 13f5a6c1de76..c207fa9870eb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -296,17 +296,29 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, if (ret) return ERR_PTR(ret); + /* + * Check if the hardware context is actually mapped to anything. + * If not tell the caller that it should skip this queue. 
+ */ hctx = q->queue_hw_ctx[hctx_idx]; + if (!blk_mq_hw_queue_mapped(hctx)) { + ret = -EXDEV; + goto out_queue_exit; + } ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw, 0); if (!rq) { - blk_queue_exit(q); - return ERR_PTR(-EWOULDBLOCK); + ret = -EWOULDBLOCK; + goto out_queue_exit; } return rq; + +out_queue_exit: + blk_queue_exit(q); + return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); From 317c2ce77d8ab73c24f4fb9c75e5bb441fbe3e30 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 23 Sep 2016 16:49:39 -0500 Subject: [PATCH 534/538] x86/alternatives: Add stack frame dependency to alternative_call_2() Linus reported the following objtool warning: kernel/signal.o: warning: objtool: .altinstr_replacement+0x54: call without frame pointer save/setup The warning is valid. It's caused by the fact that gcc placed the call instruction in alternative_call_2()'s inline asm before the frame pointer setup, which breaks frame pointer convention and can result in a bad stack trace. Force a stack frame to be created before the call instruction by listing the stack pointer as an output operand in the inline asm statement. Reported-and-tested-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160923214939.j5o7c67nhepzmh3t@treble Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e77a6443104f..1b020381ab38 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -217,10 +217,14 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ output, input...) \ +{ \ + register void *__sp asm(_ASM_SP); \ asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ "call %P[new2]", feature2) \ - : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ - [new2] "i" (newfunc2), ## input) + : output, "+r" (__sp) \ + : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ + [new2] "i" (newfunc2), ## input); \ +} /* * use this macro(s) if you need more than one output parameter From 907241dccb4ce5d9413cf3c030b32b0cfc184914 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 23 Sep 2016 18:24:07 +0100 Subject: [PATCH 535/538] thread_info: Use unsigned long for flags The generic THREAD_INFO_IN_TASK definition of thread_info::flags is a u32, matching x86 prior to the introduction of THREAD_INFO_IN_TASK. However, common helpers like test_ti_thread_flag() implicitly assume that thread_info::flags has at least the size and alignment of unsigned long, and relying on padding and alignment provided by other elements of task_struct is somewhat fragile. Additionally, some architectures use more than 32 bits for thread_info::flags, and others may need to in future. With THREAD_INFO_IN_TASK, task struct follows thread_info with a long field, and thus we no longer save any space as we did back in commit: affa219b60a11b32 ("x86: change thread_info's flag field back to 32 bits") Given all this, it makes more sense for the generic thread_info::flags to be an unsigned long.
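For context, a simplified sketch of where that assumption comes from: test_ti_thread_flag() boils down to test_bit(), whose bitmap argument is an array of unsigned long. The snippet below only approximates the shape of the generic helper; it is not a verbatim copy of the kernel header.

    #include <linux/bitops.h>
    #include <linux/thread_info.h>

    /* Approximate shape of the generic helper: the flags word is handed to
     * the unsigned-long-based bitops, hence the size/alignment assumption. */
    static inline int example_test_ti_thread_flag(struct thread_info *ti, int flag)
    {
            return test_bit(flag, (unsigned long *)&ti->flags);
    }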
In fact, given that the generic header contains/uses the helpers mentioned above, BE arches *must* use unsigned long (or something of the same size) today, or they wouldn't work. Make it so. Signed-off-by: Mark Rutland Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1474651447-30447-1-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- include/linux/thread_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e2d0fd81b1ba..45f004e9cc59 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -15,7 +15,7 @@ struct compat_timespec; #ifdef CONFIG_THREAD_INFO_IN_TASK struct thread_info { - u32 flags; /* low level flags */ + unsigned long flags; /* low level flags */ }; #define INIT_THREAD_INFO(tsk) \ From 65f7422288cd19c4a9202c4ac7a9cca673b9b2ea Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 24 Sep 2016 11:48:13 +0000 Subject: [PATCH 536/538] x86/platform/mellanox: Fix return value check in mlxplat_init() In case of error, the function platform_device_register_simple() returns ERR_PTR() and never returns NULL. The NULL test in the return value check must therefore be replaced with IS_ERR(). Signed-off-by: Wei Yongjun Acked-by: Vadim Pasternak Cc: platform-driver-x86@vger.kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/platform/mellanox/mlx-platform.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/platform/mellanox/mlx-platform.c b/arch/x86/platform/mellanox/mlx-platform.c index e1dc1526b5f8..7dcfcca97399 100644 --- a/arch/x86/platform/mellanox/mlx-platform.c +++ b/arch/x86/platform/mellanox/mlx-platform.c @@ -200,8 +200,8 @@ static int __init mlxplat_init(void) mlxplat_lpc_resources, ARRAY_SIZE(mlxplat_lpc_resources)); - if (!mlxplat_dev) - return -ENOMEM; + if (IS_ERR(mlxplat_dev)) + return PTR_ERR(mlxplat_dev); priv = devm_kzalloc(&mlxplat_dev->dev, sizeof(struct mlxplat_priv), GFP_KERNEL); From bba6a49f24d73b2fcc653773c825fcfcd443a137 Mon Sep 17 00:00:00 2001 From: hiyo Date: Mon, 26 Sep 2016 21:31:13 +0900 Subject: [PATCH 537/538] test Signed-off-by: Wookje Kwon --- tools/perf/Documentation/tips.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt index 5950b5a24efd..2e6a66c222c1 100644 --- a/tools/perf/Documentation/tips.txt +++ b/tools/perf/Documentation/tips.txt @@ -28,3 +28,4 @@ To change sampling frequency to 100 Hz: perf record -F 100 See assembly instructions with percentage: perf annotate If you prefer Intel style assembly, try: perf annotate -M intel For hierarchical output, try: perf report --hierarchy +wookje kwon From 83bd5d44e8da7710987296e1c473bd60c407267b Mon Sep 17 00:00:00 2001 From: Wookje Kwon Date: Mon, 26 Sep 2016 22:38:23 +0900 Subject: [PATCH 538/538] perf config: fix bug in parsing 'man..*' config To add a new man viewer, configs like 'man..cmd' and 'man..path' can be set in the config file (~/.perfconfig). But parsing the config file stops because the config variable contains a '.' character, i.e. if a setting like 'man.xman.cmd' is added to the config file, [man] gman.cmd = gman then an error message like the one below is printed when launching perf: Fatal: bad config file line 11 in /home/taeung/.perfconfig So modify the iskeychar() function to treat the '.' character as a key character when parsing the config file.
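For illustration, a standalone user-space program showing the effect of the one-character change (the iskeychar() body below is the patched version from this commit; the key name "gman.cmd" is the example used in the message above):

    #include <ctype.h>
    #include <stdio.h>

    /* Same per-character check as the patched perf helper. */
    static int iskeychar(int c)
    {
            return isalnum(c) || c == '-' || c == '_' || c == '.';
    }

    int main(void)
    {
            const char *key = "gman.cmd";   /* a key under the [man] section */
            const char *p;
            int ok = 1;

            for (p = key; *p; p++)
                    if (!iskeychar((unsigned char)*p))
                            ok = 0;

            printf("%s %s\n", key, ok ? "is accepted as a key" : "is rejected");
            return 0;
    }

Without the added '.' case, the dot in "gman.cmd" fails the check and the key parser aborts with the "bad config file line" error quoted above.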
Acked-by: Namhyung Kim Cc: Jiri Olsa Signed-off-by: Wookje Kwon diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 5c20d78..002e416 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -122,7 +122,7 @@ static char *parse_value(void) static inline int iskeychar(int c) { - return isalnum(c) || c == '-' || c == '_'; + return isalnum(c) || c == '-' || c == '_' || c == '.'; } static int get_value(config_fn_t fn, void *data, char *name, unsigned int len) --- tools/perf/util/config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 18dae745034f..f5c3882a1149 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -124,7 +124,7 @@ static char *parse_value(void) static inline int iskeychar(int c) { - return isalnum(c) || c == '-' || c == '_'; + return isalnum(c) || c == '-' || c == '_' || c == '.'; } static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)