Skip to content

Commit ac0da57

Browse files
author
Fox Snowpatch
committed
1 parent 85ff933 commit ac0da57

11 files changed

Lines changed: 383 additions & 70 deletions

File tree

arch/powerpc/include/asm/paravirt.h

Lines changed: 9 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,15 @@ static inline bool vcpu_is_preempted(int cpu)
145145
if (!is_shared_processor())
146146
return false;
147147

148+
#ifdef CONFIG_PPC_SPLPAR
149+
/*
150+
* Assume the target CPU is preempted if it is above the soft
151+
* entitlement limit
152+
*/
153+
if (!is_kvm_guest())
154+
return !cpu_active(cpu);
155+
#endif
156+
148157
/*
149158
* If the hypervisor has dispatched the target CPU on a physical
150159
* processor, then the target CPU is definitely not preempted.
@@ -159,59 +168,6 @@ static inline bool vcpu_is_preempted(int cpu)
159168
if (!is_vcpu_idle(cpu))
160169
return true;
161170

162-
#ifdef CONFIG_PPC_SPLPAR
163-
if (!is_kvm_guest()) {
164-
int first_cpu, i;
165-
166-
/*
167-
* The result of vcpu_is_preempted() is used in a
168-
* speculative way, and is always subject to invalidation
169-
* by events internal and external to Linux. While we can
170-
* be called in preemptable context (in the Linux sense),
171-
* we're not accessing per-cpu resources in a way that can
172-
* race destructively with Linux scheduler preemption and
173-
* migration, and callers can tolerate the potential for
174-
* error introduced by sampling the CPU index without
175-
* pinning the task to it. So it is permissible to use
176-
* raw_smp_processor_id() here to defeat the preempt debug
177-
* warnings that can arise from using smp_processor_id()
178-
* in arbitrary contexts.
179-
*/
180-
first_cpu = cpu_first_thread_sibling(raw_smp_processor_id());
181-
182-
/*
183-
* The PowerVM hypervisor dispatches VMs on a whole core
184-
* basis. So we know that a thread sibling of the executing CPU
185-
* cannot have been preempted by the hypervisor, even if it
186-
* has called H_CONFER, which will set the yield bit.
187-
*/
188-
if (cpu_first_thread_sibling(cpu) == first_cpu)
189-
return false;
190-
191-
/*
192-
* The specific target CPU was marked by guest OS as idle, but
193-
* then also check all other cpus in the core for PowerVM
194-
* because it does core scheduling and one of the vcpu
195-
* of the core getting preempted by hypervisor implies
196-
* other vcpus can also be considered preempted.
197-
*/
198-
first_cpu = cpu_first_thread_sibling(cpu);
199-
for (i = first_cpu; i < first_cpu + threads_per_core; i++) {
200-
if (i == cpu)
201-
continue;
202-
if (vcpu_is_dispatched(i))
203-
return false;
204-
if (!is_vcpu_idle(i))
205-
return true;
206-
}
207-
}
208-
#endif
209-
210-
/*
211-
* None of the threads in target CPU's core are running but none of
212-
* them were preempted too. Hence assume the target CPU to be
213-
* non-preempted.
214-
*/
215171
return false;
216172
}
217173

arch/powerpc/include/asm/smp.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ struct smp_ops_t {
6060
#ifdef CONFIG_HOTPLUG_CPU
6161
void (*cpu_offline_self)(void);
6262
#endif
63+
#ifdef CONFIG_PPC_SPLPAR
64+
unsigned int (*num_available_cores)(void);
65+
#endif
6366
};
6467

6568
extern struct task_struct *secondary_current;
@@ -266,6 +269,9 @@ extern char __secondary_hold;
266269
extern unsigned int booting_thread_hwid;
267270

268271
extern void __early_start(void);
272+
#ifdef CONFIG_PPC_SPLPAR
273+
int arch_update_cpu_topology(void);
274+
#endif /* CONFIG_PPC_SPLPAR */
269275
#endif /* __ASSEMBLER__ */
270276

271277
#endif /* __KERNEL__ */

arch/powerpc/include/asm/topology.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,5 +177,10 @@ static inline bool topology_is_core_online(unsigned int cpu)
177177
}
178178
#endif
179179

180+
#ifdef CONFIG_PPC_SPLPAR
181+
#define arch_scale_cpu_capacity arch_scale_cpu_capacity
182+
unsigned long arch_scale_cpu_capacity(int cpu);
183+
#endif
184+
180185
#endif /* __KERNEL__ */
181186
#endif /* _ASM_POWERPC_TOPOLOGY_H */

arch/powerpc/kernel/smp.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ bool has_big_cores __ro_after_init;
8282
bool coregroup_enabled __ro_after_init;
8383
bool thread_group_shares_l2 __ro_after_init;
8484
bool thread_group_shares_l3 __ro_after_init;
85+
#ifdef CONFIG_PPC_SPLPAR
86+
bool process_steal_enable __ro_after_init;
87+
#endif
8588

8689
DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
8790
DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
@@ -1755,6 +1758,16 @@ void __init smp_cpus_done(unsigned int max_cpus)
17551758

17561759
dump_numa_cpu_topology();
17571760
build_sched_topology();
1761+
1762+
#ifdef CONFIG_PPC_SPLPAR
1763+
if (smp_ops->num_available_cores)
1764+
smp_ops->num_available_cores();
1765+
1766+
if (is_shared_processor() && !is_kvm_guest())
1767+
process_steal_enable = true;
1768+
else
1769+
process_steal_enable = false;
1770+
#endif
17581771
}
17591772

17601773
/*
@@ -1821,3 +1834,28 @@ void __noreturn arch_cpu_idle_dead(void)
18211834
}
18221835

18231836
#endif
1837+
1838+
#ifdef CONFIG_PPC_SPLPAR
1839+
#define MIN_CAPACITY 1
1840+
1841+
/*
1842+
* Assume CPU capacity to be low if CPU number happens be above soft
1843+
* available limit. This forces load balancer to prefer higher capacity CPUs
1844+
*/
1845+
unsigned long arch_scale_cpu_capacity(int cpu)
1846+
{
1847+
if (is_shared_processor() && !is_kvm_guest()) {
1848+
if (!cpu_active(cpu))
1849+
return MIN_CAPACITY;
1850+
}
1851+
return SCHED_CAPACITY_SCALE;
1852+
}
1853+
1854+
int arch_update_cpu_topology(void)
1855+
{
1856+
if (is_shared_processor() && !is_kvm_guest())
1857+
return (num_online_cpus() != cpumask_weight(cpu_active_mask));
1858+
1859+
return 0;
1860+
}
1861+
#endif /* CONFIG_PPC_SPLPAR */

arch/powerpc/platforms/pseries/hotplug-cpu.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,9 @@ static int pseries_add_processor(struct device_node *np)
284284

285285
out:
286286
cpu_maps_update_done();
287+
#ifdef CONFIG_PPC_SPLPAR
288+
pseries_num_available_cores();
289+
#endif
287290
free_cpumask_var(cpu_mask);
288291
return rc;
289292
}
@@ -323,6 +326,9 @@ static void pseries_remove_processor(struct device_node *np)
323326
"with physical id 0x%x\n", thread);
324327
}
325328
cpu_maps_update_done();
329+
#ifdef CONFIG_PPC_SPLPAR
330+
pseries_num_available_cores();
331+
#endif
326332
}
327333

328334
static int dlpar_offline_cpu(struct device_node *dn)

arch/powerpc/platforms/pseries/lpar.c

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -659,18 +659,78 @@ static int __init vcpudispatch_stats_procfs_init(void)
659659
machine_device_initcall(pseries, vcpudispatch_stats_procfs_init);
660660

661661
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
662+
#define STEAL_MULTIPLE (STEAL_RATIO * STEAL_RATIO)
663+
664+
static u8 steal_interval = 1;
665+
666+
static bool should_cpu_process_steal(int cpu)
667+
{
668+
if (cpu == cpumask_first(cpu_online_mask))
669+
return true;
670+
671+
return false;
672+
}
673+
674+
extern bool process_steal_enable;
675+
static void process_steal(int cpu)
676+
{
677+
unsigned long steal_ratio, delta_tb, interval_tb;
678+
static unsigned long next_tb, prev_steal;
679+
unsigned long tb = mftb();
680+
unsigned long steal = 0;
681+
unsigned int i;
682+
683+
if (!process_steal_enable)
684+
return;
685+
686+
if (!should_cpu_process_steal(cpu))
687+
return;
688+
689+
if (tb < next_tb)
690+
return;
691+
692+
for_each_online_cpu(i) {
693+
struct lppaca *lppaca = &lppaca_of(i);
694+
695+
steal += be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb));
696+
steal += be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb));
697+
}
698+
699+
if (!steal_interval)
700+
steal_interval = 1;
701+
702+
interval_tb = steal_interval * tb_ticks_per_sec;
703+
if (next_tb && prev_steal) {
704+
delta_tb = max(tb - (next_tb - interval_tb), 1);
705+
steal_ratio = (steal - prev_steal) * STEAL_MULTIPLE;
706+
steal_ratio /= (delta_tb * num_online_cpus());
707+
trigger_softoffline(steal_ratio);
708+
}
709+
710+
next_tb = tb + interval_tb;
711+
prev_steal = steal;
712+
}
713+
662714
u64 pseries_paravirt_steal_clock(int cpu)
663715
{
664716
struct lppaca *lppaca = &lppaca_of(cpu);
717+
unsigned long steal;
718+
719+
steal = be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb));
720+
steal += be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb));
721+
722+
if (is_shared_processor() && !is_kvm_guest())
723+
process_steal(cpu);
665724

666725
/*
667726
* VPA steal time counters are reported at TB frequency. Hence do a
668-
* conversion to ns before returning
727+
* conversion to ns before using.
669728
*/
670-
return tb_to_ns(be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb)) +
671-
be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb)));
729+
steal = tb_to_ns(steal);
730+
731+
return steal;
672732
}
673-
#endif
733+
#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
674734

675735
#endif /* CONFIG_PPC_SPLPAR */
676736

@@ -2025,6 +2085,9 @@ static int __init vpa_debugfs_init(void)
20252085
debugfs_create_file(name, 0400, vpa_dir, (void *)i, &vpa_fops);
20262086
}
20272087

2088+
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
2089+
debugfs_create_u8("steal_interval_secs", 0600, arch_debugfs_dir, &steal_interval);
2090+
#endif
20282091
return 0;
20292092
}
20302093
machine_arch_initcall(pseries, vpa_debugfs_init);

arch/powerpc/platforms/pseries/pseries.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ void pSeries_machine_check_log_err(void);
2626
#ifdef CONFIG_SMP
2727
extern void smp_init_pseries(void);
2828

29+
#ifdef CONFIG_PPC_SPLPAR
30+
#define STEAL_RATIO 100
31+
#endif
32+
2933
/* Get state of physical CPU from query_cpu_stopped */
3034
int smp_query_cpu_stopped(unsigned int pcpu);
3135
#define QCSS_STOPPED 0
@@ -115,6 +119,10 @@ int dlpar_workqueue_init(void);
115119

116120
extern u32 pseries_security_flavor;
117121
void pseries_setup_security_mitigations(void);
122+
#ifdef CONFIG_PPC_SPLPAR
123+
void trigger_softoffline(unsigned long steal_ratio);
124+
unsigned int pseries_num_available_cores(void);
125+
#endif
118126

119127
#ifdef CONFIG_PPC_64S_HASH_MMU
120128
void pseries_lpar_read_hblkrm_characteristics(void);

0 commit comments

Comments
 (0)