Skip to content

Commit e97ce89

Browse files
committed
Merge branches 'pm-cpuidle' and 'pm-em' into linux-next
* pm-cpuidle:
  cpuidle: psci: Avoid initializing faux device if no DT idle states are present
  Documentation: ABI: testing: document the new cpuidle sysfs file
  Documentation: admin-guide: pm: Document intel_idle C1 demotion
  intel_idle: Add C1 demotion on/off sysfs knob
  cpuidle: psci: Transition to the faux device interface
  cpuidle: menu: Optimize bucket assignment when next_timer_ns equals KTIME_MAX
  cpuidle: teo: Fix typos in two comments

* pm-em:
  PM: EM: Documentation: fix typo in energy-model.rst
  PM: EM: Fix potential division-by-zero error in em_compute_costs()
3 parents 5fd1808 + 5836ebe + 7330e00 commit e97ce89

8 files changed

Lines changed: 164 additions & 21 deletions

File tree

Documentation/ABI/testing/sysfs-devices-system-cpu

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ What: /sys/devices/system/cpu/cpuidle/available_governors
111111
/sys/devices/system/cpu/cpuidle/current_driver
112112
/sys/devices/system/cpu/cpuidle/current_governor
113113
/sys/devices/system/cpu/cpuidle/current_governor_ro
114+
/sys/devices/system/cpu/cpuidle/intel_c1_demotion
114115
Date: September 2007
115116
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
116117
Description: Discover cpuidle policy and mechanism
@@ -132,7 +133,11 @@ Description: Discover cpuidle policy and mechanism
132133

133134
current_governor_ro: (RO) displays current idle policy.
134135

135-
See Documentation/admin-guide/pm/cpuidle.rst and
136+
intel_c1_demotion: (RW) enables/disables the C1 demotion
137+
feature on Intel CPUs.
138+
139+
See Documentation/admin-guide/pm/cpuidle.rst,
140+
Documentation/admin-guide/pm/intel_idle.rst, and
136141
Documentation/driver-api/pm/cpuidle.rst for more information.
137142

138143

Documentation/admin-guide/pm/intel_idle.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,27 @@ instruction at all.
3838
only way to pass early-configuration-time parameters to it is via the kernel
3939
command line.
4040

41+
Sysfs Interface
42+
===============
43+
44+
The ``intel_idle`` driver exposes the following ``sysfs`` attributes in
45+
``/sys/devices/system/cpu/cpuidle/``:
46+
47+
``intel_c1_demotion``
48+
Enable or disable C1 demotion for all CPUs in the system. This file is
49+
only exposed on platforms that support the C1 demotion feature and where
50+
it was tested. Value 0 means that C1 demotion is disabled, value 1 means
51+
that it is enabled. Write 0 or 1 to disable or enable C1 demotion for
52+
all CPUs.
53+
54+
The C1 demotion feature involves the platform firmware demoting deep
55+
C-state requests from the OS (e.g., C6 requests) to C1. The idea is that
56+
firmware monitors CPU wake-up rate, and if it is higher than a
57+
platform-specific threshold, the firmware demotes deep C-state requests
58+
to C1. For example, Linux requests C6, but firmware notices too many
59+
wake-ups per second, and it keeps the CPU in C1. When the CPU stays in
60+
C1 long enough, the platform promotes it back to C6. This may improve
61+
some workloads' performance, but it may also increase power consumption.
4162

4263
.. _intel-idle-enumeration-of-states:
4364

Documentation/power/energy-model.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ Drivers must provide a pointer to the allocated and initialized new EM
230230
and will be visible to other sub-systems in the kernel (thermal, powercap).
231231
The main design goal for this API is to be fast and avoid extra calculations
232232
or memory allocations at runtime. When pre-computed EMs are available in the
233-
device driver, than it should be possible to simply re-use them with low
233+
device driver, then it should be possible to simply reuse them with low
234234
performance overhead.
235235

236236
In order to free the EM, provided earlier by the driver (e.g. when the module

drivers/cpuidle/cpuidle-psci.c

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include <linux/kernel.h>
1717
#include <linux/module.h>
1818
#include <linux/of.h>
19-
#include <linux/platform_device.h>
19+
#include <linux/device/faux.h>
2020
#include <linux/psci.h>
2121
#include <linux/pm_domain.h>
2222
#include <linux/pm_runtime.h>
@@ -407,14 +407,14 @@ static int psci_idle_init_cpu(struct device *dev, int cpu)
407407
* to register cpuidle driver then rollback to cancel all CPUs
408408
* registration.
409409
*/
410-
static int psci_cpuidle_probe(struct platform_device *pdev)
410+
static int psci_cpuidle_probe(struct faux_device *fdev)
411411
{
412412
int cpu, ret;
413413
struct cpuidle_driver *drv;
414414
struct cpuidle_device *dev;
415415

416416
for_each_present_cpu(cpu) {
417-
ret = psci_idle_init_cpu(&pdev->dev, cpu);
417+
ret = psci_idle_init_cpu(&fdev->dev, cpu);
418418
if (ret)
419419
goto out_fail;
420420
}
@@ -434,26 +434,37 @@ static int psci_cpuidle_probe(struct platform_device *pdev)
434434
return ret;
435435
}
436436

437-
static struct platform_driver psci_cpuidle_driver = {
437+
static struct faux_device_ops psci_cpuidle_ops = {
438438
.probe = psci_cpuidle_probe,
439-
.driver = {
440-
.name = "psci-cpuidle",
441-
},
442439
};
443440

441+
/*
 * dt_idle_state_present() - check whether DT describes a PSCI idle state.
 *
 * Looks up the device node of the first possible CPU, then its idle state
 * node at index 0, and reports whether that state node matches
 * psci_idle_state_match.
 *
 * Each __free(device_node) variable is declared at the point where it is
 * initialized. The scope-exit cleanup runs on every return path, so a
 * variable declared up front but assigned later would be handed to the
 * cleanup with an indeterminate value whenever an earlier check bails out.
 *
 * Return: true if a matching idle state node is found, false otherwise.
 */
static bool __init dt_idle_state_present(void)
{
	struct device_node *cpu_node __free(device_node) =
		of_cpu_device_node_get(cpumask_first(cpu_possible_mask));
	if (!cpu_node)
		return false;

	struct device_node *state_node __free(device_node) =
		of_get_cpu_state_node(cpu_node, 0);
	if (!state_node)
		return false;

	return !!of_match_node(psci_idle_state_match, state_node);
}
456+
444457
static int __init psci_idle_init(void)
445458
{
446-
struct platform_device *pdev;
447-
int ret;
459+
struct faux_device *fdev;
448460

449-
ret = platform_driver_register(&psci_cpuidle_driver);
450-
if (ret)
451-
return ret;
461+
if (!dt_idle_state_present())
462+
return 0;
452463

453-
pdev = platform_device_register_simple("psci-cpuidle", -1, NULL, 0);
454-
if (IS_ERR(pdev)) {
455-
platform_driver_unregister(&psci_cpuidle_driver);
456-
return PTR_ERR(pdev);
464+
fdev = faux_device_create("psci-cpuidle", NULL, &psci_cpuidle_ops);
465+
if (!fdev) {
466+
pr_err("Failed to create psci-cpuidle device\n");
467+
return -ENODEV;
457468
}
458469

459470
return 0;

drivers/cpuidle/governors/menu.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
255255
*/
256256
data->next_timer_ns = KTIME_MAX;
257257
delta_tick = TICK_NSEC / 2;
258-
data->bucket = which_bucket(KTIME_MAX);
258+
data->bucket = BUCKETS - 1;
259259
}
260260

261261
if (unlikely(drv->state_count <= 1 || latency_req == 0) ||

drivers/cpuidle/governors/teo.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
*
2020
* Of course, non-timer wakeup sources are more important in some use cases,
2121
* but even then it is generally unnecessary to consider idle duration values
22-
* greater than the time time till the next timer event, referred as the sleep
22+
* greater than the time till the next timer event, referred to as the sleep
2323
* length in what follows, because the closest timer will ultimately wake up the
2424
* CPU anyway unless it is woken up earlier.
2525
*
@@ -311,7 +311,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
311311
struct cpuidle_state *s = &drv->states[i];
312312

313313
/*
314-
* Update the sums of idle state mertics for all of the states
314+
* Update the sums of idle state metrics for all of the states
315315
* shallower than the current one.
316316
*/
317317
intercept_sum += prev_bin->intercepts;

drivers/idle/intel_idle.c

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,11 @@
4848
#include <trace/events/power.h>
4949
#include <linux/sched.h>
5050
#include <linux/sched/smt.h>
51+
#include <linux/mutex.h>
5152
#include <linux/notifier.h>
5253
#include <linux/cpu.h>
5354
#include <linux/moduleparam.h>
55+
#include <linux/sysfs.h>
5456
#include <asm/cpuid.h>
5557
#include <asm/cpu_device_id.h>
5658
#include <asm/intel-family.h>
@@ -92,9 +94,15 @@ struct idle_cpu {
9294
*/
9395
unsigned long auto_demotion_disable_flags;
9496
bool disable_promotion_to_c1e;
97+
bool c1_demotion_supported;
9598
bool use_acpi;
9699
};
97100

101+
static bool c1_demotion_supported;
102+
static DEFINE_MUTEX(c1_demotion_mutex);
103+
104+
static struct device *sysfs_root __initdata;
105+
98106
static const struct idle_cpu *icpu __initdata;
99107
static struct cpuidle_state *cpuidle_state_table __initdata;
100108

@@ -1549,18 +1557,21 @@ static const struct idle_cpu idle_cpu_gmt __initconst = {
15491557
/*
 * idle_cpu description backed by spr_cstates. Per intel_idle_init(), the
 * flags below disable C1E promotion, mark the C1 demotion sysfs knob as
 * supported, and request extraction of the ACPI _CST data.
 */
static const struct idle_cpu idle_cpu_spr __initconst = {
15501558
.state_table = spr_cstates,
15511559
.disable_promotion_to_c1e = true,
1560+
.c1_demotion_supported = true,
15521561
.use_acpi = true,
15531562
};
15541563

15551564
/*
 * idle_cpu description backed by gnr_cstates. Per intel_idle_init(), the
 * flags below disable C1E promotion, mark the C1 demotion sysfs knob as
 * supported, and request extraction of the ACPI _CST data.
 */
static const struct idle_cpu idle_cpu_gnr __initconst = {
15561565
.state_table = gnr_cstates,
15571566
.disable_promotion_to_c1e = true,
1567+
.c1_demotion_supported = true,
15581568
.use_acpi = true,
15591569
};
15601570

15611571
/*
 * idle_cpu description backed by gnrd_cstates. Per intel_idle_init(), the
 * flags below disable C1E promotion, mark the C1 demotion sysfs knob as
 * supported, and request extraction of the ACPI _CST data.
 */
static const struct idle_cpu idle_cpu_gnrd __initconst = {
15621572
.state_table = gnrd_cstates,
15631573
.disable_promotion_to_c1e = true,
1574+
.c1_demotion_supported = true,
15641575
.use_acpi = true,
15651576
};
15661577

@@ -1599,12 +1610,14 @@ static const struct idle_cpu idle_cpu_snr __initconst = {
15991610
/*
 * idle_cpu description backed by grr_cstates. Per intel_idle_init(), the
 * flags below disable C1E promotion, mark the C1 demotion sysfs knob as
 * supported, and request extraction of the ACPI _CST data.
 */
static const struct idle_cpu idle_cpu_grr __initconst = {
16001611
.state_table = grr_cstates,
16011612
.disable_promotion_to_c1e = true,
1613+
.c1_demotion_supported = true,
16021614
.use_acpi = true,
16031615
};
16041616

16051617
/*
 * idle_cpu description backed by srf_cstates. Per intel_idle_init(), the
 * flags below disable C1E promotion, mark the C1 demotion sysfs knob as
 * supported, and request extraction of the ACPI _CST data.
 */
static const struct idle_cpu idle_cpu_srf __initconst = {
16061618
.state_table = srf_cstates,
16071619
.disable_promotion_to_c1e = true,
1620+
.c1_demotion_supported = true,
16081621
.use_acpi = true,
16091622
};
16101623

@@ -2324,6 +2337,88 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
23242337
cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
23252338
}
23262339

2340+
static void intel_c1_demotion_toggle(void *enable)
2341+
{
2342+
unsigned long long msr_val;
2343+
2344+
rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
2345+
/*
2346+
* Enable/disable C1 undemotion along with C1 demotion, as this is the
2347+
* most sensible configuration in general.
2348+
*/
2349+
if (enable)
2350+
msr_val |= NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE;
2351+
else
2352+
msr_val &= ~(NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE);
2353+
wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
2354+
}
2355+
2356+
/*
 * sysfs write handler for the intel_c1_demotion attribute.
 *
 * Parses @buf as a boolean with kstrtobool() and applies the requested
 * setting on every CPU via intel_c1_demotion_toggle(). Updates are
 * serialized by c1_demotion_mutex.
 *
 * Return: @count on success, or the kstrtobool() error code if the input
 * could not be parsed.
 */
static ssize_t intel_c1_demotion_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	bool on;
	int ret = kstrtobool(buf, &on);

	if (ret)
		return ret;

	/* Apply the new setting on all CPUs and wait for completion. */
	mutex_lock(&c1_demotion_mutex);
	on_each_cpu(intel_c1_demotion_toggle, (void *)on, 1);
	mutex_unlock(&c1_demotion_mutex);

	return count;
}
2374+
2375+
static ssize_t intel_c1_demotion_show(struct device *dev,
2376+
struct device_attribute *attr, char *buf)
2377+
{
2378+
unsigned long long msr_val;
2379+
2380+
/*
2381+
* Read the MSR value for a CPU and assume it is the same for all CPUs. Any other
2382+
* configuration would be a BIOS bug.
2383+
*/
2384+
rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
2385+
return sysfs_emit(buf, "%d\n", !!(msr_val & NHM_C1_AUTO_DEMOTE));
2386+
}
2387+
static DEVICE_ATTR_RW(intel_c1_demotion);
2388+
2389+
/*
 * Create the intel_c1_demotion attribute in the "cpuidle" group of the CPU
 * subsystem root device.
 *
 * Does nothing when C1 demotion is not supported or when the CPU subsystem
 * has no root device. On success the reference taken on the root device is
 * kept in sysfs_root so that intel_idle_sysfs_uninit() can undo the setup.
 *
 * Return: 0 on success or when there is nothing to do, otherwise the error
 * returned by sysfs_add_file_to_group().
 */
static int __init intel_idle_sysfs_init(void)
{
	int err;

	if (!c1_demotion_supported)
		return 0;

	sysfs_root = bus_get_dev_root(&cpu_subsys);
	if (!sysfs_root)
		return 0;

	err = sysfs_add_file_to_group(&sysfs_root->kobj,
				      &dev_attr_intel_c1_demotion.attr,
				      "cpuidle");
	if (err) {
		put_device(sysfs_root);
		/*
		 * The caller treats this failure as non-fatal and may still
		 * call intel_idle_sysfs_uninit() on a later error path.
		 * Clear the pointer so that the uninit helper does not remove
		 * a file that was never added or drop the reference twice.
		 */
		sysfs_root = NULL;
		return err;
	}

	return 0;
}
2410+
2411+
/*
 * Undo intel_idle_sysfs_init(): remove the intel_c1_demotion attribute from
 * the "cpuidle" group and drop the reference held on the root device.
 * Nothing is done when sysfs_root is NULL.
 */
static void __init intel_idle_sysfs_uninit(void)
{
	if (sysfs_root) {
		sysfs_remove_file_from_group(&sysfs_root->kobj,
					     &dev_attr_intel_c1_demotion.attr,
					     "cpuidle");
		put_device(sysfs_root);
	}
}
2421+
23272422
static int __init intel_idle_init(void)
23282423
{
23292424
const struct x86_cpu_id *id;
@@ -2374,6 +2469,8 @@ static int __init intel_idle_init(void)
23742469
auto_demotion_disable_flags = icpu->auto_demotion_disable_flags;
23752470
if (icpu->disable_promotion_to_c1e)
23762471
c1e_promotion = C1E_PROMOTION_DISABLE;
2472+
if (icpu->c1_demotion_supported)
2473+
c1_demotion_supported = true;
23772474
if (icpu->use_acpi || force_use_acpi)
23782475
intel_idle_acpi_cst_extract();
23792476
} else if (!intel_idle_acpi_cst_extract()) {
@@ -2387,6 +2484,10 @@ static int __init intel_idle_init(void)
23872484
if (!intel_idle_cpuidle_devices)
23882485
return -ENOMEM;
23892486

2487+
retval = intel_idle_sysfs_init();
2488+
if (retval)
2489+
pr_warn("failed to initialized sysfs");
2490+
23902491
intel_idle_cpuidle_driver_init(&intel_idle_driver);
23912492

23922493
retval = cpuidle_register_driver(&intel_idle_driver);
@@ -2411,6 +2512,7 @@ static int __init intel_idle_init(void)
24112512
intel_idle_cpuidle_devices_uninit();
24122513
cpuidle_unregister_driver(&intel_idle_driver);
24132514
init_driver_fail:
2515+
intel_idle_sysfs_uninit();
24142516
free_percpu(intel_idle_cpuidle_devices);
24152517
return retval;
24162518

kernel/power/energy_model.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,10 @@ static int em_compute_costs(struct device *dev, struct em_perf_state *table,
233233
unsigned long prev_cost = ULONG_MAX;
234234
int i, ret;
235235

236+
/* This is needed only for CPUs and EAS, so skip other devices. */
237+
if (!_is_cpu_device(dev))
238+
return 0;
239+
236240
/* Compute the cost of each performance state. */
237241
for (i = nr_states - 1; i >= 0; i--) {
238242
unsigned long power_res, cost;

0 commit comments

Comments
 (0)