aboutsummaryrefslogtreecommitdiff
path: root/drivers/cpufreq/cpufreq_governor.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/cpufreq/cpufreq_governor.c')
-rw-r--r--drivers/cpufreq/cpufreq_governor.c169
1 files changed, 96 insertions, 73 deletions
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index dc9b72e25c1..1b44496b2d2 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -16,28 +16,12 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <asm/cputime.h>
-#include <linux/cpufreq.h>
-#include <linux/cpumask.h>
#include <linux/export.h>
#include <linux/kernel_stat.h>
-#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/tick.h>
-#include <linux/types.h>
-#include <linux/workqueue.h>
-#include <linux/cpu.h>
#include "cpufreq_governor.h"
-static struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
-{
- if (have_governor_per_policy())
- return &policy->kobj;
- else
- return cpufreq_global_kobject;
-}
-
static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data)
{
if (have_governor_per_policy())
@@ -46,59 +30,39 @@ static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data)
return dbs_data->cdata->attr_group_gov_sys;
}
-static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
-{
- u64 idle_time;
- u64 cur_wall_time;
- u64 busy_time;
-
- cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
-
- busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
- busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
- busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
- busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
- busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
- busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
-
- idle_time = cur_wall_time - busy_time;
- if (wall)
- *wall = cputime_to_usecs(cur_wall_time);
-
- return cputime_to_usecs(idle_time);
-}
-
-u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy)
-{
- u64 idle_time = get_cpu_idle_time_us(cpu, io_busy ? wall : NULL);
-
- if (idle_time == -1ULL)
- return get_cpu_idle_time_jiffy(cpu, wall);
- else if (!io_busy)
- idle_time += get_cpu_iowait_time_us(cpu, wall);
-
- return idle_time;
-}
-EXPORT_SYMBOL_GPL(get_cpu_idle_time);
-
void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
{
struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
struct cpufreq_policy *policy;
+ unsigned int sampling_rate;
unsigned int max_load = 0;
unsigned int ignore_nice;
unsigned int j;
- if (dbs_data->cdata->governor == GOV_ONDEMAND)
- ignore_nice = od_tuners->ignore_nice;
- else
- ignore_nice = cs_tuners->ignore_nice;
+ if (dbs_data->cdata->governor == GOV_ONDEMAND) {
+ struct od_cpu_dbs_info_s *od_dbs_info =
+ dbs_data->cdata->get_cpu_dbs_info_s(cpu);
+
+ /*
+ * Sometimes, the ondemand governor uses an additional
+ * multiplier to give long delays. So apply this multiplier to
+ * the 'sampling_rate', so as to keep the wake-up-from-idle
+ * detection logic a bit conservative.
+ */
+ sampling_rate = od_tuners->sampling_rate;
+ sampling_rate *= od_dbs_info->rate_mult;
+
+ ignore_nice = od_tuners->ignore_nice_load;
+ } else {
+ sampling_rate = cs_tuners->sampling_rate;
+ ignore_nice = cs_tuners->ignore_nice_load;
+ }
policy = cdbs->cur_policy;
- /* Get Absolute Load (in terms of freq for ondemand gov) */
+ /* Get Absolute Load */
for_each_cpu(j, policy->cpus) {
struct cpu_dbs_common_info *j_cdbs;
u64 cur_wall_time, cur_idle_time;
@@ -147,14 +111,45 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
if (unlikely(!wall_time || wall_time < idle_time))
continue;
- load = 100 * (wall_time - idle_time) / wall_time;
-
- if (dbs_data->cdata->governor == GOV_ONDEMAND) {
- int freq_avg = __cpufreq_driver_getavg(policy, j);
- if (freq_avg <= 0)
- freq_avg = policy->cur;
+ /*
+ * If the CPU had gone completely idle, and a task just woke up
+ * on this CPU now, it would be unfair to calculate 'load' the
+ * usual way for this elapsed time-window, because it will show
+ * near-zero load, irrespective of how CPU intensive that task
+ * actually is. This is undesirable for latency-sensitive bursty
+ * workloads.
+ *
+ * To avoid this, we reuse the 'load' from the previous
+ * time-window and give this task a chance to start with a
+ * reasonably high CPU frequency. (However, we shouldn't over-do
+ * this copy, lest we get stuck at a high load (high frequency)
+ * for too long, even when the current system load has actually
+ * dropped down. So we perform the copy only once, upon the
+ * first wake-up from idle.)
+ *
+ * Detecting this situation is easy: the governor's deferrable
+ * timer would not have fired during CPU-idle periods. Hence
+ * an unusually large 'wall_time' (as compared to the sampling
+ * rate) indicates this scenario.
+ *
+ * prev_load can be zero in two cases and we must recalculate it
+ * for both cases:
+ * - during long idle intervals
+ * - explicitly set to zero
+ */
+ if (unlikely(wall_time > (2 * sampling_rate) &&
+ j_cdbs->prev_load)) {
+ load = j_cdbs->prev_load;
- load *= freq_avg;
+ /*
+ * Perform a destructive copy, to ensure that we copy
+ * the previous load only once, upon the first wake-up
+ * from idle.
+ */
+ j_cdbs->prev_load = 0;
+ } else {
+ load = 100 * (wall_time - idle_time) / wall_time;
+ j_cdbs->prev_load = load;
}
if (load > max_load)
@@ -178,14 +173,26 @@ void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
{
int i;
+ mutex_lock(&cpufreq_governor_lock);
+ if (!policy->governor_enabled)
+ goto out_unlock;
+
if (!all_cpus) {
- __gov_queue_work(smp_processor_id(), dbs_data, delay);
+ /*
+ * Use raw_smp_processor_id() to avoid preemptible warnings.
+ * We know that this is only called with all_cpus == false from
+ * works that have been queued with *_work_on() functions and
+ * those works are canceled during CPU_DOWN_PREPARE so they
+ * can't possibly run on any other CPU.
+ */
+ __gov_queue_work(raw_smp_processor_id(), dbs_data, delay);
} else {
- get_online_cpus();
for_each_cpu(i, policy->cpus)
__gov_queue_work(i, dbs_data, delay);
- put_online_cpus();
}
+
+out_unlock:
+ mutex_unlock(&cpufreq_governor_lock);
}
EXPORT_SYMBOL_GPL(gov_queue_work);
@@ -278,6 +285,9 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
return rc;
}
+ if (!have_governor_per_policy())
+ WARN_ON(cpufreq_get_global_kobject());
+
rc = sysfs_create_group(get_governor_parent_kobj(policy),
get_sysfs_attr(dbs_data));
if (rc) {
@@ -288,7 +298,7 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
policy->governor_data = dbs_data;
- /* policy latency is in nS. Convert it to uS first */
+ /* policy latency is in ns. Convert it to us first */
latency = policy->cpuinfo.transition_latency / 1000;
if (latency == 0)
latency = 1;
@@ -316,6 +326,9 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
sysfs_remove_group(get_governor_parent_kobj(policy),
get_sysfs_attr(dbs_data));
+ if (!have_governor_per_policy())
+ cpufreq_put_global_kobject();
+
if ((dbs_data->cdata->governor == GOV_CONSERVATIVE) &&
(policy->governor->initialized == 1)) {
struct cs_ops *cs_ops = dbs_data->cdata->gov_ops;
@@ -339,12 +352,12 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
cs_tuners = dbs_data->tuners;
cs_dbs_info = dbs_data->cdata->get_cpu_dbs_info_s(cpu);
sampling_rate = cs_tuners->sampling_rate;
- ignore_nice = cs_tuners->ignore_nice;
+ ignore_nice = cs_tuners->ignore_nice_load;
} else {
od_tuners = dbs_data->tuners;
od_dbs_info = dbs_data->cdata->get_cpu_dbs_info_s(cpu);
sampling_rate = od_tuners->sampling_rate;
- ignore_nice = od_tuners->ignore_nice;
+ ignore_nice = od_tuners->ignore_nice_load;
od_ops = dbs_data->cdata->gov_ops;
io_busy = od_tuners->io_is_busy;
}
@@ -359,11 +372,18 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
for_each_cpu(j, policy->cpus) {
struct cpu_dbs_common_info *j_cdbs =
dbs_data->cdata->get_cpu_cdbs(j);
+ unsigned int prev_load;
j_cdbs->cpu = j;
j_cdbs->cur_policy = policy;
j_cdbs->prev_cpu_idle = get_cpu_idle_time(j,
&j_cdbs->prev_cpu_wall, io_busy);
+
+ prev_load = (unsigned int)
+ (j_cdbs->prev_cpu_wall - j_cdbs->prev_cpu_idle);
+ j_cdbs->prev_load = 100 * prev_load /
+ (unsigned int) j_cdbs->prev_cpu_wall;
+
if (ignore_nice)
j_cdbs->prev_cpu_nice =
kcpustat_cpu(j).cpustat[CPUTIME_NICE];
@@ -373,10 +393,6 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
dbs_data->cdata->gov_dbs_timer);
}
- /*
- * conservative does not implement micro like ondemand
- * governor, thus we are bound to jiffes/HZ
- */
if (dbs_data->cdata->governor == GOV_CONSERVATIVE) {
cs_dbs_info->down_skip = 0;
cs_dbs_info->enable = 1;
@@ -404,12 +420,18 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
mutex_lock(&dbs_data->mutex);
mutex_destroy(&cpu_cdbs->timer_mutex);
+ cpu_cdbs->cur_policy = NULL;
mutex_unlock(&dbs_data->mutex);
break;
case CPUFREQ_GOV_LIMITS:
+ mutex_lock(&dbs_data->mutex);
+ if (!cpu_cdbs->cur_policy) {
+ mutex_unlock(&dbs_data->mutex);
+ break;
+ }
mutex_lock(&cpu_cdbs->timer_mutex);
if (policy->max < cpu_cdbs->cur_policy->cur)
__cpufreq_driver_target(cpu_cdbs->cur_policy,
@@ -419,6 +441,7 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
policy->min, CPUFREQ_RELATION_L);
dbs_check_cpu(dbs_data, cpu);
mutex_unlock(&cpu_cdbs->timer_mutex);
+ mutex_unlock(&dbs_data->mutex);
break;
}
return 0;