diff options
Diffstat (limited to 'drivers/cpufreq/cpufreq_governor.c')
| -rw-r--r-- | drivers/cpufreq/cpufreq_governor.c | 169 |
1 files changed, 96 insertions, 73 deletions
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index dc9b72e25c1..1b44496b2d2 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -16,28 +16,12 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <asm/cputime.h> -#include <linux/cpufreq.h> -#include <linux/cpumask.h> #include <linux/export.h> #include <linux/kernel_stat.h> -#include <linux/mutex.h> #include <linux/slab.h> -#include <linux/tick.h> -#include <linux/types.h> -#include <linux/workqueue.h> -#include <linux/cpu.h> #include "cpufreq_governor.h" -static struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy) -{ - if (have_governor_per_policy()) - return &policy->kobj; - else - return cpufreq_global_kobject; -} - static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data) { if (have_governor_per_policy()) @@ -46,59 +30,39 @@ static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data) return dbs_data->cdata->attr_group_gov_sys; } -static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) -{ - u64 idle_time; - u64 cur_wall_time; - u64 busy_time; - - cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); - - busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; - - idle_time = cur_wall_time - busy_time; - if (wall) - *wall = cputime_to_usecs(cur_wall_time); - - return cputime_to_usecs(idle_time); -} - -u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy) -{ - u64 idle_time = get_cpu_idle_time_us(cpu, io_busy ? wall : NULL); - - if (idle_time == -1ULL) - return get_cpu_idle_time_jiffy(cpu, wall); - else if (!io_busy) - idle_time += get_cpu_iowait_time_us(cpu, wall); - - return idle_time; -} -EXPORT_SYMBOL_GPL(get_cpu_idle_time); - void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) { struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); struct od_dbs_tuners *od_tuners = dbs_data->tuners; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; struct cpufreq_policy *policy; + unsigned int sampling_rate; unsigned int max_load = 0; unsigned int ignore_nice; unsigned int j; - if (dbs_data->cdata->governor == GOV_ONDEMAND) - ignore_nice = od_tuners->ignore_nice; - else - ignore_nice = cs_tuners->ignore_nice; + if (dbs_data->cdata->governor == GOV_ONDEMAND) { + struct od_cpu_dbs_info_s *od_dbs_info = + dbs_data->cdata->get_cpu_dbs_info_s(cpu); + + /* + * Sometimes, the ondemand governor uses an additional + * multiplier to give long delays. So apply this multiplier to + * the 'sampling_rate', so as to keep the wake-up-from-idle + * detection logic a bit conservative. + */ + sampling_rate = od_tuners->sampling_rate; + sampling_rate *= od_dbs_info->rate_mult; + + ignore_nice = od_tuners->ignore_nice_load; + } else { + sampling_rate = cs_tuners->sampling_rate; + ignore_nice = cs_tuners->ignore_nice_load; + } policy = cdbs->cur_policy; - /* Get Absolute Load (in terms of freq for ondemand gov) */ + /* Get Absolute Load */ for_each_cpu(j, policy->cpus) { struct cpu_dbs_common_info *j_cdbs; u64 cur_wall_time, cur_idle_time; @@ -147,14 +111,45 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) if (unlikely(!wall_time || wall_time < idle_time)) continue; - load = 100 * (wall_time - idle_time) / wall_time; - - if (dbs_data->cdata->governor == GOV_ONDEMAND) { - int freq_avg = __cpufreq_driver_getavg(policy, j); - if (freq_avg <= 0) - freq_avg = policy->cur; + /* + * If the CPU had gone completely idle, and a task just woke up + * on this CPU now, it would be unfair to calculate 'load' the + * usual way for this elapsed time-window, because it will show + * near-zero load, irrespective of how CPU intensive that task + * actually is. This is undesirable for latency-sensitive bursty + * workloads. + * + * To avoid this, we reuse the 'load' from the previous + * time-window and give this task a chance to start with a + * reasonably high CPU frequency. (However, we shouldn't over-do + * this copy, lest we get stuck at a high load (high frequency) + * for too long, even when the current system load has actually + * dropped down. So we perform the copy only once, upon the + * first wake-up from idle.) + * + * Detecting this situation is easy: the governor's deferrable + * timer would not have fired during CPU-idle periods. Hence + * an unusually large 'wall_time' (as compared to the sampling + * rate) indicates this scenario. + * + * prev_load can be zero in two cases and we must recalculate it + * for both cases: + * - during long idle intervals + * - explicitly set to zero + */ + if (unlikely(wall_time > (2 * sampling_rate) && + j_cdbs->prev_load)) { + load = j_cdbs->prev_load; - load *= freq_avg; + /* + * Perform a destructive copy, to ensure that we copy + * the previous load only once, upon the first wake-up + * from idle. + */ + j_cdbs->prev_load = 0; + } else { + load = 100 * (wall_time - idle_time) / wall_time; + j_cdbs->prev_load = load; } if (load > max_load) @@ -178,14 +173,26 @@ void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy, { int i; + mutex_lock(&cpufreq_governor_lock); + if (!policy->governor_enabled) + goto out_unlock; + if (!all_cpus) { - __gov_queue_work(smp_processor_id(), dbs_data, delay); + /* + * Use raw_smp_processor_id() to avoid preemptible warnings. + * We know that this is only called with all_cpus == false from + * works that have been queued with *_work_on() functions and + * those works are canceled during CPU_DOWN_PREPARE so they + * can't possibly run on any other CPU. + */ + __gov_queue_work(raw_smp_processor_id(), dbs_data, delay); } else { - get_online_cpus(); for_each_cpu(i, policy->cpus) __gov_queue_work(i, dbs_data, delay); - put_online_cpus(); } + +out_unlock: + mutex_unlock(&cpufreq_governor_lock); } EXPORT_SYMBOL_GPL(gov_queue_work); @@ -278,6 +285,9 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, return rc; } + if (!have_governor_per_policy()) + WARN_ON(cpufreq_get_global_kobject()); + rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr(dbs_data)); if (rc) { @@ -288,7 +298,7 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, policy->governor_data = dbs_data; - /* policy latency is in nS. Convert it to uS first */ + /* policy latency is in ns. Convert it to us first */ latency = policy->cpuinfo.transition_latency / 1000; if (latency == 0) latency = 1; @@ -316,6 +326,9 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr(dbs_data)); + if (!have_governor_per_policy()) + cpufreq_put_global_kobject(); + if ((dbs_data->cdata->governor == GOV_CONSERVATIVE) && (policy->governor->initialized == 1)) { struct cs_ops *cs_ops = dbs_data->cdata->gov_ops; @@ -339,12 +352,12 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, cs_tuners = dbs_data->tuners; cs_dbs_info = dbs_data->cdata->get_cpu_dbs_info_s(cpu); sampling_rate = cs_tuners->sampling_rate; - ignore_nice = cs_tuners->ignore_nice; + ignore_nice = cs_tuners->ignore_nice_load; } else { od_tuners = dbs_data->tuners; od_dbs_info = dbs_data->cdata->get_cpu_dbs_info_s(cpu); sampling_rate = od_tuners->sampling_rate; - ignore_nice = od_tuners->ignore_nice; + ignore_nice = od_tuners->ignore_nice_load; od_ops = dbs_data->cdata->gov_ops; io_busy = od_tuners->io_is_busy; } @@ -359,11 +372,18 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_common_info *j_cdbs = dbs_data->cdata->get_cpu_cdbs(j); + unsigned int prev_load; j_cdbs->cpu = j; j_cdbs->cur_policy = policy; j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy); + + prev_load = (unsigned int) + (j_cdbs->prev_cpu_wall - j_cdbs->prev_cpu_idle); + j_cdbs->prev_load = 100 * prev_load / + (unsigned int) j_cdbs->prev_cpu_wall; + if (ignore_nice) j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; @@ -373,10 +393,6 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, dbs_data->cdata->gov_dbs_timer); } - /* - * conservative does not implement micro like ondemand - * governor, thus we are bound to jiffes/HZ - */ if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { cs_dbs_info->down_skip = 0; cs_dbs_info->enable = 1; @@ -404,12 +420,18 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, mutex_lock(&dbs_data->mutex); mutex_destroy(&cpu_cdbs->timer_mutex); + cpu_cdbs->cur_policy = NULL; mutex_unlock(&dbs_data->mutex); break; case CPUFREQ_GOV_LIMITS: + mutex_lock(&dbs_data->mutex); + if (!cpu_cdbs->cur_policy) { + mutex_unlock(&dbs_data->mutex); + break; + } mutex_lock(&cpu_cdbs->timer_mutex); if (policy->max < cpu_cdbs->cur_policy->cur) __cpufreq_driver_target(cpu_cdbs->cur_policy, @@ -419,6 +441,7 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, policy->min, CPUFREQ_RELATION_L); dbs_check_cpu(dbs_data, cpu); mutex_unlock(&cpu_cdbs->timer_mutex); + mutex_unlock(&dbs_data->mutex); break; } return 0; |
