diff options
Diffstat (limited to 'drivers/cpufreq/cpufreq_governor.c')
| -rw-r--r-- | drivers/cpufreq/cpufreq_governor.c | 453 |
1 files changed, 292 insertions, 161 deletions
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 6c5f1d383cd..1b44496b2d2 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -16,80 +16,71 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <asm/cputime.h> -#include <linux/cpufreq.h> -#include <linux/cpumask.h> #include <linux/export.h> #include <linux/kernel_stat.h> -#include <linux/mutex.h> -#include <linux/tick.h> -#include <linux/types.h> -#include <linux/workqueue.h> +#include <linux/slab.h> #include "cpufreq_governor.h" -static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) +static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data) { - u64 idle_time; - u64 cur_wall_time; - u64 busy_time; - - cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); - - busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; - busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; - - idle_time = cur_wall_time - busy_time; - if (wall) - *wall = cputime_to_usecs(cur_wall_time); - - return cputime_to_usecs(idle_time); -} - -u64 get_cpu_idle_time(unsigned int cpu, u64 *wall) -{ - u64 idle_time = get_cpu_idle_time_us(cpu, NULL); - - if (idle_time == -1ULL) - return get_cpu_idle_time_jiffy(cpu, wall); + if (have_governor_per_policy()) + return dbs_data->cdata->attr_group_gov_pol; else - idle_time += get_cpu_iowait_time_us(cpu, wall); - - return idle_time; + return dbs_data->cdata->attr_group_gov_sys; } -EXPORT_SYMBOL_GPL(get_cpu_idle_time); void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) { - struct cpu_dbs_common_info *cdbs = dbs_data->get_cpu_cdbs(cpu); + struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); struct od_dbs_tuners *od_tuners = dbs_data->tuners; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; struct cpufreq_policy *policy; + unsigned int sampling_rate; unsigned int max_load = 0; unsigned int ignore_nice; unsigned int j; - if (dbs_data->governor == GOV_ONDEMAND) - ignore_nice = od_tuners->ignore_nice; - else - ignore_nice = cs_tuners->ignore_nice; + if (dbs_data->cdata->governor == GOV_ONDEMAND) { + struct od_cpu_dbs_info_s *od_dbs_info = + dbs_data->cdata->get_cpu_dbs_info_s(cpu); + + /* + * Sometimes, the ondemand governor uses an additional + * multiplier to give long delays. So apply this multiplier to + * the 'sampling_rate', so as to keep the wake-up-from-idle + * detection logic a bit conservative. + */ + sampling_rate = od_tuners->sampling_rate; + sampling_rate *= od_dbs_info->rate_mult; + + ignore_nice = od_tuners->ignore_nice_load; + } else { + sampling_rate = cs_tuners->sampling_rate; + ignore_nice = cs_tuners->ignore_nice_load; + } policy = cdbs->cur_policy; - /* Get Absolute Load (in terms of freq for ondemand gov) */ + /* Get Absolute Load */ for_each_cpu(j, policy->cpus) { struct cpu_dbs_common_info *j_cdbs; - u64 cur_wall_time, cur_idle_time, cur_iowait_time; - unsigned int idle_time, wall_time, iowait_time; + u64 cur_wall_time, cur_idle_time; + unsigned int idle_time, wall_time; unsigned int load; + int io_busy = 0; - j_cdbs = dbs_data->get_cpu_cdbs(j); + j_cdbs = dbs_data->cdata->get_cpu_cdbs(j); - cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); + /* + * For the purpose of ondemand, waiting for disk IO is + * an indication that you're performance critical, and + * not that the system is actually idle. So do not add + * the iowait time to the cpu idle time. + */ + if (dbs_data->cdata->governor == GOV_ONDEMAND) + io_busy = od_tuners->io_is_busy; + cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy); wall_time = (unsigned int) (cur_wall_time - j_cdbs->prev_cpu_wall); @@ -117,191 +108,330 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) idle_time += jiffies_to_usecs(cur_nice_jiffies); } - if (dbs_data->governor == GOV_ONDEMAND) { - struct od_cpu_dbs_info_s *od_j_dbs_info = - dbs_data->get_cpu_dbs_info_s(cpu); - - cur_iowait_time = get_cpu_iowait_time_us(j, - &cur_wall_time); - if (cur_iowait_time == -1ULL) - cur_iowait_time = 0; + if (unlikely(!wall_time || wall_time < idle_time)) + continue; - iowait_time = (unsigned int) (cur_iowait_time - - od_j_dbs_info->prev_cpu_iowait); - od_j_dbs_info->prev_cpu_iowait = cur_iowait_time; + /* + * If the CPU had gone completely idle, and a task just woke up + * on this CPU now, it would be unfair to calculate 'load' the + * usual way for this elapsed time-window, because it will show + * near-zero load, irrespective of how CPU intensive that task + * actually is. This is undesirable for latency-sensitive bursty + * workloads. + * + * To avoid this, we reuse the 'load' from the previous + * time-window and give this task a chance to start with a + * reasonably high CPU frequency. (However, we shouldn't over-do + * this copy, lest we get stuck at a high load (high frequency) + * for too long, even when the current system load has actually + * dropped down. So we perform the copy only once, upon the + * first wake-up from idle.) + * + * Detecting this situation is easy: the governor's deferrable + * timer would not have fired during CPU-idle periods. Hence + * an unusually large 'wall_time' (as compared to the sampling + * rate) indicates this scenario. + * + * prev_load can be zero in two cases and we must recalculate it + * for both cases: + * - during long idle intervals + * - explicitly set to zero + */ + if (unlikely(wall_time > (2 * sampling_rate) && + j_cdbs->prev_load)) { + load = j_cdbs->prev_load; /* - * For the purpose of ondemand, waiting for disk IO is - * an indication that you're performance critical, and - * not that the system is actually idle. So subtract the - * iowait time from the cpu idle time. + * Perform a destructive copy, to ensure that we copy + * the previous load only once, upon the first wake-up + * from idle. */ - if (od_tuners->io_is_busy && idle_time >= iowait_time) - idle_time -= iowait_time; + j_cdbs->prev_load = 0; + } else { + load = 100 * (wall_time - idle_time) / wall_time; + j_cdbs->prev_load = load; } - if (unlikely(!wall_time || wall_time < idle_time)) - continue; + if (load > max_load) + max_load = load; + } - load = 100 * (wall_time - idle_time) / wall_time; + dbs_data->cdata->gov_check_cpu(cpu, max_load); +} +EXPORT_SYMBOL_GPL(dbs_check_cpu); - if (dbs_data->governor == GOV_ONDEMAND) { - int freq_avg = __cpufreq_driver_getavg(policy, j); - if (freq_avg <= 0) - freq_avg = policy->cur; +static inline void __gov_queue_work(int cpu, struct dbs_data *dbs_data, + unsigned int delay) +{ + struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); - load *= freq_avg; - } + mod_delayed_work_on(cpu, system_wq, &cdbs->work, delay); +} - if (load > max_load) - max_load = load; +void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy, + unsigned int delay, bool all_cpus) +{ + int i; + + mutex_lock(&cpufreq_governor_lock); + if (!policy->governor_enabled) + goto out_unlock; + + if (!all_cpus) { + /* + * Use raw_smp_processor_id() to avoid preemptible warnings. + * We know that this is only called with all_cpus == false from + * works that have been queued with *_work_on() functions and + * those works are canceled during CPU_DOWN_PREPARE so they + * can't possibly run on any other CPU. + */ + __gov_queue_work(raw_smp_processor_id(), dbs_data, delay); + } else { + for_each_cpu(i, policy->cpus) + __gov_queue_work(i, dbs_data, delay); } - dbs_data->gov_check_cpu(cpu, max_load); +out_unlock: + mutex_unlock(&cpufreq_governor_lock); +} +EXPORT_SYMBOL_GPL(gov_queue_work); + +static inline void gov_cancel_work(struct dbs_data *dbs_data, + struct cpufreq_policy *policy) +{ + struct cpu_dbs_common_info *cdbs; + int i; + + for_each_cpu(i, policy->cpus) { + cdbs = dbs_data->cdata->get_cpu_cdbs(i); + cancel_delayed_work_sync(&cdbs->work); + } } -EXPORT_SYMBOL_GPL(dbs_check_cpu); -static inline void dbs_timer_init(struct dbs_data *dbs_data, - struct cpu_dbs_common_info *cdbs, unsigned int sampling_rate) +/* Will return if we need to evaluate cpu load again or not */ +bool need_load_eval(struct cpu_dbs_common_info *cdbs, + unsigned int sampling_rate) { - int delay = delay_for_sampling_rate(sampling_rate); + if (policy_is_shared(cdbs->cur_policy)) { + ktime_t time_now = ktime_get(); + s64 delta_us = ktime_us_delta(time_now, cdbs->time_stamp); + + /* Do nothing if we recently have sampled */ + if (delta_us < (s64)(sampling_rate / 2)) + return false; + else + cdbs->time_stamp = time_now; + } - INIT_DEFERRABLE_WORK(&cdbs->work, dbs_data->gov_dbs_timer); - schedule_delayed_work_on(cdbs->cpu, &cdbs->work, delay); + return true; } +EXPORT_SYMBOL_GPL(need_load_eval); -static inline void dbs_timer_exit(struct cpu_dbs_common_info *cdbs) +static void set_sampling_rate(struct dbs_data *dbs_data, + unsigned int sampling_rate) { - cancel_delayed_work_sync(&cdbs->work); + if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { + struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; + cs_tuners->sampling_rate = sampling_rate; + } else { + struct od_dbs_tuners *od_tuners = dbs_data->tuners; + od_tuners->sampling_rate = sampling_rate; + } } -int cpufreq_governor_dbs(struct dbs_data *dbs_data, - struct cpufreq_policy *policy, unsigned int event) +int cpufreq_governor_dbs(struct cpufreq_policy *policy, + struct common_dbs_data *cdata, unsigned int event) { + struct dbs_data *dbs_data; struct od_cpu_dbs_info_s *od_dbs_info = NULL; struct cs_cpu_dbs_info_s *cs_dbs_info = NULL; - struct od_dbs_tuners *od_tuners = dbs_data->tuners; - struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; + struct od_ops *od_ops = NULL; + struct od_dbs_tuners *od_tuners = NULL; + struct cs_dbs_tuners *cs_tuners = NULL; struct cpu_dbs_common_info *cpu_cdbs; - unsigned int *sampling_rate, latency, ignore_nice, j, cpu = policy->cpu; + unsigned int sampling_rate, latency, ignore_nice, j, cpu = policy->cpu; + int io_busy = 0; int rc; - cpu_cdbs = dbs_data->get_cpu_cdbs(cpu); + if (have_governor_per_policy()) + dbs_data = policy->governor_data; + else + dbs_data = cdata->gdbs_data; - if (dbs_data->governor == GOV_CONSERVATIVE) { - cs_dbs_info = dbs_data->get_cpu_dbs_info_s(cpu); - sampling_rate = &cs_tuners->sampling_rate; - ignore_nice = cs_tuners->ignore_nice; - } else { - od_dbs_info = dbs_data->get_cpu_dbs_info_s(cpu); - sampling_rate = &od_tuners->sampling_rate; - ignore_nice = od_tuners->ignore_nice; - } + WARN_ON(!dbs_data && (event != CPUFREQ_GOV_POLICY_INIT)); switch (event) { - case CPUFREQ_GOV_START: - if ((!cpu_online(cpu)) || (!policy->cur)) - return -EINVAL; - - mutex_lock(&dbs_data->mutex); + case CPUFREQ_GOV_POLICY_INIT: + if (have_governor_per_policy()) { + WARN_ON(dbs_data); + } else if (dbs_data) { + dbs_data->usage_count++; + policy->governor_data = dbs_data; + return 0; + } - dbs_data->enable++; - cpu_cdbs->cpu = cpu; - for_each_cpu(j, policy->cpus) { - struct cpu_dbs_common_info *j_cdbs; - j_cdbs = dbs_data->get_cpu_cdbs(j); + dbs_data = kzalloc(sizeof(*dbs_data), GFP_KERNEL); + if (!dbs_data) { + pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__); + return -ENOMEM; + } - j_cdbs->cur_policy = policy; - j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, - &j_cdbs->prev_cpu_wall); - if (ignore_nice) - j_cdbs->prev_cpu_nice = - kcpustat_cpu(j).cpustat[CPUTIME_NICE]; + dbs_data->cdata = cdata; + dbs_data->usage_count = 1; + rc = cdata->init(dbs_data); + if (rc) { + pr_err("%s: POLICY_INIT: init() failed\n", __func__); + kfree(dbs_data); + return rc; } - /* - * Start the timerschedule work, when this governor is used for - * first time - */ - if (dbs_data->enable != 1) - goto second_time; + if (!have_governor_per_policy()) + WARN_ON(cpufreq_get_global_kobject()); - rc = sysfs_create_group(cpufreq_global_kobject, - dbs_data->attr_group); + rc = sysfs_create_group(get_governor_parent_kobj(policy), + get_sysfs_attr(dbs_data)); if (rc) { - mutex_unlock(&dbs_data->mutex); + cdata->exit(dbs_data); + kfree(dbs_data); return rc; } - /* policy latency is in nS. Convert it to uS first */ + policy->governor_data = dbs_data; + + /* policy latency is in ns. Convert it to us first */ latency = policy->cpuinfo.transition_latency / 1000; if (latency == 0) latency = 1; - /* - * conservative does not implement micro like ondemand - * governor, thus we are bound to jiffes/HZ - */ - if (dbs_data->governor == GOV_CONSERVATIVE) { - struct cs_ops *ops = dbs_data->gov_ops; + /* Bring kernel and HW constraints together */ + dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate, + MIN_LATENCY_MULTIPLIER * latency); + set_sampling_rate(dbs_data, max(dbs_data->min_sampling_rate, + latency * LATENCY_MULTIPLIER)); - cpufreq_register_notifier(ops->notifier_block, + if ((cdata->governor == GOV_CONSERVATIVE) && + (!policy->governor->initialized)) { + struct cs_ops *cs_ops = dbs_data->cdata->gov_ops; + + cpufreq_register_notifier(cs_ops->notifier_block, CPUFREQ_TRANSITION_NOTIFIER); + } - dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * - jiffies_to_usecs(10); - } else { - struct od_ops *ops = dbs_data->gov_ops; + if (!have_governor_per_policy()) + cdata->gdbs_data = dbs_data; + + return 0; + case CPUFREQ_GOV_POLICY_EXIT: + if (!--dbs_data->usage_count) { + sysfs_remove_group(get_governor_parent_kobj(policy), + get_sysfs_attr(dbs_data)); + + if (!have_governor_per_policy()) + cpufreq_put_global_kobject(); + + if ((dbs_data->cdata->governor == GOV_CONSERVATIVE) && + (policy->governor->initialized == 1)) { + struct cs_ops *cs_ops = dbs_data->cdata->gov_ops; - od_tuners->io_is_busy = ops->io_busy(); + cpufreq_unregister_notifier(cs_ops->notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + + cdata->exit(dbs_data); + kfree(dbs_data); + cdata->gdbs_data = NULL; } - /* Bring kernel and HW constraints together */ - dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate, - MIN_LATENCY_MULTIPLIER * latency); - *sampling_rate = max(dbs_data->min_sampling_rate, latency * - LATENCY_MULTIPLIER); + policy->governor_data = NULL; + return 0; + } + + cpu_cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); -second_time: - if (dbs_data->governor == GOV_CONSERVATIVE) { + if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { + cs_tuners = dbs_data->tuners; + cs_dbs_info = dbs_data->cdata->get_cpu_dbs_info_s(cpu); + sampling_rate = cs_tuners->sampling_rate; + ignore_nice = cs_tuners->ignore_nice_load; + } else { + od_tuners = dbs_data->tuners; + od_dbs_info = dbs_data->cdata->get_cpu_dbs_info_s(cpu); + sampling_rate = od_tuners->sampling_rate; + ignore_nice = od_tuners->ignore_nice_load; + od_ops = dbs_data->cdata->gov_ops; + io_busy = od_tuners->io_is_busy; + } + + switch (event) { + case CPUFREQ_GOV_START: + if (!policy->cur) + return -EINVAL; + + mutex_lock(&dbs_data->mutex); + + for_each_cpu(j, policy->cpus) { + struct cpu_dbs_common_info *j_cdbs = + dbs_data->cdata->get_cpu_cdbs(j); + unsigned int prev_load; + + j_cdbs->cpu = j; + j_cdbs->cur_policy = policy; + j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, + &j_cdbs->prev_cpu_wall, io_busy); + + prev_load = (unsigned int) + (j_cdbs->prev_cpu_wall - j_cdbs->prev_cpu_idle); + j_cdbs->prev_load = 100 * prev_load / + (unsigned int) j_cdbs->prev_cpu_wall; + + if (ignore_nice) + j_cdbs->prev_cpu_nice = + kcpustat_cpu(j).cpustat[CPUTIME_NICE]; + + mutex_init(&j_cdbs->timer_mutex); + INIT_DEFERRABLE_WORK(&j_cdbs->work, + dbs_data->cdata->gov_dbs_timer); + } + + if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { cs_dbs_info->down_skip = 0; cs_dbs_info->enable = 1; cs_dbs_info->requested_freq = policy->cur; } else { - struct od_ops *ops = dbs_data->gov_ops; od_dbs_info->rate_mult = 1; od_dbs_info->sample_type = OD_NORMAL_SAMPLE; - ops->powersave_bias_init_cpu(cpu); + od_ops->powersave_bias_init_cpu(cpu); } + mutex_unlock(&dbs_data->mutex); - mutex_init(&cpu_cdbs->timer_mutex); - dbs_timer_init(dbs_data, cpu_cdbs, *sampling_rate); + /* Initiate timer time stamp */ + cpu_cdbs->time_stamp = ktime_get(); + + gov_queue_work(dbs_data, policy, + delay_for_sampling_rate(sampling_rate), true); break; case CPUFREQ_GOV_STOP: - if (dbs_data->governor == GOV_CONSERVATIVE) + if (dbs_data->cdata->governor == GOV_CONSERVATIVE) cs_dbs_info->enable = 0; - dbs_timer_exit(cpu_cdbs); + gov_cancel_work(dbs_data, policy); mutex_lock(&dbs_data->mutex); mutex_destroy(&cpu_cdbs->timer_mutex); - dbs_data->enable--; - if (!dbs_data->enable) { - struct cs_ops *ops = dbs_data->gov_ops; - - sysfs_remove_group(cpufreq_global_kobject, - dbs_data->attr_group); - if (dbs_data->governor == GOV_CONSERVATIVE) - cpufreq_unregister_notifier(ops->notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - } + cpu_cdbs->cur_policy = NULL; + mutex_unlock(&dbs_data->mutex); break; case CPUFREQ_GOV_LIMITS: + mutex_lock(&dbs_data->mutex); + if (!cpu_cdbs->cur_policy) { + mutex_unlock(&dbs_data->mutex); + break; + } mutex_lock(&cpu_cdbs->timer_mutex); if (policy->max < cpu_cdbs->cur_policy->cur) __cpufreq_driver_target(cpu_cdbs->cur_policy, @@ -311,6 +441,7 @@ second_time: policy->min, CPUFREQ_RELATION_L); dbs_check_cpu(dbs_data, cpu); mutex_unlock(&cpu_cdbs->timer_mutex); + mutex_unlock(&dbs_data->mutex); break; } return 0; |
