33 files changed, 1869 insertions, 259 deletions
diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
index c69d1e07a3a..f1238896785 100644
--- a/drivers/gpu/drm/msm/Kconfig
+++ b/drivers/gpu/drm/msm/Kconfig
@@ -3,7 +3,7 @@ config DRM_MSM
 	tristate "MSM DRM"
 	depends on DRM
 	depends on MSM_IOMMU
-	depends on (ARCH_MSM && ARCH_MSM8960) || (ARM && COMPILE_TEST)
+	depends on ARCH_QCOM || (ARM && COMPILE_TEST)
 	select DRM_KMS_HELPER
 	select SHMEM
 	select TMPFS
diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile
index 4f977a593be..93ca49c8df4 100644
--- a/drivers/gpu/drm/msm/Makefile
+++ b/drivers/gpu/drm/msm/Makefile
@@ -7,6 +7,7 @@ msm-y := \
 	adreno/adreno_gpu.o \
 	adreno/a3xx_gpu.o \
 	hdmi/hdmi.o \
+	hdmi/hdmi_audio.o \
 	hdmi/hdmi_bridge.o \
 	hdmi/hdmi_connector.o \
 	hdmi/hdmi_i2c.o \
@@ -33,6 +34,8 @@ msm-y := \
 	msm_gem_submit.o \
 	msm_gpu.o \
 	msm_iommu.o \
+	msm_perf.o \
+	msm_rd.o \
 	msm_ringbuffer.o
 
 msm-$(CONFIG_DRM_MSM_FBDEV) += msm_fbdev.o
diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
index 461df93e825..942e09d898a 100644
--- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
@@ -35,7 +35,11 @@
 	 A3XX_INT0_CP_AHB_ERROR_HALT |     \
 	 A3XX_INT0_UCHE_OOB_ACCESS)
 
-static struct platform_device *a3xx_pdev;
+
+static bool hang_debug = false;
+MODULE_PARM_DESC(hang_debug, "Dump registers when hang is detected (can be slow!)");
+module_param_named(hang_debug, hang_debug, bool, 0600);
+static void a3xx_dump(struct msm_gpu *gpu);
 
 static void a3xx_me_init(struct msm_gpu *gpu)
 {
@@ -203,11 +207,11 @@ static int a3xx_hw_init(struct msm_gpu *gpu)
 	/* Turn on performance counters: */
 	gpu_write(gpu, REG_A3XX_RBBM_PERFCTR_CTL, 0x01);
 
-	/* Set SP perfcounter 7 to count SP_FS_FULL_ALU_INSTRUCTIONS
-	 * we will use this to augment our hang detection:
-	 */
-	gpu_write(gpu, REG_A3XX_SP_PERFCOUNTER7_SELECT,
-			SP_FS_FULL_ALU_INSTRUCTIONS);
+	/* Enable the perfcntrs that we use.. */
+	for (i = 0; i < gpu->num_perfcntrs; i++) {
+		const struct msm_gpu_perfcntr *perfcntr = &gpu->perfcntrs[i];
+		gpu_write(gpu, perfcntr->select_reg, perfcntr->select_val);
+	}
 
 	gpu_write(gpu, REG_A3XX_RBBM_INT_0_MASK, A3XX_INT0_MASK);
 
@@ -291,6 +295,9 @@ static int a3xx_hw_init(struct msm_gpu *gpu)
 
 static void a3xx_recover(struct msm_gpu *gpu)
 {
+	/* dump registers before resetting gpu, if enabled: */
+	if (hang_debug)
+		a3xx_dump(gpu);
 	gpu_write(gpu, REG_A3XX_RBBM_SW_RESET_CMD, 1);
 	gpu_read(gpu, REG_A3XX_RBBM_SW_RESET_CMD);
 	gpu_write(gpu, REG_A3XX_RBBM_SW_RESET_CMD, 0);
@@ -311,27 +318,18 @@ static void a3xx_destroy(struct msm_gpu *gpu)
 		ocmem_free(OCMEM_GRAPHICS, a3xx_gpu->ocmem_hdl);
 #endif
 
-	put_device(&a3xx_gpu->pdev->dev);
 	kfree(a3xx_gpu);
 }
 
 static void a3xx_idle(struct msm_gpu *gpu)
 {
-	unsigned long t;
-
 	/* wait for ringbuffer to drain: */
 	adreno_idle(gpu);
 
-	t = jiffies + ADRENO_IDLE_TIMEOUT;
-
 	/* then wait for GPU to finish: */
-	do {
-		uint32_t rbbm_status = gpu_read(gpu, REG_A3XX_RBBM_STATUS);
-		if (!(rbbm_status & A3XX_RBBM_STATUS_GPU_BUSY))
-			return;
-	} while(time_before(jiffies, t));
-
-	DRM_ERROR("timeout waiting for %s to idle!\n", gpu->name);
+	if (spin_until(!(gpu_read(gpu, REG_A3XX_RBBM_STATUS) &
+			A3XX_RBBM_STATUS_GPU_BUSY)))
+		DRM_ERROR("%s: timeout waiting for GPU to idle!\n", gpu->name);
 
 	/* TODO maybe we need to reset GPU here to recover from hang? */
 }
@@ -352,7 +350,6 @@ static irqreturn_t a3xx_irq(struct msm_gpu *gpu)
 	return IRQ_HANDLED;
 }
 
-#ifdef CONFIG_DEBUG_FS
 static const unsigned int a3xx_registers[] = {
 	0x0000, 0x0002, 0x0010, 0x0012, 0x0018, 0x0018, 0x0020, 0x0027,
 	0x0029, 0x002b, 0x002e, 0x0033, 0x0040, 0x0042, 0x0050, 0x005c,
@@ -392,11 +389,18 @@ static const unsigned int a3xx_registers[] = {
 	0x303c, 0x303c, 0x305e, 0x305f,
 };
 
+#ifdef CONFIG_DEBUG_FS
 static void a3xx_show(struct msm_gpu *gpu, struct seq_file *m)
 {
+	struct drm_device *dev = gpu->dev;
 	int i;
 
 	adreno_show(gpu, m);
+
+	mutex_lock(&dev->struct_mutex);
+
+	gpu->funcs->pm_resume(gpu);
+
 	seq_printf(m, "status:   %08x\n",
 			gpu_read(gpu, REG_A3XX_RBBM_STATUS));
 
@@ -412,9 +416,36 @@ static void a3xx_show(struct msm_gpu *gpu, struct seq_file *m)
 			seq_printf(m, "IO:R %08x %08x\n", addr<<2, val);
 		}
 	}
+
+	gpu->funcs->pm_suspend(gpu);
+
+	mutex_unlock(&dev->struct_mutex);
 }
 #endif
 
+/* would be nice to not have to duplicate the _show() stuff with printk(): */
+static void a3xx_dump(struct msm_gpu *gpu)
+{
+	int i;
+
+	adreno_dump(gpu);
+	printk("status:   %08x\n",
+			gpu_read(gpu, REG_A3XX_RBBM_STATUS));
+
+	/* dump these out in a form that can be parsed by demsm: */
+	printk("IO:region %s 00000000 00020000\n", gpu->name);
+	for (i = 0; i < ARRAY_SIZE(a3xx_registers); i += 2) {
+		uint32_t start = a3xx_registers[i];
+		uint32_t end   = a3xx_registers[i+1];
+		uint32_t addr;
+
+		for (addr = start; addr <= end; addr++) {
+			uint32_t val = gpu_read(gpu, addr);
+			printk("IO:R %08x %08x\n", addr<<2, val);
+		}
+	}
+}
+
 static const struct adreno_gpu_funcs funcs = {
 	.base = {
 		.get_param = adreno_get_param,
@@ -434,12 +465,20 @@ static const struct adreno_gpu_funcs funcs = {
 	},
 };
 
+static const struct msm_gpu_perfcntr perfcntrs[] = {
+	{ REG_A3XX_SP_PERFCOUNTER6_SELECT, REG_A3XX_RBBM_PERFCTR_SP_6_LO,
+			SP_ALU_ACTIVE_CYCLES, "ALUACTIVE" },
+	{ REG_A3XX_SP_PERFCOUNTER7_SELECT, REG_A3XX_RBBM_PERFCTR_SP_7_LO,
+			SP_FS_FULL_ALU_INSTRUCTIONS, "ALUFULL" },
+};
+
 struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
 {
 	struct a3xx_gpu *a3xx_gpu = NULL;
 	struct adreno_gpu *adreno_gpu;
 	struct msm_gpu *gpu;
-	struct platform_device *pdev = a3xx_pdev;
+	struct msm_drm_private *priv = dev->dev_private;
+	struct platform_device *pdev = priv->gpu_pdev;
 	struct adreno_platform_config *config;
 	int ret;
 
@@ -460,7 +499,6 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
 	adreno_gpu = &a3xx_gpu->base;
 	gpu = &adreno_gpu->base;
 
-	get_device(&pdev->dev);
 	a3xx_gpu->pdev = pdev;
 
 	gpu->fast_rate = config->fast_rate;
@@ -473,6 +511,9 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
 	DBG("fast_rate=%u, slow_rate=%u, bus_freq=%u",
 			gpu->fast_rate, gpu->slow_rate, gpu->bus_freq);
 
+	gpu->perfcntrs = perfcntrs;
+	gpu->num_perfcntrs = ARRAY_SIZE(perfcntrs);
+
 	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, config->rev);
 	if (ret)
 		goto fail;
@@ -522,17 +563,24 @@ fail:
 #  include <mach/kgsl.h>
 #endif
 
-static int a3xx_probe(struct platform_device *pdev)
+static void set_gpu_pdev(struct drm_device *dev,
+		struct platform_device *pdev)
+{
+	struct msm_drm_private *priv = dev->dev_private;
+	priv->gpu_pdev = pdev;
+}
+
+static int a3xx_bind(struct device *dev, struct device *master, void *data)
 {
 	static struct adreno_platform_config config = {};
 #ifdef CONFIG_OF
-	struct device_node *child, *node = pdev->dev.of_node;
+	struct device_node *child, *node = dev->of_node;
 	u32 val;
 	int ret;
 
 	ret = of_property_read_u32(node, "qcom,chipid", &val);
 	if (ret) {
-		dev_err(&pdev->dev, "could not find chipid: %d\n", ret);
+		dev_err(dev, "could not find chipid: %d\n", ret);
 		return ret;
 	}
 
@@ -548,7 +596,7 @@ static int a3xx_probe(struct platform_device *pdev)
 			for_each_child_of_node(child, pwrlvl) {
 				ret = of_property_read_u32(pwrlvl, "qcom,gpu-freq", &val);
 				if (ret) {
-					dev_err(&pdev->dev, "could not find gpu-freq: %d\n", ret);
+					dev_err(dev, "could not find gpu-freq: %d\n", ret);
 					return ret;
 				}
 				config.fast_rate = max(config.fast_rate, val);
@@ -558,12 +606,12 @@ static int a3xx_probe(struct platform_device *pdev)
 	}
 
 	if (!config.fast_rate) {
-		dev_err(&pdev->dev, "could not find clk rates\n");
+		dev_err(dev, "could not find clk rates\n");
 		return -ENXIO;
 	}
 
 #else
-	struct kgsl_device_platform_data *pdata = pdev->dev.platform_data;
+	struct kgsl_device_platform_data *pdata = dev->platform_data;
 	uint32_t version = socinfo_get_version();
 	if (cpu_is_apq8064ab()) {
 		config.fast_rate = 450000000;
@@ -609,14 +657,30 @@ static int a3xx_probe(struct platform_device *pdev)
 	config.bus_scale_table = pdata->bus_scale_table;
 #  endif
 #endif
-	pdev->dev.platform_data = &config;
-	a3xx_pdev = pdev;
+	dev->platform_data = &config;
+	set_gpu_pdev(dev_get_drvdata(master), to_platform_device(dev));
 	return 0;
 }
 
+static void a3xx_unbind(struct device *dev, struct device *master,
+		void *data)
+{
+	set_gpu_pdev(dev_get_drvdata(master), NULL);
+}
+
+static const struct component_ops a3xx_ops = {
+		.bind   = a3xx_bind,
+		.unbind = a3xx_unbind,
+};
+
+static int a3xx_probe(struct platform_device *pdev)
+{
+	return component_add(&pdev->dev, &a3xx_ops);
+}
+
 static int a3xx_remove(struct platform_device *pdev)
 {
-	a3xx_pdev = NULL;
+	component_del(&pdev->dev, &a3xx_ops);
 	return 0;
 }
 
@@ -624,7 +688,6 @@ static const struct of_device_id dt_match[] = {
 	{ .compatible = "qcom,kgsl-3d0" },
 	{}
 };
-MODULE_DEVICE_TABLE(of, dt_match);
 
 static struct platform_driver a3xx_driver = {
 	.probe = a3xx_probe,
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index d321099abdd..28ca8cd8b09 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -73,6 +73,12 @@ int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value)
 	case MSM_PARAM_GMEM_SIZE:
 		*value = adreno_gpu->gmem;
 		return 0;
+	case MSM_PARAM_CHIP_ID:
+		*value = adreno_gpu->rev.patchid |
+				(adreno_gpu->rev.minor << 8) |
+				(adreno_gpu->rev.major << 16) |
+				(adreno_gpu->rev.core << 24);
+		return 0;
 	default:
 		DBG("%s: invalid param: %u", gpu->name, param);
 		return -EINVAL;
@@ -225,19 +231,11 @@ void adreno_flush(struct msm_gpu *gpu)
 void adreno_idle(struct msm_gpu *gpu)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
-	uint32_t rptr, wptr = get_wptr(gpu->rb);
-	unsigned long t;
-
-	t = jiffies + ADRENO_IDLE_TIMEOUT;
-
-	/* then wait for CP to drain ringbuffer: */
-	do {
-		rptr = adreno_gpu->memptrs->rptr;
-		if (rptr == wptr)
-			return;
-	} while(time_before(jiffies, t));
+	uint32_t wptr = get_wptr(gpu->rb);
 
-	DRM_ERROR("%s: timeout waiting to drain ringbuffer!\n", gpu->name);
+	/* wait for CP to drain ringbuffer: */
+	if (spin_until(adreno_gpu->memptrs->rptr == wptr))
+		DRM_ERROR("%s: timeout waiting to drain ringbuffer!\n", gpu->name);
 
 	/* TODO maybe we need to reset GPU here to recover from hang? */
 }
@@ -260,22 +258,37 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m)
 }
 #endif
 
-void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords)
+/* would be nice to not have to duplicate the _show() stuff with printk(): */
+void adreno_dump(struct msm_gpu *gpu)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
-	uint32_t freedwords;
-	unsigned long t = jiffies + ADRENO_IDLE_TIMEOUT;
-	do {
-		uint32_t size = gpu->rb->size / 4;
-		uint32_t wptr = get_wptr(gpu->rb);
-		uint32_t rptr = adreno_gpu->memptrs->rptr;
-		freedwords = (rptr + (size - 1) - wptr) % size;
-
-		if (time_after(jiffies, t)) {
-			DRM_ERROR("%s: timeout waiting for ringbuffer space\n", gpu->name);
-			break;
-		}
-	} while(freedwords < ndwords);
+
+	printk("revision: %d (%d.%d.%d.%d)\n",
+			adreno_gpu->info->revn, adreno_gpu->rev.core,
+			adreno_gpu->rev.major, adreno_gpu->rev.minor,
+			adreno_gpu->rev.patchid);
+
+	printk("fence:    %d/%d\n", adreno_gpu->memptrs->fence,
+			gpu->submitted_fence);
+	printk("rptr:     %d\n", adreno_gpu->memptrs->rptr);
+	printk("wptr:     %d\n", adreno_gpu->memptrs->wptr);
+	printk("rb wptr:  %d\n", get_wptr(gpu->rb));
+
+}
+
+static uint32_t ring_freewords(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	uint32_t size = gpu->rb->size / 4;
+	uint32_t wptr = get_wptr(gpu->rb);
+	uint32_t rptr = adreno_gpu->memptrs->rptr;
+	return (rptr + (size - 1) - wptr) % size;
+}
+
+void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords)
+{
+	if (spin_until(ring_freewords(gpu) >= ndwords))
+		DRM_ERROR("%s: timeout waiting for ringbuffer space\n", gpu->name);
 }
 
 static const char *iommu_ports[] = {
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
index ca11ea4da16..63c36ce3302 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
@@ -76,7 +76,20 @@ struct adreno_platform_config {
 #endif
 };
 
-#define ADRENO_IDLE_TIMEOUT (20 * 1000)
+#define ADRENO_IDLE_TIMEOUT msecs_to_jiffies(1000)
+
+#define spin_until(X) ({                                   \
+	int __ret = -ETIMEDOUT;                            \
+	unsigned long __t = jiffies + ADRENO_IDLE_TIMEOUT; \
+	do {                                               \
+		if (X) {                                   \
+			__ret = 0;                         \
+			break;                             \
+		}                                          \
+	} while (time_before(jiffies, __t));               \
+	__ret;                                             \
+})
+
 
 static inline bool adreno_is_a3xx(struct adreno_gpu *gpu)
 {
@@ -114,6 +127,7 @@ void adreno_idle(struct msm_gpu *gpu);
 #ifdef CONFIG_DEBUG_FS
 void adreno_show(struct msm_gpu *gpu, struct seq_file *m);
 #endif
+void adreno_dump(struct msm_gpu *gpu);
 void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords);
 
 int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
diff --git a/drivers/gpu/drm/msm/hdmi/hdmi.c b/drivers/gpu/drm/msm/hdmi/hdmi.c
index 6f1588aa907..7f7aadef8a8 100644
--- a/drivers/gpu/drm/msm/hdmi/hdmi.c
+++ b/drivers/gpu/drm/msm/hdmi/hdmi.c
@@ -17,8 +17,6 @@
 
 #include "hdmi.h"
 
-static struct platform_device *hdmi_pdev;
-
 void hdmi_set_mode(struct hdmi *hdmi, bool power_on)
 {
 	uint32_t ctrl = 0;
@@ -67,7 +65,7 @@ void hdmi_destroy(struct kref *kref)
 	if (hdmi->i2c)
 		hdmi_i2c_destroy(hdmi->i2c);
 
-	put_device(&hdmi->pdev->dev);
+	platform_set_drvdata(hdmi->pdev, NULL);
 }
 
 /* initialize connector */
@@ -75,7 +73,7 @@ struct hdmi *hdmi_init(struct drm_device *dev, struct drm_encoder *encoder)
 {
 	struct hdmi *hdmi = NULL;
 	struct msm_drm_private *priv = dev->dev_private;
-	struct platform_device *pdev = hdmi_pdev;
+	struct platform_device *pdev = priv->hdmi_pdev;
 	struct hdmi_platform_config *config;
 	int i, ret;
 
@@ -95,13 +93,13 @@ struct hdmi *hdmi_init(struct drm_device *dev, struct drm_encoder *encoder)
 
 	kref_init(&hdmi->refcount);
 
-	get_device(&pdev->dev);
-
 	hdmi->dev = dev;
 	hdmi->pdev = pdev;
 	hdmi->config = config;
 	hdmi->encoder = encoder;
 
+	hdmi_audio_infoframe_init(&hdmi->audio.infoframe);
+
 	/* not sure about which phy maps to which msm.. probably I miss some */
 	if (config->phy_init)
 		hdmi->phy = config->phy_init(hdmi);
@@ -228,6 +226,8 @@ struct hdmi *hdmi_init(struct drm_device *dev, struct drm_encoder *encoder)
 	priv->bridges[priv->num_bridges++]       = hdmi->bridge;
 	priv->connectors[priv->num_connectors++] = hdmi->connector;
 
+	platform_set_drvdata(pdev, hdmi);
+
 	return hdmi;
 
 fail:
@@ -249,17 +249,24 @@ fail:
 
 #include <linux/of_gpio.h>
 
-static int hdmi_dev_probe(struct platform_device *pdev)
+static void set_hdmi_pdev(struct drm_device *dev,
+		struct platform_device *pdev)
+{
+	struct msm_drm_private *priv = dev->dev_private;
+	priv->hdmi_pdev = pdev;
+}
+
+static int hdmi_bind(struct device *dev, struct device *master, void *data)
 {
 	static struct hdmi_platform_config config = {};
 #ifdef CONFIG_OF
-	struct device_node *of_node = pdev->dev.of_node;
+	struct device_node *of_node = dev->of_node;
 
 	int get_gpio(const char *name)
 	{
 		int gpio = of_get_named_gpio(of_node, name, 0);
 		if (gpio < 0) {
-			dev_err(&pdev->dev, "failed to get gpio: %s (%d)\n",
+			dev_err(dev, "failed to get gpio: %s (%d)\n",
 					name, gpio);
 			gpio = -1;
 		}
@@ -270,6 +277,7 @@ static int hdmi_dev_probe(struct platform_device *pdev)
 	static const char *hpd_reg_names[] = {"hpd-gdsc", "hpd-5v"};
 	static const char *pwr_reg_names[] = {"core-vdda", "core-vcc"};
 	static const char *hpd_clk_names[] = {"iface_clk", "core_clk", "mdp_core_clk"};
+	static unsigned long hpd_clk_freq[] = {0, 19200000, 0};
 	static const char *pwr_clk_names[] = {"extp_clk", "alt_iface_clk"};
 
 	config.phy_init      = hdmi_phy_8x74_init;
@@ -279,6 +287,7 @@ static int hdmi_dev_probe(struct platform_device *pdev)
 	config.pwr_reg_names = pwr_reg_names;
 	config.pwr_reg_cnt   = ARRAY_SIZE(pwr_reg_names);
 	config.hpd_clk_names = hpd_clk_names;
+	config.hpd_freq      = hpd_clk_freq;
 	config.hpd_clk_cnt   = ARRAY_SIZE(hpd_clk_names);
 	config.pwr_clk_names = pwr_clk_names;
 	config.pwr_clk_cnt   = ARRAY_SIZE(pwr_clk_names);
@@ -305,7 +314,7 @@ static int hdmi_dev_probe(struct platform_device *pdev)
 		config.ddc_data_gpio = 71;
 		config.hpd_gpio      = 72;
 		config.mux_en_gpio   = -1;
-		config.mux_sel_gpio  = 13 + NR_GPIO_IRQS;
+		config.mux_sel_gpio  = -1;
 	} else if (cpu_is_msm8960() || cpu_is_msm8960ab()) {
 		static const char *hpd_reg_names[] = {"8921_hdmi_mvs"};
 		config.phy_init      = hdmi_phy_8960_init;
@@ -336,14 +345,30 @@ static int hdmi_dev_probe(struct platform_device *pdev)
 		config.mux_sel_gpio  = -1;
 	}
 #endif
-	pdev->dev.platform_data = &config;
-	hdmi_pdev = pdev;
+	dev->platform_data = &config;
+	set_hdmi_pdev(dev_get_drvdata(master), to_platform_device(dev));
 	return 0;
 }
 
+static void hdmi_unbind(struct device *dev, struct device *master,
+		void *data)
+{
+	set_hdmi_pdev(dev_get_drvdata(master), NULL);
+}
+
+static const struct component_ops hdmi_ops = {
+		.bind   = hdmi_bind,
+		.unbind = hdmi_unbind,
+};
+
+static int hdmi_dev_probe(struct platform_device *pdev)
+{
+	return component_add(&pdev->dev, &hdmi_ops);
+}
+
 static int hdmi_dev_remove(struct platform_device *pdev)
 {
-	hdmi_pdev = NULL;
+	component_del(&pdev->dev, &hdmi_ops);
 	return 0;
 }
 
@@ -351,7 +376,6 @@ static const struct of_device_id dt_match[] = {
 	{ .compatible = "qcom,hdmi-tx" },
 	{}
 };
-MODULE_DEVICE_TABLE(of, dt_match);
 
 static struct platform_driver hdmi_driver = {
 	.probe = hdmi_dev_probe,
diff --git a/drivers/gpu/drm/msm/hdmi/hdmi.h b/drivers/gpu/drm/msm/hdmi/hdmi.h
index 41b29add70b..9d7723c6528 100644
--- a/drivers/gpu/drm/msm/hdmi/hdmi.h
+++ b/drivers/gpu/drm/msm/hdmi/hdmi.h
@@ -22,6 +22,7 @@
 #include <linux/clk.h>
 #include <linux/platform_device.h>
 #include <linux/regulator/consumer.h>
+#include <linux/hdmi.h>
 
 #include "msm_drv.h"
 #include "hdmi.xml.h"
@@ -30,6 +31,12 @@
 struct hdmi_phy;
 struct hdmi_platform_config;
 
+struct hdmi_audio {
+	bool enabled;
+	struct hdmi_audio_infoframe infoframe;
+	int rate;
+};
+
 struct hdmi {
 	struct kref refcount;
 
@@ -38,6 +45,13 @@ struct hdmi {
 
 	const struct hdmi_platform_config *config;
 
+	/* audio state: */
+	struct hdmi_audio audio;
+
+	/* video state: */
+	bool power_on;
+	unsigned long int pixclock;
+
 	void __iomem *mmio;
 
 	struct regulator *hpd_regs[2];
@@ -73,6 +87,7 @@ struct hdmi_platform_config {
 
 	/* clks that need to be on for hpd: */
 	const char **hpd_clk_names;
+	const long unsigned *hpd_freq;
 	int hpd_clk_cnt;
 
 	/* clks that need to be on for screen pwr (ie pixel clk): */
@@ -132,6 +147,17 @@ struct hdmi_phy *hdmi_phy_8x60_init(struct hdmi *hdmi);
 struct hdmi_phy *hdmi_phy_8x74_init(struct hdmi *hdmi);
 
 /*
+ * audio:
+ */
+
+int hdmi_audio_update(struct hdmi *hdmi);
+int hdmi_audio_info_setup(struct hdmi *hdmi, bool enabled,
+	uint32_t num_of_channels, uint32_t channel_allocation,
+	uint32_t level_shift, bool down_mix);
+void hdmi_audio_set_sample_rate(struct hdmi *hdmi, int rate);
+
+
+/*
  * hdmi bridge:
  */
 
diff --git a/drivers/gpu/drm/msm/hdmi/hdmi_audio.c b/drivers/gpu/drm/msm/hdmi/hdmi_audio.c
new file mode 100644
index 00000000000..872485f6013
--- /dev/null
+++ b/drivers/gpu/drm/msm/hdmi/hdmi_audio.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/hdmi.h>
+#include "hdmi.h"
+
+
+/* Supported HDMI Audio channels */
+#define MSM_HDMI_AUDIO_CHANNEL_2		0
+#define MSM_HDMI_AUDIO_CHANNEL_4		1
+#define MSM_HDMI_AUDIO_CHANNEL_6		2
+#define MSM_HDMI_AUDIO_CHANNEL_8		3
+
+/* maps MSM_HDMI_AUDIO_CHANNEL_n consts used by audio driver to # of channels: */
+static int nchannels[] = { 2, 4, 6, 8 };
+
+/* Supported HDMI Audio sample rates */
+#define MSM_HDMI_SAMPLE_RATE_32KHZ		0
+#define MSM_HDMI_SAMPLE_RATE_44_1KHZ		1
+#define MSM_HDMI_SAMPLE_RATE_48KHZ		2
+#define MSM_HDMI_SAMPLE_RATE_88_2KHZ		3
+#define MSM_HDMI_SAMPLE_RATE_96KHZ		4
+#define MSM_HDMI_SAMPLE_RATE_176_4KHZ		5
+#define MSM_HDMI_SAMPLE_RATE_192KHZ		6
+#define MSM_HDMI_SAMPLE_RATE_MAX		7
+
+
+struct hdmi_msm_audio_acr {
+	uint32_t n;	/* N parameter for clock regeneration */
+	uint32_t cts;	/* CTS parameter for clock regeneration */
+};
+
+struct hdmi_msm_audio_arcs {
+	unsigned long int pixclock;
+	struct hdmi_msm_audio_acr lut[MSM_HDMI_SAMPLE_RATE_MAX];
+};
+
+#define HDMI_MSM_AUDIO_ARCS(pclk, ...) { (1000 * (pclk)), __VA_ARGS__ }
+
+/* Audio constants lookup table for hdmi_msm_audio_acr_setup */
+/* Valid Pixel-Clock rates: 25.2MHz, 27MHz, 27.03MHz, 74.25MHz, 148.5MHz */
+static const struct hdmi_msm_audio_arcs acr_lut[] = {
+	/*  25.200MHz  */
+	HDMI_MSM_AUDIO_ARCS(25200, {
+		{4096, 25200}, {6272, 28000}, {6144, 25200}, {12544, 28000},
+		{12288, 25200}, {25088, 28000}, {24576, 25200} }),
+	/*  27.000MHz  */
+	HDMI_MSM_AUDIO_ARCS(27000, {
+		{4096, 27000}, {6272, 30000}, {6144, 27000}, {12544, 30000},
+		{12288, 27000}, {25088, 30000}, {24576, 27000} }),
+	/*  27.027MHz */
+	HDMI_MSM_AUDIO_ARCS(27030, {
+		{4096, 27027}, {6272, 30030}, {6144, 27027}, {12544, 30030},
+		{12288, 27027}, {25088, 30030}, {24576, 27027} }),
+	/*  74.250MHz */
+	HDMI_MSM_AUDIO_ARCS(74250, {
+		{4096, 74250}, {6272, 82500}, {6144, 74250}, {12544, 82500},
+		{12288, 74250}, {25088, 82500}, {24576, 74250} }),
+	/* 148.500MHz */
+	HDMI_MSM_AUDIO_ARCS(148500, {
+		{4096, 148500}, {6272, 165000}, {6144, 148500}, {12544, 165000},
+		{12288, 148500}, {25088, 165000}, {24576, 148500} }),
+};
+
+static const struct hdmi_msm_audio_arcs *get_arcs(unsigned long int pixclock)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(acr_lut); i++) {
+		const struct hdmi_msm_audio_arcs *arcs = &acr_lut[i];
+		if (arcs->pixclock == pixclock)
+			return arcs;
+	}
+
+	return NULL;
+}
+
+int hdmi_audio_update(struct hdmi *hdmi)
+{
+	struct hdmi_audio *audio = &hdmi->audio;
+	struct hdmi_audio_infoframe *info = &audio->infoframe;
+	const struct hdmi_msm_audio_arcs *arcs = NULL;
+	bool enabled = audio->enabled;
+	uint32_t acr_pkt_ctrl, vbi_pkt_ctrl, aud_pkt_ctrl;
+	uint32_t infofrm_ctrl, audio_config;
+
+	DBG("audio: enabled=%d, channels=%d, channel_allocation=0x%x, "
+		"level_shift_value=%d, downmix_inhibit=%d, rate=%d",
+		audio->enabled, info->channels,  info->channel_allocation,
+		info->level_shift_value, info->downmix_inhibit, audio->rate);
+	DBG("video: power_on=%d, pixclock=%lu", hdmi->power_on, hdmi->pixclock);
+
+	if (enabled && !(hdmi->power_on && hdmi->pixclock)) {
+		DBG("disabling audio: no video");
+		enabled = false;
+	}
+
+	if (enabled) {
+		arcs = get_arcs(hdmi->pixclock);
+		if (!arcs) {
+			DBG("disabling audio: unsupported pixclock: %lu",
+					hdmi->pixclock);
+			enabled = false;
+		}
+	}
+
+	/* Read first before writing */
+	acr_pkt_ctrl = hdmi_read(hdmi, REG_HDMI_ACR_PKT_CTRL);
+	vbi_pkt_ctrl = hdmi_read(hdmi, REG_HDMI_VBI_PKT_CTRL);
+	aud_pkt_ctrl = hdmi_read(hdmi, REG_HDMI_AUDIO_PKT_CTRL1);
+	infofrm_ctrl = hdmi_read(hdmi, REG_HDMI_INFOFRAME_CTRL0);
+	audio_config = hdmi_read(hdmi, REG_HDMI_AUDIO_CFG);
+
+	/* Clear N/CTS selection bits */
+	acr_pkt_ctrl &= ~HDMI_ACR_PKT_CTRL_SELECT__MASK;
+
+	if (enabled) {
+		uint32_t n, cts, multiplier;
+		enum hdmi_acr_cts select;
+		uint8_t buf[14];
+
+		n   = arcs->lut[audio->rate].n;
+		cts = arcs->lut[audio->rate].cts;
+
+		if ((MSM_HDMI_SAMPLE_RATE_192KHZ == audio->rate) ||
+				(MSM_HDMI_SAMPLE_RATE_176_4KHZ == audio->rate)) {
+			multiplier = 4;
+			n >>= 2; /* divide N by 4 and use multiplier */
+		} else if ((MSM_HDMI_SAMPLE_RATE_96KHZ == audio->rate) ||
+				(MSM_HDMI_SAMPLE_RATE_88_2KHZ == audio->rate)) {
+			multiplier = 2;
+			n >>= 1; /* divide N by 2 and use multiplier */
+		} else {
+			multiplier = 1;
+		}
+
+		DBG("n=%u, cts=%u, multiplier=%u", n, cts, multiplier);
+
+		acr_pkt_ctrl |= HDMI_ACR_PKT_CTRL_SOURCE;
+		acr_pkt_ctrl |= HDMI_ACR_PKT_CTRL_AUDIO_PRIORITY;
+		acr_pkt_ctrl |= HDMI_ACR_PKT_CTRL_N_MULTIPLIER(multiplier);
+
+		if ((MSM_HDMI_SAMPLE_RATE_48KHZ == audio->rate) ||
+				(MSM_HDMI_SAMPLE_RATE_96KHZ == audio->rate) ||
+				(MSM_HDMI_SAMPLE_RATE_192KHZ == audio->rate))
+			select = ACR_48;
+		else if ((MSM_HDMI_SAMPLE_RATE_44_1KHZ == audio->rate) ||
+				(MSM_HDMI_SAMPLE_RATE_88_2KHZ == audio->rate) ||
+				(MSM_HDMI_SAMPLE_RATE_176_4KHZ == audio->rate))
+			select = ACR_44;
+		else /* default to 32k */
+			select = ACR_32;
+
+		acr_pkt_ctrl |= HDMI_ACR_PKT_CTRL_SELECT(select);
+
+		hdmi_write(hdmi, REG_HDMI_ACR_0(select - 1),
+				HDMI_ACR_0_CTS(cts));
+		hdmi_write(hdmi, REG_HDMI_ACR_1(select - 1),
+				HDMI_ACR_1_N(n));
+
+		hdmi_write(hdmi, REG_HDMI_AUDIO_PKT_CTRL2,
+				COND(info->channels != 2, HDMI_AUDIO_PKT_CTRL2_LAYOUT) |
+				HDMI_AUDIO_PKT_CTRL2_OVERRIDE);
+
+		acr_pkt_ctrl |= HDMI_ACR_PKT_CTRL_CONT;
+		acr_pkt_ctrl |= HDMI_ACR_PKT_CTRL_SEND;
+
+		/* configure infoframe: */
+		hdmi_audio_infoframe_pack(info, buf, sizeof(buf));
+		hdmi_write(hdmi, REG_HDMI_AUDIO_INFO0,
+				(buf[3] <<  0) || (buf[4] <<  8) ||
+				(buf[5] << 16) || (buf[6] << 24));
+		hdmi_write(hdmi, REG_HDMI_AUDIO_INFO1,
+				(buf[7] <<  0) || (buf[8] << 8));
+
+		hdmi_write(hdmi, REG_HDMI_GC, 0);
+
+		vbi_pkt_ctrl |= HDMI_VBI_PKT_CTRL_GC_ENABLE;
+		vbi_pkt_ctrl |= HDMI_VBI_PKT_CTRL_GC_EVERY_FRAME;
+
+		aud_pkt_ctrl |= HDMI_AUDIO_PKT_CTRL1_AUDIO_SAMPLE_SEND;
+
+		infofrm_ctrl |= HDMI_INFOFRAME_CTRL0_AUDIO_INFO_SEND;
+		infofrm_ctrl |= HDMI_INFOFRAME_CTRL0_AUDIO_INFO_CONT;
+		infofrm_ctrl |= HDMI_INFOFRAME_CTRL0_AUDIO_INFO_SOURCE;
+		infofrm_ctrl |= HDMI_INFOFRAME_CTRL0_AUDIO_INFO_UPDATE;
+
+		audio_config &= ~HDMI_AUDIO_CFG_FIFO_WATERMARK__MASK;
+		audio_config |= HDMI_AUDIO_CFG_FIFO_WATERMARK(4);
+		audio_config |= HDMI_AUDIO_CFG_ENGINE_ENABLE;
+	} else {
+		hdmi_write(hdmi, REG_HDMI_GC, HDMI_GC_MUTE);
+		acr_pkt_ctrl &= ~HDMI_ACR_PKT_CTRL_CONT;
+		acr_pkt_ctrl &= ~HDMI_ACR_PKT_CTRL_SEND;
+		vbi_pkt_ctrl &= ~HDMI_VBI_PKT_CTRL_GC_ENABLE;
+		vbi_pkt_ctrl &= ~HDMI_VBI_PKT_CTRL_GC_EVERY_FRAME;
+		aud_pkt_ctrl &= ~HDMI_AUDIO_PKT_CTRL1_AUDIO_SAMPLE_SEND;
+		infofrm_ctrl &= ~HDMI_INFOFRAME_CTRL0_AUDIO_INFO_SEND;
+		infofrm_ctrl &= ~HDMI_INFOFRAME_CTRL0_AUDIO_INFO_CONT;
+		infofrm_ctrl &= ~HDMI_INFOFRAME_CTRL0_AUDIO_INFO_SOURCE;
+		infofrm_ctrl &= ~HDMI_INFOFRAME_CTRL0_AUDIO_INFO_UPDATE;
+		audio_config &= ~HDMI_AUDIO_CFG_ENGINE_ENABLE;
+	}
+
+	hdmi_write(hdmi, REG_HDMI_ACR_PKT_CTRL, acr_pkt_ctrl);
+	hdmi_write(hdmi, REG_HDMI_VBI_PKT_CTRL, vbi_pkt_ctrl);
+	hdmi_write(hdmi, REG_HDMI_AUDIO_PKT_CTRL1, aud_pkt_ctrl);
+	hdmi_write(hdmi, REG_HDMI_INFOFRAME_CTRL0, infofrm_ctrl);
+
+	hdmi_write(hdmi, REG_HDMI_AUD_INT,
+			COND(enabled, HDMI_AUD_INT_AUD_FIFO_URUN_INT) |
+			COND(enabled, HDMI_AUD_INT_AUD_SAM_DROP_INT));
+
+	hdmi_write(hdmi, REG_HDMI_AUDIO_CFG, audio_config);
+
+
+	DBG("audio %sabled", enabled ? "en" : "dis");
+
+	return 0;
+}
+
+int hdmi_audio_info_setup(struct hdmi *hdmi, bool enabled,
+	uint32_t num_of_channels, uint32_t channel_allocation,
+	uint32_t level_shift, bool down_mix)
+{
+	struct hdmi_audio *audio;
+
+	if (!hdmi)
+		return -ENXIO;
+
+	audio = &hdmi->audio;
+
+	if (num_of_channels >= ARRAY_SIZE(nchannels))
+		return -EINVAL;
+
+	audio->enabled = enabled;
+	audio->infoframe.channels = nchannels[num_of_channels];
+	audio->infoframe.channel_allocation = channel_allocation;
+	audio->infoframe.level_shift_value = level_shift;
+	audio->infoframe.downmix_inhibit = down_mix;
+
+	return hdmi_audio_update(hdmi);
+}
+
+void hdmi_audio_set_sample_rate(struct hdmi *hdmi, int rate)
+{
+	struct hdmi_audio *audio;
+
+	if (!hdmi)
+		return;
+
+	audio = &hdmi->audio;
+
+	if ((rate < 0) || (rate >= MSM_HDMI_SAMPLE_RATE_MAX))
+		return;
+
+	audio->rate = rate;
+	hdmi_audio_update(hdmi);
+}
diff --git a/drivers/gpu/drm/msm/hdmi/hdmi_bridge.c b/drivers/gpu/drm/msm/hdmi/hdmi_bridge.c
index 7d10e55403c..f6cf745c249 100644
--- a/drivers/gpu/drm/msm/hdmi/hdmi_bridge.c
+++ b/drivers/gpu/drm/msm/hdmi/hdmi_bridge.c
@@ -19,11 +19,7 @@
 
 struct hdmi_bridge {
 	struct drm_bridge base;
-
 	struct hdmi *hdmi;
-	bool power_on;
-
-	unsigned long int pixclock;
 };
 #define to_hdmi_bridge(x) container_of(x, struct hdmi_bridge, base)
 
@@ -52,8 +48,8 @@ static void power_on(struct drm_bridge *bridge)
 	}
 
 	if (config->pwr_clk_cnt > 0) {
-		DBG("pixclock: %lu", hdmi_bridge->pixclock);
-		ret = clk_set_rate(hdmi->pwr_clks[0], hdmi_bridge->pixclock);
+		DBG("pixclock: %lu", hdmi->pixclock);
+		ret = clk_set_rate(hdmi->pwr_clks[0], hdmi->pixclock);
 		if (ret) {
 			dev_err(dev->dev, "failed to set pixel clk: %s (%d)\n",
 					config->pwr_clk_names[0], ret);
@@ -102,12 +98,13 @@ static void hdmi_bridge_pre_enable(struct drm_bridge *bridge)
 
 	DBG("power up");
 
-	if (!hdmi_bridge->power_on) {
+	if (!hdmi->power_on) {
 		power_on(bridge);
-		hdmi_bridge->power_on = true;
+		hdmi->power_on = true;
+		hdmi_audio_update(hdmi);
 	}
 
-	phy->funcs->powerup(phy, hdmi_bridge->pixclock);
+	phy->funcs->powerup(phy, hdmi->pixclock);
 	hdmi_set_mode(hdmi, true);
 }
 
@@ -129,9 +126,10 @@ static void hdmi_bridge_post_disable(struct drm_bridge *bridge)
 	hdmi_set_mode(hdmi, false);
 	phy->funcs->powerdown(phy);
 
-	if (hdmi_bridge->power_on) {
+	if (hdmi->power_on) {
 		power_off(bridge);
-		hdmi_bridge->power_on = false;
+		hdmi->power_on = false;
+		hdmi_audio_update(hdmi);
 	}
 }
 
@@ -146,7 +144,7 @@ static void hdmi_bridge_mode_set(struct drm_bridge *bridge,
 
 	mode = adjusted_mode;
 
-	hdmi_bridge->pixclock = mode->clock * 1000;
+	hdmi->pixclock = mode->clock * 1000;
 
 	hdmi->hdmi_mode = drm_match_cea_mode(mode) > 1;
 
@@ -194,9 +192,7 @@ static void hdmi_bridge_mode_set(struct drm_bridge *bridge,
 	DBG("frame_ctrl=%08x", frame_ctrl);
 	hdmi_write(hdmi, REG_HDMI_FRAME_CTRL, frame_ctrl);
 
-	// TODO until we have audio, this might be safest:
-	if (hdmi->hdmi_mode)
-		hdmi_write(hdmi, REG_HDMI_GC, HDMI_GC_MUTE);
+	hdmi_audio_update(hdmi);
 }
 
 static const struct drm_bridge_funcs hdmi_bridge_funcs = {
diff --git a/drivers/gpu/drm/msm/hdmi/hdmi_connector.c b/drivers/gpu/drm/msm/hdmi/hdmi_connector.c
index 7dedfdd1207..28f7e3ec6c2 100644
--- a/drivers/gpu/drm/msm/hdmi/hdmi_connector.c
+++ b/drivers/gpu/drm/msm/hdmi/hdmi_connector.c
@@ -127,6 +127,14 @@ static int hpd_enable(struct hdmi_connector *hdmi_connector)
 	}
 
 	for (i = 0; i < config->hpd_clk_cnt; i++) {
+		if (config->hpd_freq && config->hpd_freq[i]) {
+			ret = clk_set_rate(hdmi->hpd_clks[i],
+					config->hpd_freq[i]);
+			if (ret)
+				dev_warn(dev->dev, "failed to set clk %s (%d)\n",
+						config->hpd_clk_names[i], ret);
+		}
+
 		ret = clk_prepare_enable(hdmi->hpd_clks[i]);
 		if (ret) {
 			dev_err(dev->dev, "failed to enable hpd clk: %s (%d)\n",
@@ -247,36 +255,49 @@ void hdmi_connector_irq(struct drm_connector *connector)
 	}
 }
 
+static enum drm_connector_status detect_reg(struct hdmi *hdmi)
+{
+	uint32_t hpd_int_status = hdmi_read(hdmi, REG_HDMI_HPD_INT_STATUS);
+	return (hpd_int_status & HDMI_HPD_INT_STATUS_CABLE_DETECTED) ?
+			connector_status_connected : connector_status_disconnected;
+}
+
+static enum drm_connector_status detect_gpio(struct hdmi *hdmi)
+{
+	const struct hdmi_platform_config *config = hdmi->config;
+	return gpio_get_value(config->hpd_gpio) ?
+			connector_status_connected :
+			connector_status_disconnected;
+}
+
 static enum drm_connector_status hdmi_connector_detect(
 		struct drm_connector *connector, bool force)
 {
 	struct hdmi_connector *hdmi_connector = to_hdmi_connector(connector);
 	struct hdmi *hdmi = hdmi_connector->hdmi;
-	const struct hdmi_platform_config *config = hdmi->config;
-	uint32_t hpd_int_status;
+	enum drm_connector_status stat_gpio, stat_reg;
 	int retry = 20;
 
-	hpd_int_status = hdmi_read(hdmi, REG_HDMI_HPD_INT_STATUS);
+	do {
+		stat_gpio = detect_gpio(hdmi);
+		stat_reg  = detect_reg(hdmi);
 
-	/* sense seems to in some cases be momentarily de-asserted, don't
-	 * let that trick us into thinking the monitor is gone:
-	 */
-	while (retry-- && !(hpd_int_status & HDMI_HPD_INT_STATUS_CABLE_DETECTED)) {
-		/* hdmi debounce logic seems to get stuck sometimes,
-		 * read directly the gpio to get a second opinion:
-		 */
-		if (gpio_get_value(config->hpd_gpio)) {
-			DBG("gpio tells us we are connected!");
-			hpd_int_status |= HDMI_HPD_INT_STATUS_CABLE_DETECTED;
+		if (stat_gpio == stat_reg)
 			break;
-		}
+
 		mdelay(10);
-		hpd_int_status = hdmi_read(hdmi, REG_HDMI_HPD_INT_STATUS);
-		DBG("status=%08x", hpd_int_status);
+	} while (--retry);
+
+	/* the status we get from reading gpio seems to be more reliable,
+	 * so trust that one the most if we didn't manage to get hdmi and
+	 * gpio status to agree:
+	 */
+	if (stat_gpio != stat_reg) {
+		DBG("HDMI_HPD_INT_STATUS tells us: %d", stat_reg);
+		DBG("hpd gpio tells us: %d", stat_gpio);
 	}
 
-	return (hpd_int_status & HDMI_HPD_INT_STATUS_CABLE_DETECTED) ?
-			connector_status_connected : connector_status_disconnected;
+	return stat_gpio;
 }
 
 static void hdmi_connector_destroy(struct drm_connector *connector)
@@ -389,7 +410,8 @@ struct drm_connector *hdmi_connector_init(struct hdmi *hdmi)
 			DRM_MODE_CONNECTOR_HDMIA);
 	drm_connector_helper_add(connector, &hdmi_connector_helper_funcs);
 
-	connector->polled = DRM_CONNECTOR_POLL_HPD;
+	connector->polled = DRM_CONNECTOR_POLL_CONNECT |
+			DRM_CONNECTOR_POLL_DISCONNECT;
 
 	connector->interlace_allowed = 1;
 	connector->doublescan_allowed = 0;
diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
index 1964f4f0d45..74cebb51e8c 100644
--- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
+++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
@@ -39,6 +39,7 @@ struct mdp4_crtc {
 		spinlock_t lock;
 		bool stale;
 		uint32_t width, height;
+		uint32_t x, y;
 
 		/* next cursor to scan-out: */
 		uint32_t next_iova;
@@ -57,9 +58,16 @@ struct mdp4_crtc {
 #define PENDING_FLIP   0x2
 	atomic_t pending;
 
-	/* the fb that we currently hold a scanout ref to: */
+	/* the fb that we logically (from PoV of KMS API) hold a ref
+	 * to.  Which we may not yet be scanning out (we may still
+	 * be scanning out previous in case of page_flip while waiting
+	 * for gpu rendering to complete:
+	 */
 	struct drm_framebuffer *fb;
 
+	/* the fb that we currently hold a scanout ref to: */
+	struct drm_framebuffer *scanout_fb;
+
 	/* for unref'ing framebuffers after scanout completes: */
 	struct drm_flip_work unref_fb_work;
 
@@ -77,24 +85,73 @@ static struct mdp4_kms *get_kms(struct drm_crtc *crtc)
 	return to_mdp4_kms(to_mdp_kms(priv->kms));
 }
 
-static void update_fb(struct drm_crtc *crtc, bool async,
-		struct drm_framebuffer *new_fb)
+static void request_pending(struct drm_crtc *crtc, uint32_t pending)
 {
 	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
-	struct drm_framebuffer *old_fb = mdp4_crtc->fb;
 
-	if (old_fb)
-		drm_flip_work_queue(&mdp4_crtc->unref_fb_work, old_fb);
+	atomic_or(pending, &mdp4_crtc->pending);
+	mdp_irq_register(&get_kms(crtc)->base, &mdp4_crtc->vblank);
+}
+
+static void crtc_flush(struct drm_crtc *crtc)
+{
+	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
+	struct mdp4_kms *mdp4_kms = get_kms(crtc);
+	uint32_t i, flush = 0;
+
+	for (i = 0; i < ARRAY_SIZE(mdp4_crtc->planes); i++) {
+		struct drm_plane *plane = mdp4_crtc->planes[i];
+		if (plane) {
+			enum mdp4_pipe pipe_id = mdp4_plane_pipe(plane);
+			flush |= pipe2flush(pipe_id);
+		}
+	}
+	flush |= ovlp2flush(mdp4_crtc->ovlp);
+
+	DBG("%s: flush=%08x", mdp4_crtc->name, flush);
+
+	mdp4_write(mdp4_kms, REG_MDP4_OVERLAY_FLUSH, flush);
+}
+
+static void update_fb(struct drm_crtc *crtc, struct drm_framebuffer *new_fb)
+{
+	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
+	struct drm_framebuffer *old_fb = mdp4_crtc->fb;
 
 	/* grab reference to incoming scanout fb: */
 	drm_framebuffer_reference(new_fb);
-	mdp4_crtc->base.fb = new_fb;
+	mdp4_crtc->base.primary->fb = new_fb;
 	mdp4_crtc->fb = new_fb;
 
-	if (!async) {
-		/* enable vblank to pick up the old_fb */
-		mdp_irq_register(&get_kms(crtc)->base, &mdp4_crtc->vblank);
-	}
+	if (old_fb)
+		drm_flip_work_queue(&mdp4_crtc->unref_fb_work, old_fb);
+}
+
+/* unlike update_fb(), take a ref to the new scanout fb *before* updating
+ * plane, then call this.  Needed to ensure we don't unref the buffer that
+ * is actually still being scanned out.
+ *
+ * Note that this whole thing goes away with atomic.. since we can defer
+ * calling into driver until rendering is done.
+ */
+static void update_scanout(struct drm_crtc *crtc, struct drm_framebuffer *fb)
+{
+	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
+
+	/* flush updates, to make sure hw is updated to new scanout fb,
+	 * so that we can safely queue unref to current fb (ie. next
+	 * vblank we know hw is done w/ previous scanout_fb).
+	 */
+	crtc_flush(crtc);
+
+	if (mdp4_crtc->scanout_fb)
+		drm_flip_work_queue(&mdp4_crtc->unref_fb_work,
+				mdp4_crtc->scanout_fb);
+
+	mdp4_crtc->scanout_fb = fb;
+
+	/* enable vblank to complete flip: */
+	request_pending(crtc, PENDING_FLIP);
 }
 
 /* if file!=NULL, this is preclose potential cancel-flip path */
@@ -120,49 +177,19 @@ static void complete_flip(struct drm_crtc *crtc, struct drm_file *file)
 	spin_unlock_irqrestore(&dev->event_lock, flags);
 }
 
-static void crtc_flush(struct drm_crtc *crtc)
-{
-	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
-	struct mdp4_kms *mdp4_kms = get_kms(crtc);
-	uint32_t i, flush = 0;
-
-	for (i = 0; i < ARRAY_SIZE(mdp4_crtc->planes); i++) {
-		struct drm_plane *plane = mdp4_crtc->planes[i];
-		if (plane) {
-			enum mdp4_pipe pipe_id = mdp4_plane_pipe(plane);
-			flush |= pipe2flush(pipe_id);
-		}
-	}
-	flush |= ovlp2flush(mdp4_crtc->ovlp);
-
-	DBG("%s: flush=%08x", mdp4_crtc->name, flush);
-
-	mdp4_write(mdp4_kms, REG_MDP4_OVERLAY_FLUSH, flush);
-}
-
-static void request_pending(struct drm_crtc *crtc, uint32_t pending)
-{
-	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
-
-	atomic_or(pending, &mdp4_crtc->pending);
-	mdp_irq_register(&get_kms(crtc)->base, &mdp4_crtc->vblank);
-}
-
 static void pageflip_cb(struct msm_fence_cb *cb)
 {
 	struct mdp4_crtc *mdp4_crtc =
 		container_of(cb, struct mdp4_crtc, pageflip_cb);
 	struct drm_crtc *crtc = &mdp4_crtc->base;
-	struct drm_framebuffer *fb = crtc->fb;
+	struct drm_framebuffer *fb = crtc->primary->fb;
 
 	if (!fb)
 		return;
 
+	drm_framebuffer_reference(fb);
 	mdp4_plane_set_scanout(mdp4_crtc->plane, fb);
-	crtc_flush(crtc);
-
-	/* enable vblank to complete flip: */
-	request_pending(crtc, PENDING_FLIP);
+	update_scanout(crtc, fb);
 }
 
 static void unref_fb_worker(struct drm_flip_work *work, void *val)
@@ -190,8 +217,6 @@ static void mdp4_crtc_destroy(struct drm_crtc *crtc)
 {
 	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
 
-	mdp4_crtc->plane->funcs->destroy(mdp4_crtc->plane);
-
 	drm_crtc_cleanup(crtc);
 	drm_flip_work_cleanup(&mdp4_crtc->unref_fb_work);
 	drm_flip_work_cleanup(&mdp4_crtc->unref_cursor_work);
@@ -320,6 +345,20 @@ static int mdp4_crtc_mode_set(struct drm_crtc *crtc,
 			mode->vsync_end, mode->vtotal,
 			mode->type, mode->flags);
 
+	/* grab extra ref for update_scanout() */
+	drm_framebuffer_reference(crtc->primary->fb);
+
+	ret = mdp4_plane_mode_set(mdp4_crtc->plane, crtc, crtc->primary->fb,
+			0, 0, mode->hdisplay, mode->vdisplay,
+			x << 16, y << 16,
+			mode->hdisplay << 16, mode->vdisplay << 16);
+	if (ret) {
+		drm_framebuffer_unreference(crtc->primary->fb);
+		dev_err(crtc->dev->dev, "%s: failed to set mode on plane: %d\n",
+				mdp4_crtc->name, ret);
+		return ret;
+	}
+
 	mdp4_write(mdp4_kms, REG_MDP4_DMA_SRC_SIZE(dma),
 			MDP4_DMA_SRC_SIZE_WIDTH(mode->hdisplay) |
 			MDP4_DMA_SRC_SIZE_HEIGHT(mode->vdisplay));
@@ -327,7 +366,7 @@ static int mdp4_crtc_mode_set(struct drm_crtc *crtc,
 	/* take data from pipe: */
 	mdp4_write(mdp4_kms, REG_MDP4_DMA_SRC_BASE(dma), 0);
 	mdp4_write(mdp4_kms, REG_MDP4_DMA_SRC_STRIDE(dma),
-			crtc->fb->pitches[0]);
+			crtc->primary->fb->pitches[0]);
 	mdp4_write(mdp4_kms, REG_MDP4_DMA_DST_SIZE(dma),
 			MDP4_DMA_DST_SIZE_WIDTH(0) |
 			MDP4_DMA_DST_SIZE_HEIGHT(0));
@@ -337,28 +376,19 @@ static int mdp4_crtc_mode_set(struct drm_crtc *crtc,
 			MDP4_OVLP_SIZE_WIDTH(mode->hdisplay) |
 			MDP4_OVLP_SIZE_HEIGHT(mode->vdisplay));
 	mdp4_write(mdp4_kms, REG_MDP4_OVLP_STRIDE(ovlp),
-			crtc->fb->pitches[0]);
+			crtc->primary->fb->pitches[0]);
 
 	mdp4_write(mdp4_kms, REG_MDP4_OVLP_CFG(ovlp), 1);
 
-	update_fb(crtc, false, crtc->fb);
-
-	ret = mdp4_plane_mode_set(mdp4_crtc->plane, crtc, crtc->fb,
-			0, 0, mode->hdisplay, mode->vdisplay,
-			x << 16, y << 16,
-			mode->hdisplay << 16, mode->vdisplay << 16);
-	if (ret) {
-		dev_err(crtc->dev->dev, "%s: failed to set mode on plane: %d\n",
-				mdp4_crtc->name, ret);
-		return ret;
-	}
-
 	if (dma == DMA_E) {
 		mdp4_write(mdp4_kms, REG_MDP4_DMA_E_QUANT(0), 0x00ff0000);
 		mdp4_write(mdp4_kms, REG_MDP4_DMA_E_QUANT(1), 0x00ff0000);
 		mdp4_write(mdp4_kms, REG_MDP4_DMA_E_QUANT(2), 0x00ff0000);
 	}
 
+	update_fb(crtc, crtc->primary->fb);
+	update_scanout(crtc, crtc->primary->fb);
+
 	return 0;
 }
 
@@ -385,13 +415,24 @@ static int mdp4_crtc_mode_set_base(struct drm_crtc *crtc, int x, int y,
 	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
 	struct drm_plane *plane = mdp4_crtc->plane;
 	struct drm_display_mode *mode = &crtc->mode;
+	int ret;
 
-	update_fb(crtc, false, crtc->fb);
+	/* grab extra ref for update_scanout() */
+	drm_framebuffer_reference(crtc->primary->fb);
 
-	return mdp4_plane_mode_set(plane, crtc, crtc->fb,
+	ret = mdp4_plane_mode_set(plane, crtc, crtc->primary->fb,
 			0, 0, mode->hdisplay, mode->vdisplay,
 			x << 16, y << 16,
 			mode->hdisplay << 16, mode->vdisplay << 16);
+	if (ret) {
+		drm_framebuffer_unreference(crtc->primary->fb);
+		return ret;
+	}
+
+	update_fb(crtc, crtc->primary->fb);
+	update_scanout(crtc, crtc->primary->fb);
+
+	return 0;
 }
 
 static void mdp4_crtc_load_lut(struct drm_crtc *crtc)
@@ -419,7 +460,7 @@ static int mdp4_crtc_page_flip(struct drm_crtc *crtc,
 	mdp4_crtc->event = event;
 	spin_unlock_irqrestore(&dev->event_lock, flags);
 
-	update_fb(crtc, true, new_fb);
+	update_fb(crtc, new_fb);
 
 	return msm_gem_queue_inactive_cb(obj, &mdp4_crtc->pageflip_cb);
 }
@@ -442,12 +483,12 @@ static int mdp4_crtc_set_property(struct drm_crtc *crtc,
 static void update_cursor(struct drm_crtc *crtc)
 {
 	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
+	struct mdp4_kms *mdp4_kms = get_kms(crtc);
 	enum mdp4_dma dma = mdp4_crtc->dma;
 	unsigned long flags;
 
 	spin_lock_irqsave(&mdp4_crtc->cursor.lock, flags);
 	if (mdp4_crtc->cursor.stale) {
-		struct mdp4_kms *mdp4_kms = get_kms(crtc);
 		struct drm_gem_object *next_bo = mdp4_crtc->cursor.next_bo;
 		struct drm_gem_object *prev_bo = mdp4_crtc->cursor.scanout_bo;
 		uint32_t iova = mdp4_crtc->cursor.next_iova;
@@ -467,9 +508,8 @@ static void update_cursor(struct drm_crtc *crtc)
 					MDP4_DMA_CURSOR_BLEND_CONFIG_CURSOR_EN);
 		} else {
 			/* disable cursor: */
-			mdp4_write(mdp4_kms, REG_MDP4_DMA_CURSOR_BASE(dma), 0);
-			mdp4_write(mdp4_kms, REG_MDP4_DMA_CURSOR_BLEND_CONFIG(dma),
-					MDP4_DMA_CURSOR_BLEND_CONFIG_FORMAT(CURSOR_ARGB));
+			mdp4_write(mdp4_kms, REG_MDP4_DMA_CURSOR_BASE(dma),
+					mdp4_kms->blank_cursor_iova);
 		}
 
 		/* and drop the iova ref + obj rev when done scanning out: */
@@ -479,6 +519,11 @@ static void update_cursor(struct drm_crtc *crtc)
 		mdp4_crtc->cursor.scanout_bo = next_bo;
 		mdp4_crtc->cursor.stale = false;
 	}
+
+	mdp4_write(mdp4_kms, REG_MDP4_DMA_CURSOR_POS(dma),
+			MDP4_DMA_CURSOR_POS_X(mdp4_crtc->cursor.x) |
+			MDP4_DMA_CURSOR_POS_Y(mdp4_crtc->cursor.y));
+
 	spin_unlock_irqrestore(&mdp4_crtc->cursor.lock, flags);
 }
 
@@ -526,8 +571,7 @@ static int mdp4_crtc_cursor_set(struct drm_crtc *crtc,
 
 	if (old_bo) {
 		/* drop our previous reference: */
-		msm_gem_put_iova(old_bo, mdp4_kms->id);
-		drm_gem_object_unreference_unlocked(old_bo);
+		drm_flip_work_queue(&mdp4_crtc->unref_cursor_work, old_bo);
 	}
 
 	request_pending(crtc, PENDING_CURSOR);
@@ -542,12 +586,15 @@ fail:
 static int mdp4_crtc_cursor_move(struct drm_crtc *crtc, int x, int y)
 {
 	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
-	struct mdp4_kms *mdp4_kms = get_kms(crtc);
-	enum mdp4_dma dma = mdp4_crtc->dma;
+	unsigned long flags;
 
-	mdp4_write(mdp4_kms, REG_MDP4_DMA_CURSOR_POS(dma),
-			MDP4_DMA_CURSOR_POS_X(x) |
-			MDP4_DMA_CURSOR_POS_Y(y));
+	spin_lock_irqsave(&mdp4_crtc->cursor.lock, flags);
+	mdp4_crtc->cursor.x = x;
+	mdp4_crtc->cursor.y = y;
+	spin_unlock_irqrestore(&mdp4_crtc->cursor.lock, flags);
+
+	crtc_flush(crtc);
+	request_pending(crtc, PENDING_CURSOR);
 
 	return 0;
 }
@@ -688,6 +735,9 @@ void mdp4_crtc_attach(struct drm_crtc *crtc, struct drm_plane *plane)
 
 void mdp4_crtc_detach(struct drm_crtc *crtc, struct drm_plane *plane)
 {
+	/* don't actually detatch our primary plane: */
+	if (to_mdp4_crtc(crtc)->plane == plane)
+		return;
 	set_attach(crtc, mdp4_plane_pipe(plane), NULL);
 }
 
@@ -713,6 +763,7 @@ struct drm_crtc *mdp4_crtc_init(struct drm_device *dev,
 	crtc = &mdp4_crtc->base;
 
 	mdp4_crtc->plane = plane;
+	mdp4_crtc->id = id;
 
 	mdp4_crtc->ovlp = ovlp_id;
 	mdp4_crtc->dma = dma_id;
@@ -738,7 +789,7 @@ struct drm_crtc *mdp4_crtc_init(struct drm_device *dev,
 
 	INIT_FENCE_CB(&mdp4_crtc->pageflip_cb, pageflip_cb);
 
-	drm_crtc_init(dev, crtc, &mdp4_crtc_funcs);
+	drm_crtc_init_with_planes(dev, crtc, plane, NULL, &mdp4_crtc_funcs);
 	drm_crtc_helper_add(crtc, &mdp4_crtc_helper_funcs);
 
 	mdp4_plane_install_properties(mdp4_crtc->plane, &crtc->base);
diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_irq.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_irq.c
index c740ccd1cc6..8edd531cb62 100644
--- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_irq.c
+++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_irq.c
@@ -70,12 +70,12 @@ irqreturn_t mdp4_irq(struct msm_kms *kms)
 
 	VERB("status=%08x", status);
 
+	mdp_dispatch_irqs(mdp_kms, status);
+
 	for (id = 0; id < priv->num_crtcs; id++)
 		if (status & mdp4_crtc_vblank(priv->crtcs[id]))
 			drm_handle_vblank(dev, id);
 
-	mdp_dispatch_irqs(mdp_kms, status);
-
 	return IRQ_HANDLED;
 }
 
diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
index 272e707c948..0bb4faa1752 100644
--- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
+++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
@@ -144,6 +144,10 @@ static void mdp4_preclose(struct msm_kms *kms, struct drm_file *file)
 static void mdp4_destroy(struct msm_kms *kms)
 {
 	struct mdp4_kms *mdp4_kms = to_mdp4_kms(to_mdp_kms(kms));
+	if (mdp4_kms->blank_cursor_iova)
+		msm_gem_put_iova(mdp4_kms->blank_cursor_bo, mdp4_kms->id);
+	if (mdp4_kms->blank_cursor_bo)
+		drm_gem_object_unreference(mdp4_kms->blank_cursor_bo);
 	kfree(mdp4_kms);
 }
 
@@ -372,6 +376,23 @@ struct msm_kms *mdp4_kms_init(struct drm_device *dev)
 		goto fail;
 	}
 
+	mutex_lock(&dev->struct_mutex);
+	mdp4_kms->blank_cursor_bo = msm_gem_new(dev, SZ_16K, MSM_BO_WC);
+	mutex_unlock(&dev->struct_mutex);
+	if (IS_ERR(mdp4_kms->blank_cursor_bo)) {
+		ret = PTR_ERR(mdp4_kms->blank_cursor_bo);
+		dev_err(dev->dev, "could not allocate blank-cursor bo: %d\n", ret);
+		mdp4_kms->blank_cursor_bo = NULL;
+		goto fail;
+	}
+
+	ret = msm_gem_get_iova(mdp4_kms->blank_cursor_bo, mdp4_kms->id,
+			&mdp4_kms->blank_cursor_iova);
+	if (ret) {
+		dev_err(dev->dev, "could not pin blank-cursor bo: %d\n", ret);
+		goto fail;
+	}
+
 	return kms;
 
 fail:
diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
index 66a4d31aec8..715520c54cd 100644
--- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
+++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
@@ -44,6 +44,10 @@ struct mdp4_kms {
 	struct clk *lut_clk;
 
 	struct mdp_irq error_handler;
+
+	/* empty/blank cursor bo to use when cursor is "disabled" */
+	struct drm_gem_object *blank_cursor_bo;
+	uint32_t blank_cursor_iova;
 };
 #define to_mdp4_kms(x) container_of(x, struct mdp4_kms, base)
 
diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
index 2406027200e..66f33dba1eb 100644
--- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
+++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
@@ -170,8 +170,8 @@ int mdp4_plane_mode_set(struct drm_plane *plane,
 			MDP4_PIPE_DST_SIZE_HEIGHT(crtc_h));
 
 	mdp4_write(mdp4_kms, REG_MDP4_PIPE_DST_XY(pipe),
-			MDP4_PIPE_SRC_XY_X(crtc_x) |
-			MDP4_PIPE_SRC_XY_Y(crtc_y));
+			MDP4_PIPE_DST_XY_X(crtc_x) |
+			MDP4_PIPE_DST_XY_Y(crtc_y));
 
 	mdp4_plane_set_scanout(plane, fb);
 
@@ -222,6 +222,7 @@ struct drm_plane *mdp4_plane_init(struct drm_device *dev,
 	struct drm_plane *plane = NULL;
 	struct mdp4_plane *mdp4_plane;
 	int ret;
+	enum drm_plane_type type;
 
 	mdp4_plane = kzalloc(sizeof(*mdp4_plane), GFP_KERNEL);
 	if (!mdp4_plane) {
@@ -237,9 +238,10 @@ struct drm_plane *mdp4_plane_init(struct drm_device *dev,
 	mdp4_plane->nformats = mdp4_get_formats(pipe_id, mdp4_plane->formats,
 			ARRAY_SIZE(mdp4_plane->formats));
 
-	drm_plane_init(dev, plane, 0xff, &mdp4_plane_funcs,
-			mdp4_plane->formats, mdp4_plane->nformats,
-			private_plane);
+	type = private_plane ? DRM_PLANE_TYPE_PRIMARY : DRM_PLANE_TYPE_OVERLAY;
+	drm_universal_plane_init(dev, plane, 0xff, &mdp4_plane_funcs,
+				 mdp4_plane->formats, mdp4_plane->nformats,
+				 type);
 
 	mdp4_plane_install_properties(plane, &plane->base);
 
diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
index 71a3b2345eb..ebe2e60f3ab 100644
--- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
+++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
@@ -102,7 +102,7 @@ static void update_fb(struct drm_crtc *crtc, struct drm_framebuffer *new_fb)
 
 	/* grab reference to incoming scanout fb: */
 	drm_framebuffer_reference(new_fb);
-	mdp5_crtc->base.fb = new_fb;
+	mdp5_crtc->base.primary->fb = new_fb;
 	mdp5_crtc->fb = new_fb;
 
 	if (old_fb)
@@ -195,8 +195,6 @@ static void mdp5_crtc_destroy(struct drm_crtc *crtc)
 {
 	struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
 
-	mdp5_crtc->plane->funcs->destroy(mdp5_crtc->plane);
-
 	drm_crtc_cleanup(crtc);
 	drm_flip_work_cleanup(&mdp5_crtc->unref_fb_work);
 
@@ -289,13 +287,14 @@ static int mdp5_crtc_mode_set(struct drm_crtc *crtc,
 			mode->type, mode->flags);
 
 	/* grab extra ref for update_scanout() */
-	drm_framebuffer_reference(crtc->fb);
+	drm_framebuffer_reference(crtc->primary->fb);
 
-	ret = mdp5_plane_mode_set(mdp5_crtc->plane, crtc, crtc->fb,
+	ret = mdp5_plane_mode_set(mdp5_crtc->plane, crtc, crtc->primary->fb,
 			0, 0, mode->hdisplay, mode->vdisplay,
 			x << 16, y << 16,
 			mode->hdisplay << 16, mode->vdisplay << 16);
 	if (ret) {
+		drm_framebuffer_unreference(crtc->primary->fb);
 		dev_err(crtc->dev->dev, "%s: failed to set mode on plane: %d\n",
 				mdp5_crtc->name, ret);
 		return ret;
@@ -305,8 +304,8 @@ static int mdp5_crtc_mode_set(struct drm_crtc *crtc,
 			MDP5_LM_OUT_SIZE_WIDTH(mode->hdisplay) |
 			MDP5_LM_OUT_SIZE_HEIGHT(mode->vdisplay));
 
-	update_fb(crtc, crtc->fb);
-	update_scanout(crtc, crtc->fb);
+	update_fb(crtc, crtc->primary->fb);
+	update_scanout(crtc, crtc->primary->fb);
 
 	return 0;
 }
@@ -337,17 +336,21 @@ static int mdp5_crtc_mode_set_base(struct drm_crtc *crtc, int x, int y,
 	int ret;
 
 	/* grab extra ref for update_scanout() */
-	drm_framebuffer_reference(crtc->fb);
+	drm_framebuffer_reference(crtc->primary->fb);
 
-	ret = mdp5_plane_mode_set(plane, crtc, crtc->fb,
+	ret = mdp5_plane_mode_set(plane, crtc, crtc->primary->fb,
 			0, 0, mode->hdisplay, mode->vdisplay,
 			x << 16, y << 16,
 			mode->hdisplay << 16, mode->vdisplay << 16);
+	if (ret) {
+		drm_framebuffer_unreference(crtc->primary->fb);
+		return ret;
+	}
 
-	update_fb(crtc, crtc->fb);
-	update_scanout(crtc, crtc->fb);
+	update_fb(crtc, crtc->primary->fb);
+	update_scanout(crtc, crtc->primary->fb);
 
-	return ret;
+	return 0;
 }
 
 static void mdp5_crtc_load_lut(struct drm_crtc *crtc)
@@ -519,6 +522,9 @@ void mdp5_crtc_attach(struct drm_crtc *crtc, struct drm_plane *plane)
 
 void mdp5_crtc_detach(struct drm_crtc *crtc, struct drm_plane *plane)
 {
+	/* don't actually detatch our primary plane: */
+	if (to_mdp5_crtc(crtc)->plane == plane)
+		return;
 	set_attach(crtc, mdp5_plane_pipe(plane), NULL);
 }
 
@@ -554,7 +560,7 @@ struct drm_crtc *mdp5_crtc_init(struct drm_device *dev,
 
 	INIT_FENCE_CB(&mdp5_crtc->pageflip_cb, pageflip_cb);
 
-	drm_crtc_init(dev, crtc, &mdp5_crtc_funcs);
+	drm_crtc_init_with_planes(dev, crtc, plane, NULL, &mdp5_crtc_funcs);
 	drm_crtc_helper_add(crtc, &mdp5_crtc_helper_funcs);
 
 	mdp5_plane_install_properties(mdp5_crtc->plane, &crtc->base);
diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_irq.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_irq.c
index 353d494a497..f2b985bc2ad 100644
--- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_irq.c
+++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_irq.c
@@ -71,11 +71,11 @@ static void mdp5_irq_mdp(struct mdp_kms *mdp_kms)
 
 	VERB("status=%08x", status);
 
+	mdp_dispatch_irqs(mdp_kms, status);
+
 	for (id = 0; id < priv->num_crtcs; id++)
 		if (status & mdp5_crtc_vblank(priv->crtcs[id]))
 			drm_handle_vblank(dev, id);
-
-	mdp_dispatch_irqs(mdp_kms, status);
 }
 
 irqreturn_t mdp5_irq(struct msm_kms *kms)
diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
index ee8446c1b5f..71510ee26e9 100644
--- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
+++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
@@ -20,6 +20,10 @@
 #include "msm_mmu.h"
 #include "mdp5_kms.h"
 
+static const char *iommu_ports[] = {
+		"mdp_0",
+};
+
 static struct mdp5_platform_config *mdp5_get_config(struct platform_device *dev);
 
 static int mdp5_hw_init(struct msm_kms *kms)
@@ -104,6 +108,12 @@ static void mdp5_preclose(struct msm_kms *kms, struct drm_file *file)
 static void mdp5_destroy(struct msm_kms *kms)
 {
 	struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(kms));
+	struct msm_mmu *mmu = mdp5_kms->mmu;
+
+	if (mmu) {
+		mmu->funcs->detach(mmu, iommu_ports, ARRAY_SIZE(iommu_ports));
+		mmu->funcs->destroy(mmu);
+	}
 	kfree(mdp5_kms);
 }
 
@@ -216,10 +226,6 @@ fail:
 	return ret;
 }
 
-static const char *iommu_ports[] = {
-		"mdp_0",
-};
-
 static int get_clk(struct platform_device *pdev, struct clk **clkp,
 		const char *name)
 {
@@ -280,12 +286,22 @@ struct msm_kms *mdp5_kms_init(struct drm_device *dev)
 		goto fail;
 	}
 
-	ret = get_clk(pdev, &mdp5_kms->axi_clk, "bus_clk") ||
-			get_clk(pdev, &mdp5_kms->ahb_clk, "iface_clk") ||
-			get_clk(pdev, &mdp5_kms->src_clk, "core_clk_src") ||
-			get_clk(pdev, &mdp5_kms->core_clk, "core_clk") ||
-			get_clk(pdev, &mdp5_kms->lut_clk, "lut_clk") ||
-			get_clk(pdev, &mdp5_kms->vsync_clk, "vsync_clk");
+	ret = get_clk(pdev, &mdp5_kms->axi_clk, "bus_clk");
+	if (ret)
+		goto fail;
+	ret = get_clk(pdev, &mdp5_kms->ahb_clk, "iface_clk");
+	if (ret)
+		goto fail;
+	ret = get_clk(pdev, &mdp5_kms->src_clk, "core_clk_src");
+	if (ret)
+		goto fail;
+	ret = get_clk(pdev, &mdp5_kms->core_clk, "core_clk");
+	if (ret)
+		goto fail;
+	ret = get_clk(pdev, &mdp5_kms->lut_clk, "lut_clk");
+	if (ret)
+		goto fail;
+	ret = get_clk(pdev, &mdp5_kms->vsync_clk, "vsync_clk");
 	if (ret)
 		goto fail;
 
@@ -307,17 +323,23 @@ struct msm_kms *mdp5_kms_init(struct drm_device *dev)
 		mmu = msm_iommu_new(dev, config->iommu);
 		if (IS_ERR(mmu)) {
 			ret = PTR_ERR(mmu);
+			dev_err(dev->dev, "failed to init iommu: %d\n", ret);
 			goto fail;
 		}
+
 		ret = mmu->funcs->attach(mmu, iommu_ports,
 				ARRAY_SIZE(iommu_ports));
-		if (ret)
+		if (ret) {
+			dev_err(dev->dev, "failed to attach iommu: %d\n", ret);
+			mmu->funcs->destroy(mmu);
 			goto fail;
+		}
 	} else {
 		dev_info(dev->dev, "no iommu, fallback to phys "
 				"contig buffers for scanout\n");
 		mmu = NULL;
 	}
+	mdp5_kms->mmu = mmu;
 
 	mdp5_kms->id = msm_register_mmu(dev, mmu);
 	if (mdp5_kms->id < 0) {
diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
index c8b1a2522c2..6e981b692d1 100644
--- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
+++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
@@ -33,6 +33,7 @@ struct mdp5_kms {
 
 	/* mapper-id used to request GEM buffer mapped for scanout: */
 	int id;
+	struct msm_mmu *mmu;
 
 	/* for tracking smp allocation amongst pipes: */
 	mdp5_smp_state_t smp_state;
diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
index 0ac8bb5e7e8..f3daec4412a 100644
--- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
+++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
@@ -85,8 +85,11 @@ static int mdp5_plane_disable(struct drm_plane *plane)
 static void mdp5_plane_destroy(struct drm_plane *plane)
 {
 	struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
+	struct msm_drm_private *priv = plane->dev->dev_private;
+
+	if (priv->kms)
+		mdp5_plane_disable(plane);
 
-	mdp5_plane_disable(plane);
 	drm_plane_cleanup(plane);
 
 	kfree(mdp5_plane);
@@ -358,6 +361,7 @@ struct drm_plane *mdp5_plane_init(struct drm_device *dev,
 	struct drm_plane *plane = NULL;
 	struct mdp5_plane *mdp5_plane;
 	int ret;
+	enum drm_plane_type type;
 
 	mdp5_plane = kzalloc(sizeof(*mdp5_plane), GFP_KERNEL);
 	if (!mdp5_plane) {
@@ -373,9 +377,10 @@ struct drm_plane *mdp5_plane_init(struct drm_device *dev,
 	mdp5_plane->nformats = mdp5_get_formats(pipe, mdp5_plane->formats,
 			ARRAY_SIZE(mdp5_plane->formats));
 
-	drm_plane_init(dev, plane, 0xff, &mdp5_plane_funcs,
-			mdp5_plane->formats, mdp5_plane->nformats,
-			private_plane);
+	type = private_plane ? DRM_PLANE_TYPE_PRIMARY : DRM_PLANE_TYPE_OVERLAY;
+	drm_universal_plane_init(dev, plane, 0xff, &mdp5_plane_funcs,
+				 mdp5_plane->formats, mdp5_plane->nformats,
+				 type);
 
 	mdp5_plane_install_properties(plane, &plane->base);
 
diff --git a/drivers/gpu/drm/msm/mdp/mdp_kms.c b/drivers/gpu/drm/msm/mdp/mdp_kms.c
index 3be48f7c36b..03455b64a24 100644
--- a/drivers/gpu/drm/msm/mdp/mdp_kms.c
+++ b/drivers/gpu/drm/msm/mdp/mdp_kms.c
@@ -101,7 +101,8 @@ void mdp_irq_wait(struct mdp_kms *mdp_kms, uint32_t irqmask)
 		.count = 1,
 	};
 	mdp_irq_register(mdp_kms, &wait.irq);
-	wait_event(wait_event, (wait.count <= 0));
+	wait_event_timeout(wait_event, (wait.count <= 0),
+			msecs_to_jiffies(100));
 	mdp_irq_unregister(mdp_kms, &wait.irq);
 }
 
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index e6adafc7eff..9a5d87db5c2 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -56,6 +56,10 @@ static char *vram;
 MODULE_PARM_DESC(vram, "Configure VRAM size (for devices without IOMMU/GPUMMU");
 module_param(vram, charp, 0);
 
+/*
+ * Util/helpers:
+ */
+
 void __iomem *msm_ioremap(struct platform_device *pdev, const char *name,
 		const char *dbgname)
 {
@@ -143,6 +147,8 @@ static int msm_unload(struct drm_device *dev)
 				priv->vram.paddr, &attrs);
 	}
 
+	component_unbind_all(dev->dev, dev);
+
 	dev->dev_private = NULL;
 
 	kfree(priv);
@@ -153,7 +159,7 @@ static int msm_unload(struct drm_device *dev)
 static int get_mdp_ver(struct platform_device *pdev)
 {
 #ifdef CONFIG_OF
-	const static struct of_device_id match_types[] = { {
+	static const struct of_device_id match_types[] = { {
 		.compatible = "qcom,mdss_mdp",
 		.data	= (void	*)5,
 	}, {
@@ -175,6 +181,7 @@ static int msm_load(struct drm_device *dev, unsigned long flags)
 	struct msm_kms *kms;
 	int ret;
 
+
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (!priv) {
 		dev_err(dev->dev, "failed to allocate private data\n");
@@ -213,7 +220,7 @@ static int msm_load(struct drm_device *dev, unsigned long flags)
 		 * is bogus, but non-null if allocation succeeded:
 		 */
 		p = dma_alloc_attrs(dev->dev, size,
-				&priv->vram.paddr, 0, &attrs);
+				&priv->vram.paddr, GFP_KERNEL, &attrs);
 		if (!p) {
 			dev_err(dev->dev, "failed to allocate VRAM\n");
 			priv->vram.paddr = 0;
@@ -226,6 +233,13 @@ static int msm_load(struct drm_device *dev, unsigned long flags)
 				(uint32_t)(priv->vram.paddr + size));
 	}
 
+	platform_set_drvdata(pdev, dev);
+
+	/* Bind all our sub-components: */
+	ret = component_bind_all(dev->dev, dev);
+	if (ret)
+		return ret;
+
 	switch (get_mdp_ver(pdev)) {
 	case 4:
 		kms = mdp4_kms_init(dev);
@@ -274,19 +288,21 @@ static int msm_load(struct drm_device *dev, unsigned long flags)
 	}
 
 	pm_runtime_get_sync(dev->dev);
-	ret = drm_irq_install(dev);
+	ret = drm_irq_install(dev, platform_get_irq(dev->platformdev, 0));
 	pm_runtime_put_sync(dev->dev);
 	if (ret < 0) {
 		dev_err(dev->dev, "failed to install IRQ handler\n");
 		goto fail;
 	}
 
-	platform_set_drvdata(pdev, dev);
-
 #ifdef CONFIG_DRM_MSM_FBDEV
 	priv->fbdev = msm_fbdev_init(dev);
 #endif
 
+	ret = msm_debugfs_late_init(dev);
+	if (ret)
+		goto fail;
+
 	drm_kms_helper_poll_init(dev);
 
 	return 0;
@@ -311,7 +327,6 @@ static void load_gpu(struct drm_device *dev)
 		gpu = NULL;
 		/* not fatal */
 	}
-	mutex_unlock(&dev->struct_mutex);
 
 	if (gpu) {
 		int ret;
@@ -321,10 +336,16 @@ static void load_gpu(struct drm_device *dev)
 			dev_err(dev->dev, "gpu hw init failed: %d\n", ret);
 			gpu->funcs->destroy(gpu);
 			gpu = NULL;
+		} else {
+			/* give inactive pm a chance to kick in: */
+			msm_gpu_retire(gpu);
 		}
+
 	}
 
 	priv->gpu = gpu;
+
+	mutex_unlock(&dev->struct_mutex);
 }
 
 static int msm_open(struct drm_device *dev, struct drm_file *file)
@@ -365,11 +386,8 @@ static void msm_preclose(struct drm_device *dev, struct drm_file *file)
 static void msm_lastclose(struct drm_device *dev)
 {
 	struct msm_drm_private *priv = dev->dev_private;
-	if (priv->fbdev) {
-		drm_modeset_lock_all(dev);
-		drm_fb_helper_restore_fbdev_mode(priv->fbdev);
-		drm_modeset_unlock_all(dev);
-	}
+	if (priv->fbdev)
+		drm_fb_helper_restore_fbdev_mode_unlocked(priv->fbdev);
 }
 
 static irqreturn_t msm_irq(int irq, void *arg)
@@ -514,6 +532,41 @@ static struct drm_info_list msm_debugfs_list[] = {
 		{ "fb", show_locked, 0, msm_fb_show },
 };
 
+static int late_init_minor(struct drm_minor *minor)
+{
+	int ret;
+
+	if (!minor)
+		return 0;
+
+	ret = msm_rd_debugfs_init(minor);
+	if (ret) {
+		dev_err(minor->dev->dev, "could not install rd debugfs\n");
+		return ret;
+	}
+
+	ret = msm_perf_debugfs_init(minor);
+	if (ret) {
+		dev_err(minor->dev->dev, "could not install perf debugfs\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+int msm_debugfs_late_init(struct drm_device *dev)
+{
+	int ret;
+	ret = late_init_minor(dev->primary);
+	if (ret)
+		return ret;
+	ret = late_init_minor(dev->render);
+	if (ret)
+		return ret;
+	ret = late_init_minor(dev->control);
+	return ret;
+}
+
 static int msm_debugfs_init(struct drm_minor *minor)
 {
 	struct drm_device *dev = minor->dev;
@@ -528,13 +581,17 @@ static int msm_debugfs_init(struct drm_minor *minor)
 		return ret;
 	}
 
-	return ret;
+	return 0;
 }
 
 static void msm_debugfs_cleanup(struct drm_minor *minor)
 {
 	drm_debugfs_remove_files(msm_debugfs_list,
 			ARRAY_SIZE(msm_debugfs_list), minor);
+	if (!minor->dev->dev_private)
+		return;
+	msm_rd_debugfs_cleanup(minor);
+	msm_perf_debugfs_cleanup(minor);
 }
 #endif
 
@@ -647,6 +704,12 @@ static int msm_ioctl_gem_new(struct drm_device *dev, void *data,
 		struct drm_file *file)
 {
 	struct drm_msm_gem_new *args = data;
+
+	if (args->flags & ~MSM_BO_FLAGS) {
+		DRM_ERROR("invalid flags: %08x\n", args->flags);
+		return -EINVAL;
+	}
+
 	return msm_gem_new_handle(dev, file, args->size,
 			args->flags, &args->handle);
 }
@@ -660,6 +723,11 @@ static int msm_ioctl_gem_cpu_prep(struct drm_device *dev, void *data,
 	struct drm_gem_object *obj;
 	int ret;
 
+	if (args->op & ~MSM_PREP_FLAGS) {
+		DRM_ERROR("invalid op: %08x\n", args->op);
+		return -EINVAL;
+	}
+
 	obj = drm_gem_object_lookup(dev, file, args->handle);
 	if (!obj)
 		return -ENOENT;
@@ -714,7 +782,14 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data,
 		struct drm_file *file)
 {
 	struct drm_msm_wait_fence *args = data;
-	return msm_wait_fence_interruptable(dev, args->fence, &TS(args->timeout));
+
+	if (args->pad) {
+		DRM_ERROR("invalid pad: %08x\n", args->pad);
+		return -EINVAL;
+	}
+
+	return msm_wait_fence_interruptable(dev, args->fence,
+			&TS(args->timeout));
 }
 
 static const struct drm_ioctl_desc msm_ioctls[] = {
@@ -819,18 +894,110 @@ static const struct dev_pm_ops msm_pm_ops = {
 };
 
 /*
+ * Componentized driver support:
+ */
+
+#ifdef CONFIG_OF
+/* NOTE: the CONFIG_OF case duplicates the same code as exynos or imx
+ * (or probably any other).. so probably some room for some helpers
+ */
+static int compare_of(struct device *dev, void *data)
+{
+	return dev->of_node == data;
+}
+
+static int msm_drm_add_components(struct device *master, struct master *m)
+{
+	struct device_node *np = master->of_node;
+	unsigned i;
+	int ret;
+
+	for (i = 0; ; i++) {
+		struct device_node *node;
+
+		node = of_parse_phandle(np, "connectors", i);
+		if (!node)
+			break;
+
+		ret = component_master_add_child(m, compare_of, node);
+		of_node_put(node);
+
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+#else
+static int compare_dev(struct device *dev, void *data)
+{
+	return dev == data;
+}
+
+static int msm_drm_add_components(struct device *master, struct master *m)
+{
+	/* For non-DT case, it kinda sucks.  We don't actually have a way
+	 * to know whether or not we are waiting for certain devices (or if
+	 * they are simply not present).  But for non-DT we only need to
+	 * care about apq8064/apq8060/etc (all mdp4/a3xx):
+	 */
+	static const char *devnames[] = {
+			"hdmi_msm.0", "kgsl-3d0.0",
+	};
+	int i;
+
+	DBG("Adding components..");
+
+	for (i = 0; i < ARRAY_SIZE(devnames); i++) {
+		struct device *dev;
+		int ret;
+
+		dev = bus_find_device_by_name(&platform_bus_type,
+				NULL, devnames[i]);
+		if (!dev) {
+			dev_info(master, "still waiting for %s\n", devnames[i]);
+			return -EPROBE_DEFER;
+		}
+
+		ret = component_master_add_child(m, compare_dev, dev);
+		if (ret) {
+			DBG("could not add child: %d", ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+#endif
+
+static int msm_drm_bind(struct device *dev)
+{
+	return drm_platform_init(&msm_driver, to_platform_device(dev));
+}
+
+static void msm_drm_unbind(struct device *dev)
+{
+	drm_put_dev(platform_get_drvdata(to_platform_device(dev)));
+}
+
+static const struct component_master_ops msm_drm_ops = {
+		.add_components = msm_drm_add_components,
+		.bind = msm_drm_bind,
+		.unbind = msm_drm_unbind,
+};
+
+/*
  * Platform driver:
  */
 
 static int msm_pdev_probe(struct platform_device *pdev)
 {
 	pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
-	return drm_platform_init(&msm_driver, pdev);
+	return component_master_add(&pdev->dev, &msm_drm_ops);
 }
 
 static int msm_pdev_remove(struct platform_device *pdev)
 {
-	drm_put_dev(platform_get_drvdata(pdev));
+	component_master_del(&pdev->dev, &msm_drm_ops);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 3d63269c5b2..8a2c5fd0893 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -22,6 +22,7 @@
 #include <linux/clk.h>
 #include <linux/cpufreq.h>
 #include <linux/module.h>
+#include <linux/component.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
 #include <linux/pm_runtime.h>
@@ -32,7 +33,7 @@
 #include <asm/sizes.h>
 
 
-#if defined(CONFIG_COMPILE_TEST) && !defined(CONFIG_ARCH_MSM)
+#if defined(CONFIG_COMPILE_TEST) && !defined(CONFIG_ARCH_QCOM)
 /* stubs we need for compile-test: */
 static inline struct device *msm_iommu_get_ctx(const char *ctx_name)
 {
@@ -54,6 +55,9 @@ static inline struct device *msm_iommu_get_ctx(const char *ctx_name)
 struct msm_kms;
 struct msm_gpu;
 struct msm_mmu;
+struct msm_rd_state;
+struct msm_perf_state;
+struct msm_gem_submit;
 
 #define NUM_DOMAINS 2    /* one for KMS, then one per gpu core (?) */
 
@@ -69,6 +73,9 @@ struct msm_drm_private {
 
 	struct msm_kms *kms;
 
+	/* subordinate devices, if present: */
+	struct platform_device *hdmi_pdev, *gpu_pdev;
+
 	/* when we have more than one 'msm_gpu' these need to be an array: */
 	struct msm_gpu *gpu;
 	struct msm_file_private *lastctx;
@@ -78,6 +85,9 @@ struct msm_drm_private {
 	uint32_t next_fence, completed_fence;
 	wait_queue_head_t fence_event;
 
+	struct msm_rd_state *rd;
+	struct msm_perf_state *perf;
+
 	/* list of GEM objects: */
 	struct list_head inactive_list;
 
@@ -200,6 +210,15 @@ void __exit hdmi_unregister(void);
 void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m);
 void msm_gem_describe_objects(struct list_head *list, struct seq_file *m);
 void msm_framebuffer_describe(struct drm_framebuffer *fb, struct seq_file *m);
+int msm_debugfs_late_init(struct drm_device *dev);
+int msm_rd_debugfs_init(struct drm_minor *minor);
+void msm_rd_debugfs_cleanup(struct drm_minor *minor);
+void msm_rd_dump_submit(struct msm_gem_submit *submit);
+int msm_perf_debugfs_init(struct drm_minor *minor);
+void msm_perf_debugfs_cleanup(struct drm_minor *minor);
+#else
+static inline int msm_debugfs_late_init(struct drm_device *dev) { return 0; }
+static inline void msm_rd_dump_submit(struct msm_gem_submit *submit) {}
 #endif
 
 void __iomem *msm_ioremap(struct platform_device *pdev, const char *name,
diff --git a/drivers/gpu/drm/msm/msm_fbdev.c b/drivers/gpu/drm/msm/msm_fbdev.c
index 6c6d7d4c9b4..5107fc4826b 100644
--- a/drivers/gpu/drm/msm/msm_fbdev.c
+++ b/drivers/gpu/drm/msm/msm_fbdev.c
@@ -59,14 +59,11 @@ static int msm_fbdev_create(struct drm_fb_helper *helper,
 	struct drm_framebuffer *fb = NULL;
 	struct fb_info *fbi = NULL;
 	struct drm_mode_fb_cmd2 mode_cmd = {0};
-	dma_addr_t paddr;
+	uint32_t paddr;
 	int ret, size;
 
-	/* only doing ARGB32 since this is what is needed to alpha-blend
-	 * with video overlays:
-	 */
 	sizes->surface_bpp = 32;
-	sizes->surface_depth = 32;
+	sizes->surface_depth = 24;
 
 	DBG("create fbdev: %dx%d@%d (%dx%d)", sizes->surface_width,
 			sizes->surface_height, sizes->surface_bpp,
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index d8d60c969ac..690d7e7b6d1 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -118,8 +118,10 @@ static void put_pages(struct drm_gem_object *obj)
 
 		if (iommu_present(&platform_bus_type))
 			drm_gem_put_pages(obj, msm_obj->pages, true, false);
-		else
+		else {
 			drm_mm_remove_node(msm_obj->vram_node);
+			drm_free_large(msm_obj->pages);
+		}
 
 		msm_obj->pages = NULL;
 	}
@@ -276,6 +278,7 @@ int msm_gem_get_iova_locked(struct drm_gem_object *obj, int id,
 		uint32_t *iova)
 {
 	struct msm_gem_object *msm_obj = to_msm_bo(obj);
+	struct drm_device *dev = obj->dev;
 	int ret = 0;
 
 	if (!msm_obj->domain[id].iova) {
@@ -283,6 +286,11 @@ int msm_gem_get_iova_locked(struct drm_gem_object *obj, int id,
 		struct msm_mmu *mmu = priv->mmus[id];
 		struct page **pages = get_pages(obj);
 
+		if (!mmu) {
+			dev_err(dev->dev, "null MMU pointer\n");
+			return -EINVAL;
+		}
+
 		if (IS_ERR(pages))
 			return PTR_ERR(pages);
 
@@ -644,7 +652,7 @@ struct drm_gem_object *msm_gem_new(struct drm_device *dev,
 
 fail:
 	if (obj)
-		drm_gem_object_unreference_unlocked(obj);
+		drm_gem_object_unreference(obj);
 
 	return ERR_PTR(ret);
 }
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h
index 3246bb46c4f..bfb052688f8 100644
--- a/drivers/gpu/drm/msm/msm_gem.h
+++ b/drivers/gpu/drm/msm/msm_gem.h
@@ -90,6 +90,7 @@ struct msm_gem_submit {
 		uint32_t type;
 		uint32_t size;  /* in dwords */
 		uint32_t iova;
+		uint32_t idx;   /* cmdstream buffer idx in bos[] */
 	} cmd[MAX_CMDS];
 	struct {
 		uint32_t flags;
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index 5281d4bc37f..cd0554f6831 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -23,7 +23,6 @@
  * Cmdstream submission:
  */
 
-#define BO_INVALID_FLAGS ~(MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE)
 /* make sure these don't conflict w/ MSM_SUBMIT_BO_x */
 #define BO_VALID    0x8000
 #define BO_LOCKED   0x4000
@@ -77,7 +76,7 @@ static int submit_lookup_objects(struct msm_gem_submit *submit,
 			goto out_unlock;
 		}
 
-		if (submit_bo.flags & BO_INVALID_FLAGS) {
+		if (submit_bo.flags & ~MSM_SUBMIT_BO_FLAGS) {
 			DRM_ERROR("invalid flags: %x\n", submit_bo.flags);
 			ret = -EINVAL;
 			goto out_unlock;
@@ -163,7 +162,7 @@ retry:
 
 
 		/* if locking succeeded, pin bo: */
-		ret = msm_gem_get_iova(&msm_obj->base,
+		ret = msm_gem_get_iova_locked(&msm_obj->base,
 				submit->gpu->id, &iova);
 
 		/* this would break the logic in the fail path.. there is no
@@ -247,7 +246,7 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob
 	/* For now, just map the entire thing.  Eventually we probably
 	 * to do it page-by-page, w/ kmap() if not vmap()d..
 	 */
-	ptr = msm_gem_vaddr(&obj->base);
+	ptr = msm_gem_vaddr_locked(&obj->base);
 
 	if (IS_ERR(ptr)) {
 		ret = PTR_ERR(ptr);
@@ -307,14 +306,12 @@ static void submit_cleanup(struct msm_gem_submit *submit, bool fail)
 {
 	unsigned i;
 
-	mutex_lock(&submit->dev->struct_mutex);
 	for (i = 0; i < submit->nr_bos; i++) {
 		struct msm_gem_object *msm_obj = submit->bos[i].obj;
 		submit_unlock_unpin_bo(submit, i);
 		list_del_init(&msm_obj->submit_entry);
 		drm_gem_object_unreference(&msm_obj->base);
 	}
-	mutex_unlock(&submit->dev->struct_mutex);
 
 	ww_acquire_fini(&submit->ticket);
 	kfree(submit);
@@ -342,6 +339,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 	if (args->nr_cmds > MAX_CMDS)
 		return -EINVAL;
 
+	mutex_lock(&dev->struct_mutex);
+
 	submit = submit_create(dev, gpu, args->nr_bos);
 	if (!submit) {
 		ret = -ENOMEM;
@@ -369,6 +368,18 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 			goto out;
 		}
 
+		/* validate input from userspace: */
+		switch (submit_cmd.type) {
+		case MSM_SUBMIT_CMD_BUF:
+		case MSM_SUBMIT_CMD_IB_TARGET_BUF:
+		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
+			break;
+		default:
+			DRM_ERROR("invalid type: %08x\n", submit_cmd.type);
+			ret = -EINVAL;
+			goto out;
+		}
+
 		ret = submit_bo(submit, submit_cmd.submit_idx,
 				&msm_obj, &iova, NULL);
 		if (ret)
@@ -391,6 +402,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 		submit->cmd[i].type = submit_cmd.type;
 		submit->cmd[i].size = submit_cmd.size / 4;
 		submit->cmd[i].iova = iova + submit_cmd.submit_offset;
+		submit->cmd[i].idx  = submit_cmd.submit_idx;
 
 		if (submit->valid)
 			continue;
@@ -410,5 +422,6 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 out:
 	if (submit)
 		submit_cleanup(submit, !!ret);
+	mutex_unlock(&dev->struct_mutex);
 	return ret;
 }
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 4ebce8be489..c6322197db8 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -154,9 +154,18 @@ static int disable_axi(struct msm_gpu *gpu)
 
 int msm_gpu_pm_resume(struct msm_gpu *gpu)
 {
+	struct drm_device *dev = gpu->dev;
 	int ret;
 
-	DBG("%s", gpu->name);
+	DBG("%s: active_cnt=%d", gpu->name, gpu->active_cnt);
+
+	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
+
+	if (gpu->active_cnt++ > 0)
+		return 0;
+
+	if (WARN_ON(gpu->active_cnt <= 0))
+		return -EINVAL;
 
 	ret = enable_pwrrail(gpu);
 	if (ret)
@@ -175,9 +184,18 @@ int msm_gpu_pm_resume(struct msm_gpu *gpu)
 
 int msm_gpu_pm_suspend(struct msm_gpu *gpu)
 {
+	struct drm_device *dev = gpu->dev;
 	int ret;
 
-	DBG("%s", gpu->name);
+	DBG("%s: active_cnt=%d", gpu->name, gpu->active_cnt);
+
+	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
+
+	if (--gpu->active_cnt > 0)
+		return 0;
+
+	if (WARN_ON(gpu->active_cnt < 0))
+		return -EINVAL;
 
 	ret = disable_axi(gpu);
 	if (ret)
@@ -195,6 +213,55 @@ int msm_gpu_pm_suspend(struct msm_gpu *gpu)
 }
 
 /*
+ * Inactivity detection (for suspend):
+ */
+
+static void inactive_worker(struct work_struct *work)
+{
+	struct msm_gpu *gpu = container_of(work, struct msm_gpu, inactive_work);
+	struct drm_device *dev = gpu->dev;
+
+	if (gpu->inactive)
+		return;
+
+	DBG("%s: inactive!\n", gpu->name);
+	mutex_lock(&dev->struct_mutex);
+	if (!(msm_gpu_active(gpu) || gpu->inactive)) {
+		disable_axi(gpu);
+		disable_clk(gpu);
+		gpu->inactive = true;
+	}
+	mutex_unlock(&dev->struct_mutex);
+}
+
+static void inactive_handler(unsigned long data)
+{
+	struct msm_gpu *gpu = (struct msm_gpu *)data;
+	struct msm_drm_private *priv = gpu->dev->dev_private;
+
+	queue_work(priv->wq, &gpu->inactive_work);
+}
+
+/* cancel inactive timer and make sure we are awake: */
+static void inactive_cancel(struct msm_gpu *gpu)
+{
+	DBG("%s", gpu->name);
+	del_timer(&gpu->inactive_timer);
+	if (gpu->inactive) {
+		enable_clk(gpu);
+		enable_axi(gpu);
+		gpu->inactive = false;
+	}
+}
+
+static void inactive_start(struct msm_gpu *gpu)
+{
+	DBG("%s", gpu->name);
+	mod_timer(&gpu->inactive_timer,
+			round_jiffies_up(jiffies + DRM_MSM_INACTIVE_JIFFIES));
+}
+
+/*
  * Hangcheck detection for locked gpu:
  */
 
@@ -206,7 +273,10 @@ static void recover_worker(struct work_struct *work)
 	dev_err(dev->dev, "%s: hangcheck recover!\n", gpu->name);
 
 	mutex_lock(&dev->struct_mutex);
-	gpu->funcs->recover(gpu);
+	if (msm_gpu_active(gpu)) {
+		inactive_cancel(gpu);
+		gpu->funcs->recover(gpu);
+	}
 	mutex_unlock(&dev->struct_mutex);
 
 	msm_gpu_retire(gpu);
@@ -250,6 +320,101 @@ static void hangcheck_handler(unsigned long data)
 }
 
 /*
+ * Performance Counters:
+ */
+
+/* called under perf_lock */
+static int update_hw_cntrs(struct msm_gpu *gpu, uint32_t ncntrs, uint32_t *cntrs)
+{
+	uint32_t current_cntrs[ARRAY_SIZE(gpu->last_cntrs)];
+	int i, n = min(ncntrs, gpu->num_perfcntrs);
+
+	/* read current values: */
+	for (i = 0; i < gpu->num_perfcntrs; i++)
+		current_cntrs[i] = gpu_read(gpu, gpu->perfcntrs[i].sample_reg);
+
+	/* update cntrs: */
+	for (i = 0; i < n; i++)
+		cntrs[i] = current_cntrs[i] - gpu->last_cntrs[i];
+
+	/* save current values: */
+	for (i = 0; i < gpu->num_perfcntrs; i++)
+		gpu->last_cntrs[i] = current_cntrs[i];
+
+	return n;
+}
+
+static void update_sw_cntrs(struct msm_gpu *gpu)
+{
+	ktime_t time;
+	uint32_t elapsed;
+	unsigned long flags;
+
+	spin_lock_irqsave(&gpu->perf_lock, flags);
+	if (!gpu->perfcntr_active)
+		goto out;
+
+	time = ktime_get();
+	elapsed = ktime_to_us(ktime_sub(time, gpu->last_sample.time));
+
+	gpu->totaltime += elapsed;
+	if (gpu->last_sample.active)
+		gpu->activetime += elapsed;
+
+	gpu->last_sample.active = msm_gpu_active(gpu);
+	gpu->last_sample.time = time;
+
+out:
+	spin_unlock_irqrestore(&gpu->perf_lock, flags);
+}
+
+void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&gpu->perf_lock, flags);
+	/* we could dynamically enable/disable perfcntr registers too.. */
+	gpu->last_sample.active = msm_gpu_active(gpu);
+	gpu->last_sample.time = ktime_get();
+	gpu->activetime = gpu->totaltime = 0;
+	gpu->perfcntr_active = true;
+	update_hw_cntrs(gpu, 0, NULL);
+	spin_unlock_irqrestore(&gpu->perf_lock, flags);
+}
+
+void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
+{
+	gpu->perfcntr_active = false;
+}
+
+/* returns -errno or # of cntrs sampled */
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
+		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&gpu->perf_lock, flags);
+
+	if (!gpu->perfcntr_active) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	*activetime = gpu->activetime;
+	*totaltime = gpu->totaltime;
+
+	gpu->activetime = gpu->totaltime = 0;
+
+	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
+
+out:
+	spin_unlock_irqrestore(&gpu->perf_lock, flags);
+
+	return ret;
+}
+
+/*
  * Cmdstream submission/retirement:
  */
 
@@ -281,6 +446,9 @@ static void retire_worker(struct work_struct *work)
 	}
 
 	mutex_unlock(&dev->struct_mutex);
+
+	if (!msm_gpu_active(gpu))
+		inactive_start(gpu);
 }
 
 /* call from irq handler to schedule work to retire bo's */
@@ -288,6 +456,7 @@ void msm_gpu_retire(struct msm_gpu *gpu)
 {
 	struct msm_drm_private *priv = gpu->dev->dev_private;
 	queue_work(priv->wq, &gpu->retire_work);
+	update_sw_cntrs(gpu);
 }
 
 /* add bo's to gpu's ring, and kick gpu: */
@@ -298,12 +467,18 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 	struct msm_drm_private *priv = dev->dev_private;
 	int i, ret;
 
-	mutex_lock(&dev->struct_mutex);
-
 	submit->fence = ++priv->next_fence;
 
 	gpu->submitted_fence = submit->fence;
 
+	inactive_cancel(gpu);
+
+	msm_rd_dump_submit(submit);
+
+	gpu->submitted_fence = submit->fence;
+
+	update_sw_cntrs(gpu);
+
 	ret = gpu->funcs->submit(gpu, submit, ctx);
 	priv->lastctx = ctx;
 
@@ -331,7 +506,6 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 			msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->fence);
 	}
 	hangcheck_timer_reset(gpu);
-	mutex_unlock(&dev->struct_mutex);
 
 	return ret;
 }
@@ -357,17 +531,26 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 	struct iommu_domain *iommu;
 	int i, ret;
 
+	if (WARN_ON(gpu->num_perfcntrs > ARRAY_SIZE(gpu->last_cntrs)))
+		gpu->num_perfcntrs = ARRAY_SIZE(gpu->last_cntrs);
+
 	gpu->dev = drm;
 	gpu->funcs = funcs;
 	gpu->name = name;
+	gpu->inactive = true;
 
 	INIT_LIST_HEAD(&gpu->active_list);
 	INIT_WORK(&gpu->retire_work, retire_worker);
+	INIT_WORK(&gpu->inactive_work, inactive_worker);
 	INIT_WORK(&gpu->recover_work, recover_worker);
 
+	setup_timer(&gpu->inactive_timer, inactive_handler,
+			(unsigned long)gpu);
 	setup_timer(&gpu->hangcheck_timer, hangcheck_handler,
 			(unsigned long)gpu);
 
+	spin_lock_init(&gpu->perf_lock);
+
 	BUG_ON(ARRAY_SIZE(clk_names) != ARRAY_SIZE(gpu->grp_clks));
 
 	/* Map registers: */
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 458db8c64c2..9b579b79284 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -25,6 +25,7 @@
 #include "msm_ringbuffer.h"
 
 struct msm_gem_submit;
+struct msm_gpu_perfcntr;
 
 /* So far, with hardware that I've seen to date, we can have:
  *  + zero, one, or two z180 2d cores
@@ -64,6 +65,18 @@ struct msm_gpu {
 	struct drm_device *dev;
 	const struct msm_gpu_funcs *funcs;
 
+	/* performance counters (hw & sw): */
+	spinlock_t perf_lock;
+	bool perfcntr_active;
+	struct {
+		bool active;
+		ktime_t time;
+	} last_sample;
+	uint32_t totaltime, activetime;    /* sw counters */
+	uint32_t last_cntrs[5];            /* hw counters */
+	const struct msm_gpu_perfcntr *perfcntrs;
+	uint32_t num_perfcntrs;
+
 	struct msm_ringbuffer *rb;
 	uint32_t rb_iova;
 
@@ -72,6 +85,10 @@ struct msm_gpu {
 
 	uint32_t submitted_fence;
 
+	/* is gpu powered/active? */
+	int active_cnt;
+	bool inactive;
+
 	/* worker for handling active-list retiring: */
 	struct work_struct retire_work;
 
@@ -91,7 +108,12 @@ struct msm_gpu {
 	uint32_t bsc;
 #endif
 
-	/* Hang Detction: */
+	/* Hang and Inactivity Detection:
+	 */
+#define DRM_MSM_INACTIVE_PERIOD   66 /* in ms (roughly four frames) */
+#define DRM_MSM_INACTIVE_JIFFIES  msecs_to_jiffies(DRM_MSM_INACTIVE_PERIOD)
+	struct timer_list inactive_timer;
+	struct work_struct inactive_work;
 #define DRM_MSM_HANGCHECK_PERIOD 500 /* in ms */
 #define DRM_MSM_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_MSM_HANGCHECK_PERIOD)
 	struct timer_list hangcheck_timer;
@@ -99,6 +121,24 @@ struct msm_gpu {
 	struct work_struct recover_work;
 };
 
+static inline bool msm_gpu_active(struct msm_gpu *gpu)
+{
+	return gpu->submitted_fence > gpu->funcs->last_fence(gpu);
+}
+
+/* Perf-Counters:
+ * The select_reg and select_val are just there for the benefit of the child
+ * class that actually enables the perf counter..  but msm_gpu base class
+ * will handle sampling/displaying the counters.
+ */
+
+struct msm_gpu_perfcntr {
+	uint32_t select_reg;
+	uint32_t sample_reg;
+	uint32_t select_val;
+	const char *name;
+};
+
 static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
 {
 	msm_writel(data, gpu->mmio + (reg << 2));
@@ -112,6 +152,11 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
 int msm_gpu_pm_suspend(struct msm_gpu *gpu);
 int msm_gpu_pm_resume(struct msm_gpu *gpu);
 
+void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
+void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
+		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
+
 void msm_gpu_retire(struct msm_gpu *gpu);
 int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 		struct msm_file_private *ctx);
diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
index 92b74598623..4b2ad9181ed 100644
--- a/drivers/gpu/drm/msm/msm_iommu.c
+++ b/drivers/gpu/drm/msm/msm_iommu.c
@@ -28,7 +28,7 @@ static int msm_fault_handler(struct iommu_domain *iommu, struct device *dev,
 		unsigned long iova, int flags, void *arg)
 {
 	DBG("*** fault: iova=%08lx, flags=%d", iova, flags);
-	return 0;
+	return -ENOSYS;
 }
 
 static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt)
@@ -40,8 +40,10 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt)
 	for (i = 0; i < cnt; i++) {
 		struct device *msm_iommu_get_ctx(const char *ctx_name);
 		struct device *ctx = msm_iommu_get_ctx(names[i]);
-		if (IS_ERR_OR_NULL(ctx))
+		if (IS_ERR_OR_NULL(ctx)) {
+			dev_warn(dev->dev, "couldn't get %s context", names[i]);
 			continue;
+		}
 		ret = iommu_attach_device(iommu->domain, ctx);
 		if (ret) {
 			dev_warn(dev->dev, "could not attach iommu to %s", names[i]);
@@ -52,6 +54,20 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt)
 	return 0;
 }
 
+static void msm_iommu_detach(struct msm_mmu *mmu, const char **names, int cnt)
+{
+	struct msm_iommu *iommu = to_msm_iommu(mmu);
+	int i;
+
+	for (i = 0; i < cnt; i++) {
+		struct device *msm_iommu_get_ctx(const char *ctx_name);
+		struct device *ctx = msm_iommu_get_ctx(names[i]);
+		if (IS_ERR_OR_NULL(ctx))
+			continue;
+		iommu_detach_device(iommu->domain, ctx);
+	}
+}
+
 static int msm_iommu_map(struct msm_mmu *mmu, uint32_t iova,
 		struct sg_table *sgt, unsigned len, int prot)
 {
@@ -110,7 +126,7 @@ static int msm_iommu_unmap(struct msm_mmu *mmu, uint32_t iova,
 
 		VERB("unmap[%d]: %08x(%x)", i, iova, bytes);
 
-		BUG_ON(!IS_ALIGNED(bytes, PAGE_SIZE));
+		BUG_ON(!PAGE_ALIGNED(bytes));
 
 		da += bytes;
 	}
@@ -127,6 +143,7 @@ static void msm_iommu_destroy(struct msm_mmu *mmu)
 
 static const struct msm_mmu_funcs funcs = {
 		.attach = msm_iommu_attach,
+		.detach = msm_iommu_detach,
 		.map = msm_iommu_map,
 		.unmap = msm_iommu_unmap,
 		.destroy = msm_iommu_destroy,
diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h
index 030324482b4..21da6d154f7 100644
--- a/drivers/gpu/drm/msm/msm_mmu.h
+++ b/drivers/gpu/drm/msm/msm_mmu.h
@@ -22,6 +22,7 @@
 
 struct msm_mmu_funcs {
 	int (*attach)(struct msm_mmu *mmu, const char **names, int cnt);
+	void (*detach)(struct msm_mmu *mmu, const char **names, int cnt);
 	int (*map)(struct msm_mmu *mmu, uint32_t iova, struct sg_table *sgt,
 			unsigned len, int prot);
 	int (*unmap)(struct msm_mmu *mmu, uint32_t iova, struct sg_table *sgt,
diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
new file mode 100644
index 00000000000..830857c47c8
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_perf.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* For profiling, userspace can:
+ *
+ *   tail -f /sys/kernel/debug/dri/<minor>/gpu
+ *
+ * This will enable performance counters/profiling to track the busy time
+ * and any gpu specific performance counters that are supported.
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/debugfs.h>
+
+#include "msm_drv.h"
+#include "msm_gpu.h"
+
+struct msm_perf_state {
+	struct drm_device *dev;
+
+	bool open;
+	int cnt;
+	struct mutex read_lock;
+
+	char buf[256];
+	int buftot, bufpos;
+
+	unsigned long next_jiffies;
+
+	struct dentry *ent;
+	struct drm_info_node *node;
+};
+
+#define SAMPLE_TIME (HZ/4)
+
+/* wait for next sample time: */
+static int wait_sample(struct msm_perf_state *perf)
+{
+	unsigned long start_jiffies = jiffies;
+
+	if (time_after(perf->next_jiffies, start_jiffies)) {
+		unsigned long remaining_jiffies =
+			perf->next_jiffies - start_jiffies;
+		int ret = schedule_timeout_interruptible(remaining_jiffies);
+		if (ret > 0) {
+			/* interrupted */
+			return -ERESTARTSYS;
+		}
+	}
+	perf->next_jiffies += SAMPLE_TIME;
+	return 0;
+}
+
+static int refill_buf(struct msm_perf_state *perf)
+{
+	struct msm_drm_private *priv = perf->dev->dev_private;
+	struct msm_gpu *gpu = priv->gpu;
+	char *ptr = perf->buf;
+	int rem = sizeof(perf->buf);
+	int i, n;
+
+	if ((perf->cnt++ % 32) == 0) {
+		/* Header line: */
+		n = snprintf(ptr, rem, "%%BUSY");
+		ptr += n;
+		rem -= n;
+
+		for (i = 0; i < gpu->num_perfcntrs; i++) {
+			const struct msm_gpu_perfcntr *perfcntr = &gpu->perfcntrs[i];
+			n = snprintf(ptr, rem, "\t%s", perfcntr->name);
+			ptr += n;
+			rem -= n;
+		}
+	} else {
+		/* Sample line: */
+		uint32_t activetime = 0, totaltime = 0;
+		uint32_t cntrs[5];
+		uint32_t val;
+		int ret;
+
+		/* sleep until next sample time: */
+		ret = wait_sample(perf);
+		if (ret)
+			return ret;
+
+		ret = msm_gpu_perfcntr_sample(gpu, &activetime, &totaltime,
+				ARRAY_SIZE(cntrs), cntrs);
+		if (ret < 0)
+			return ret;
+
+		val = totaltime ? 1000 * activetime / totaltime : 0;
+		n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
+		ptr += n;
+		rem -= n;
+
+		for (i = 0; i < ret; i++) {
+			/* cycle counters (I think).. convert to MHz.. */
+			val = cntrs[i] / 10000;
+			n = snprintf(ptr, rem, "\t%5d.%02d",
+					val / 100, val % 100);
+			ptr += n;
+			rem -= n;
+		}
+	}
+
+	n = snprintf(ptr, rem, "\n");
+	ptr += n;
+	rem -= n;
+
+	perf->bufpos = 0;
+	perf->buftot = ptr - perf->buf;
+
+	return 0;
+}
+
+static ssize_t perf_read(struct file *file, char __user *buf,
+		size_t sz, loff_t *ppos)
+{
+	struct msm_perf_state *perf = file->private_data;
+	int n = 0, ret;
+
+	mutex_lock(&perf->read_lock);
+
+	if (perf->bufpos >= perf->buftot) {
+		ret = refill_buf(perf);
+		if (ret)
+			goto out;
+	}
+
+	n = min((int)sz, perf->buftot - perf->bufpos);
+	ret = copy_to_user(buf, &perf->buf[perf->bufpos], n);
+	if (ret)
+		goto out;
+
+	perf->bufpos += n;
+	*ppos += n;
+
+out:
+	mutex_unlock(&perf->read_lock);
+	if (ret)
+		return ret;
+	return n;
+}
+
+static int perf_open(struct inode *inode, struct file *file)
+{
+	struct msm_perf_state *perf = inode->i_private;
+	struct drm_device *dev = perf->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_gpu *gpu = priv->gpu;
+	int ret = 0;
+
+	mutex_lock(&dev->struct_mutex);
+
+	if (perf->open || !gpu) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	file->private_data = perf;
+	perf->open = true;
+	perf->cnt = 0;
+	perf->buftot = 0;
+	perf->bufpos = 0;
+	msm_gpu_perfcntr_start(gpu);
+	perf->next_jiffies = jiffies + SAMPLE_TIME;
+
+out:
+	mutex_unlock(&dev->struct_mutex);
+	return ret;
+}
+
+static int perf_release(struct inode *inode, struct file *file)
+{
+	struct msm_perf_state *perf = inode->i_private;
+	struct msm_drm_private *priv = perf->dev->dev_private;
+	msm_gpu_perfcntr_stop(priv->gpu);
+	perf->open = false;
+	return 0;
+}
+
+
+static const struct file_operations perf_debugfs_fops = {
+	.owner = THIS_MODULE,
+	.open = perf_open,
+	.read = perf_read,
+	.llseek = no_llseek,
+	.release = perf_release,
+};
+
+int msm_perf_debugfs_init(struct drm_minor *minor)
+{
+	struct msm_drm_private *priv = minor->dev->dev_private;
+	struct msm_perf_state *perf;
+
+	/* only create on first minor: */
+	if (priv->perf)
+		return 0;
+
+	perf = kzalloc(sizeof(*perf), GFP_KERNEL);
+	if (!perf)
+		return -ENOMEM;
+
+	perf->dev = minor->dev;
+
+	mutex_init(&perf->read_lock);
+	priv->perf = perf;
+
+	perf->node = kzalloc(sizeof(*perf->node), GFP_KERNEL);
+	if (!perf->node)
+		goto fail;
+
+	perf->ent = debugfs_create_file("perf", S_IFREG | S_IRUGO,
+			minor->debugfs_root, perf, &perf_debugfs_fops);
+	if (!perf->ent) {
+		DRM_ERROR("Cannot create /sys/kernel/debug/dri/%s/perf\n",
+				minor->debugfs_root->d_name.name);
+		goto fail;
+	}
+
+	perf->node->minor = minor;
+	perf->node->dent  = perf->ent;
+	perf->node->info_ent = NULL;
+
+	mutex_lock(&minor->debugfs_lock);
+	list_add(&perf->node->list, &minor->debugfs_list);
+	mutex_unlock(&minor->debugfs_lock);
+
+	return 0;
+
+fail:
+	msm_perf_debugfs_cleanup(minor);
+	return -1;
+}
+
+void msm_perf_debugfs_cleanup(struct drm_minor *minor)
+{
+	struct msm_drm_private *priv = minor->dev->dev_private;
+	struct msm_perf_state *perf = priv->perf;
+
+	if (!perf)
+		return;
+
+	priv->perf = NULL;
+
+	debugfs_remove(perf->ent);
+
+	if (perf->node) {
+		mutex_lock(&minor->debugfs_lock);
+		list_del(&perf->node->list);
+		mutex_unlock(&minor->debugfs_lock);
+		kfree(perf->node);
+	}
+
+	mutex_destroy(&perf->read_lock);
+
+	kfree(perf);
+}
+
+#endif
diff --git a/drivers/gpu/drm/msm/msm_rd.c b/drivers/gpu/drm/msm/msm_rd.c
new file mode 100644
index 00000000000..9a78c48817c
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_rd.c
@@ -0,0 +1,337 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* For debugging crashes, userspace can:
+ *
+ *   tail -f /sys/kernel/debug/dri/<minor>/rd > logfile.rd
+ *
+ * To log the cmdstream in a format that is understood by freedreno/cffdump
+ * utility.  By comparing the last successfully completed fence #, to the
+ * cmdstream for the next fence, you can narrow down which process and submit
+ * caused the gpu crash/lockup.
+ *
+ * This bypasses drm_debugfs_create_files() mainly because we need to use
+ * our own fops for a bit more control.  In particular, we don't want to
+ * do anything if userspace doesn't have the debugfs file open.
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/kfifo.h>
+#include <linux/debugfs.h>
+#include <linux/circ_buf.h>
+#include <linux/wait.h>
+
+#include "msm_drv.h"
+#include "msm_gpu.h"
+#include "msm_gem.h"
+
+enum rd_sect_type {
+	RD_NONE,
+	RD_TEST,       /* ascii text */
+	RD_CMD,        /* ascii text */
+	RD_GPUADDR,    /* u32 gpuaddr, u32 size */
+	RD_CONTEXT,    /* raw dump */
+	RD_CMDSTREAM,  /* raw dump */
+	RD_CMDSTREAM_ADDR, /* gpu addr of cmdstream */
+	RD_PARAM,      /* u32 param_type, u32 param_val, u32 bitlen */
+	RD_FLUSH,      /* empty, clear previous params */
+	RD_PROGRAM,    /* shader program, raw dump */
+	RD_VERT_SHADER,
+	RD_FRAG_SHADER,
+	RD_BUFFER_CONTENTS,
+	RD_GPU_ID,
+};
+
+#define BUF_SZ 512  /* should be power of 2 */
+
+/* space used: */
+#define circ_count(circ) \
+	(CIRC_CNT((circ)->head, (circ)->tail, BUF_SZ))
+#define circ_count_to_end(circ) \
+	(CIRC_CNT_TO_END((circ)->head, (circ)->tail, BUF_SZ))
+/* space available: */
+#define circ_space(circ) \
+	(CIRC_SPACE((circ)->head, (circ)->tail, BUF_SZ))
+#define circ_space_to_end(circ) \
+	(CIRC_SPACE_TO_END((circ)->head, (circ)->tail, BUF_SZ))
+
+struct msm_rd_state {
+	struct drm_device *dev;
+
+	bool open;
+
+	struct dentry *ent;
+	struct drm_info_node *node;
+
+	/* current submit to read out: */
+	struct msm_gem_submit *submit;
+
+	/* fifo access is synchronized on the producer side by
+	 * struct_mutex held by submit code (otherwise we could
+	 * end up w/ cmds logged in different order than they
+	 * were executed).  And read_lock synchronizes the reads
+	 */
+	struct mutex read_lock;
+
+	wait_queue_head_t fifo_event;
+	struct circ_buf fifo;
+
+	char buf[BUF_SZ];
+};
+
+static void rd_write(struct msm_rd_state *rd, const void *buf, int sz)
+{
+	struct circ_buf *fifo = &rd->fifo;
+	const char *ptr = buf;
+
+	while (sz > 0) {
+		char *fptr = &fifo->buf[fifo->head];
+		int n;
+
+		wait_event(rd->fifo_event, circ_space(&rd->fifo) > 0);
+
+		n = min(sz, circ_space_to_end(&rd->fifo));
+		memcpy(fptr, ptr, n);
+
+		fifo->head = (fifo->head + n) & (BUF_SZ - 1);
+		sz  -= n;
+		ptr += n;
+
+		wake_up_all(&rd->fifo_event);
+	}
+}
+
+static void rd_write_section(struct msm_rd_state *rd,
+		enum rd_sect_type type, const void *buf, int sz)
+{
+	rd_write(rd, &type, 4);
+	rd_write(rd, &sz, 4);
+	rd_write(rd, buf, sz);
+}
+
+static ssize_t rd_read(struct file *file, char __user *buf,
+		size_t sz, loff_t *ppos)
+{
+	struct msm_rd_state *rd = file->private_data;
+	struct circ_buf *fifo = &rd->fifo;
+	const char *fptr = &fifo->buf[fifo->tail];
+	int n = 0, ret = 0;
+
+	mutex_lock(&rd->read_lock);
+
+	ret = wait_event_interruptible(rd->fifo_event,
+			circ_count(&rd->fifo) > 0);
+	if (ret)
+		goto out;
+
+	n = min_t(int, sz, circ_count_to_end(&rd->fifo));
+	ret = copy_to_user(buf, fptr, n);
+	if (ret)
+		goto out;
+
+	fifo->tail = (fifo->tail + n) & (BUF_SZ - 1);
+	*ppos += n;
+
+	wake_up_all(&rd->fifo_event);
+
+out:
+	mutex_unlock(&rd->read_lock);
+	if (ret)
+		return ret;
+	return n;
+}
+
+static int rd_open(struct inode *inode, struct file *file)
+{
+	struct msm_rd_state *rd = inode->i_private;
+	struct drm_device *dev = rd->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_gpu *gpu = priv->gpu;
+	uint64_t val;
+	uint32_t gpu_id;
+	int ret = 0;
+
+	mutex_lock(&dev->struct_mutex);
+
+	if (rd->open || !gpu) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	file->private_data = rd;
+	rd->open = true;
+
+	/* the parsing tools need to know gpu-id to know which
+	 * register database to load.
+	 */
+	gpu->funcs->get_param(gpu, MSM_PARAM_GPU_ID, &val);
+	gpu_id = val;
+
+	rd_write_section(rd, RD_GPU_ID, &gpu_id, sizeof(gpu_id));
+
+out:
+	mutex_unlock(&dev->struct_mutex);
+	return ret;
+}
+
+static int rd_release(struct inode *inode, struct file *file)
+{
+	struct msm_rd_state *rd = inode->i_private;
+	rd->open = false;
+	return 0;
+}
+
+
+static const struct file_operations rd_debugfs_fops = {
+	.owner = THIS_MODULE,
+	.open = rd_open,
+	.read = rd_read,
+	.llseek = no_llseek,
+	.release = rd_release,
+};
+
+int msm_rd_debugfs_init(struct drm_minor *minor)
+{
+	struct msm_drm_private *priv = minor->dev->dev_private;
+	struct msm_rd_state *rd;
+
+	/* only create on first minor: */
+	if (priv->rd)
+		return 0;
+
+	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return -ENOMEM;
+
+	rd->dev = minor->dev;
+	rd->fifo.buf = rd->buf;
+
+	mutex_init(&rd->read_lock);
+	priv->rd = rd;
+
+	init_waitqueue_head(&rd->fifo_event);
+
+	rd->node = kzalloc(sizeof(*rd->node), GFP_KERNEL);
+	if (!rd->node)
+		goto fail;
+
+	rd->ent = debugfs_create_file("rd", S_IFREG | S_IRUGO,
+			minor->debugfs_root, rd, &rd_debugfs_fops);
+	if (!rd->ent) {
+		DRM_ERROR("Cannot create /sys/kernel/debug/dri/%s/rd\n",
+				minor->debugfs_root->d_name.name);
+		goto fail;
+	}
+
+	rd->node->minor = minor;
+	rd->node->dent  = rd->ent;
+	rd->node->info_ent = NULL;
+
+	mutex_lock(&minor->debugfs_lock);
+	list_add(&rd->node->list, &minor->debugfs_list);
+	mutex_unlock(&minor->debugfs_lock);
+
+	return 0;
+
+fail:
+	msm_rd_debugfs_cleanup(minor);
+	return -1;
+}
+
+void msm_rd_debugfs_cleanup(struct drm_minor *minor)
+{
+	struct msm_drm_private *priv = minor->dev->dev_private;
+	struct msm_rd_state *rd = priv->rd;
+
+	if (!rd)
+		return;
+
+	priv->rd = NULL;
+
+	debugfs_remove(rd->ent);
+
+	if (rd->node) {
+		mutex_lock(&minor->debugfs_lock);
+		list_del(&rd->node->list);
+		mutex_unlock(&minor->debugfs_lock);
+		kfree(rd->node);
+	}
+
+	mutex_destroy(&rd->read_lock);
+
+	kfree(rd);
+}
+
+/* called under struct_mutex */
+void msm_rd_dump_submit(struct msm_gem_submit *submit)
+{
+	struct drm_device *dev = submit->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_rd_state *rd = priv->rd;
+	char msg[128];
+	int i, n;
+
+	if (!rd->open)
+		return;
+
+	/* writing into fifo is serialized by caller, and
+	 * rd->read_lock is used to serialize the reads
+	 */
+	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
+
+	n = snprintf(msg, sizeof(msg), "%.*s/%d: fence=%u",
+			TASK_COMM_LEN, current->comm, task_pid_nr(current),
+			submit->fence);
+
+	rd_write_section(rd, RD_CMD, msg, ALIGN(n, 4));
+
+	/* could be nice to have an option (module-param?) to snapshot
+	 * all the bo's associated with the submit.  Handy to see vtx
+	 * buffers, etc.  For now just the cmdstream bo's is enough.
+	 */
+
+	for (i = 0; i < submit->nr_cmds; i++) {
+		uint32_t idx  = submit->cmd[i].idx;
+		uint32_t iova = submit->cmd[i].iova;
+		uint32_t szd  = submit->cmd[i].size; /* in dwords */
+		struct msm_gem_object *obj = submit->bos[idx].obj;
+		const char *buf = msm_gem_vaddr_locked(&obj->base);
+
+		buf += iova - submit->bos[idx].iova;
+
+		rd_write_section(rd, RD_GPUADDR,
+				(uint32_t[2]){ iova, szd * 4 }, 8);
+		rd_write_section(rd, RD_BUFFER_CONTENTS,
+				buf, szd * 4);
+
+		switch (submit->cmd[i].type) {
+		case MSM_SUBMIT_CMD_IB_TARGET_BUF:
+			/* ignore IB-targets, we've logged the buffer, the
+			 * parser tool will follow the IB based on the logged
+			 * buffer/gpuaddr, so nothing more to do.
+			 */
+			break;
+		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
+		case MSM_SUBMIT_CMD_BUF:
+			rd_write_section(rd, RD_CMDSTREAM_ADDR,
+					(uint32_t[2]){ iova, szd }, 8);
+			break;
+		}
+	}
+}
+#endif