Merge tag 'drm-intel-fixes-2015-07-15' into drm-intel-next-queued
[linux-drm-fsl-dcu.git] / drivers/gpu/drm/i915/intel_lrc.c
index 22e9f85f40e43dd6ed8c5c370a5bcd9a4db10916..9faad82c42ecd05fe599072bcaf25d49d05ae13b 100644 (file)
 #include <drm/drmP.h>
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
+#include "intel_mocs.h"
 
 #define GEN9_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE)
 #define GEN8_LR_CONTEXT_RENDER_SIZE (20 * PAGE_SIZE)
@@ -211,8 +212,7 @@ enum {
 #define GEN8_CTX_ID_SHIFT 32
 #define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17
 
-static int intel_lr_context_pin(struct intel_engine_cs *ring,
-               struct intel_context *ctx);
+static int intel_lr_context_pin(struct drm_i915_gem_request *rq);
 
 /**
  * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists
@@ -262,10 +262,11 @@ u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
        return lrca >> 12;
 }
 
-static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
-                                        struct drm_i915_gem_object *ctx_obj)
+static uint64_t execlists_ctx_descriptor(struct drm_i915_gem_request *rq)
 {
+       struct intel_engine_cs *ring = rq->ring;
        struct drm_device *dev = ring->dev;
+       struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
        uint64_t desc;
        uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj);
 
@@ -293,55 +294,59 @@ static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
        return desc;
 }
 
-static void execlists_elsp_write(struct intel_engine_cs *ring,
-                                struct drm_i915_gem_object *ctx_obj0,
-                                struct drm_i915_gem_object *ctx_obj1)
+static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
+                                struct drm_i915_gem_request *rq1)
 {
+       struct intel_engine_cs *ring = rq0->ring;
        struct drm_device *dev = ring->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
-       uint64_t temp = 0;
-       uint32_t desc[4];
+       uint64_t desc[2];
 
-       /* XXX: You must always write both descriptors in the order below. */
-       if (ctx_obj1)
-               temp = execlists_ctx_descriptor(ring, ctx_obj1);
-       else
-               temp = 0;
-       desc[1] = (u32)(temp >> 32);
-       desc[0] = (u32)temp;
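+       /* Track how many times each request has been submitted to the ELSP */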
+       if (rq1) {
+               desc[1] = execlists_ctx_descriptor(rq1);
+               rq1->elsp_submitted++;
+       } else {
+               desc[1] = 0;
+       }
 
-       temp = execlists_ctx_descriptor(ring, ctx_obj0);
-       desc[3] = (u32)(temp >> 32);
-       desc[2] = (u32)temp;
+       desc[0] = execlists_ctx_descriptor(rq0);
+       rq0->elsp_submitted++;
 
+       /* You must always write both descriptors in the order below. */
        spin_lock(&dev_priv->uncore.lock);
        intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
-       I915_WRITE_FW(RING_ELSP(ring), desc[1]);
-       I915_WRITE_FW(RING_ELSP(ring), desc[0]);
-       I915_WRITE_FW(RING_ELSP(ring), desc[3]);
+       I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1]));
+       I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1]));
 
+       I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[0]));
        /* The context is automatically loaded after the following */
-       I915_WRITE_FW(RING_ELSP(ring), desc[2]);
+       I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[0]));
 
-       /* ELSP is a wo register, so use another nearby reg for posting instead */
+       /* ELSP is a write-only register, use another nearby reg for posting */
        POSTING_READ_FW(RING_EXECLIST_STATUS(ring));
        intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
        spin_unlock(&dev_priv->uncore.lock);
 }
 
-static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
-                                   struct drm_i915_gem_object *ring_obj,
-                                   struct i915_hw_ppgtt *ppgtt,
-                                   u32 tail)
+static int execlists_update_context(struct drm_i915_gem_request *rq)
 {
+       struct intel_engine_cs *ring = rq->ring;
+       struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
+       struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
+       struct drm_i915_gem_object *rb_obj = rq->ringbuf->obj;
        struct page *page;
        uint32_t *reg_state;
 
+       BUG_ON(!ctx_obj);
+       WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
+       WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
+
        page = i915_gem_object_get_page(ctx_obj, 1);
        reg_state = kmap_atomic(page);
 
-       reg_state[CTX_RING_TAIL+1] = tail;
-       reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(ring_obj);
+       reg_state[CTX_RING_TAIL+1] = rq->tail;
+       reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj);
 
        /* True PPGTT with dynamic page allocation: update PDP registers and
         * point the unallocated PDPs to the scratch page
@@ -358,32 +363,15 @@ static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
        return 0;
 }
 
-static void execlists_submit_contexts(struct intel_engine_cs *ring,
-                                     struct intel_context *to0, u32 tail0,
-                                     struct intel_context *to1, u32 tail1)
+static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
+                                     struct drm_i915_gem_request *rq1)
 {
-       struct drm_i915_gem_object *ctx_obj0 = to0->engine[ring->id].state;
-       struct intel_ringbuffer *ringbuf0 = to0->engine[ring->id].ringbuf;
-       struct drm_i915_gem_object *ctx_obj1 = NULL;
-       struct intel_ringbuffer *ringbuf1 = NULL;
+       execlists_update_context(rq0);
 
-       BUG_ON(!ctx_obj0);
-       WARN_ON(!i915_gem_obj_is_pinned(ctx_obj0));
-       WARN_ON(!i915_gem_obj_is_pinned(ringbuf0->obj));
+       if (rq1)
+               execlists_update_context(rq1);
 
-       execlists_update_context(ctx_obj0, ringbuf0->obj, to0->ppgtt, tail0);
-
-       if (to1) {
-               ringbuf1 = to1->engine[ring->id].ringbuf;
-               ctx_obj1 = to1->engine[ring->id].state;
-               BUG_ON(!ctx_obj1);
-               WARN_ON(!i915_gem_obj_is_pinned(ctx_obj1));
-               WARN_ON(!i915_gem_obj_is_pinned(ringbuf1->obj));
-
-               execlists_update_context(ctx_obj1, ringbuf1->obj, to1->ppgtt, tail1);
-       }
-
-       execlists_elsp_write(ring, ctx_obj0, ctx_obj1);
+       execlists_elsp_write(rq0, rq1);
 }
 
 static void execlists_context_unqueue(struct intel_engine_cs *ring)
@@ -443,13 +431,7 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
 
        WARN_ON(req1 && req1->elsp_submitted);
 
-       execlists_submit_contexts(ring, req0->ctx, req0->tail,
-                                 req1 ? req1->ctx : NULL,
-                                 req1 ? req1->tail : 0);
-
-       req0->elsp_submitted++;
-       if (req1)
-               req1->elsp_submitted++;
+       execlists_submit_requests(req0, req1);
 }
 
 static bool execlists_check_remove_request(struct intel_engine_cs *ring,
@@ -549,7 +531,7 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
        int num_elements = 0;
 
        if (request->ctx != ring->default_context)
-               intel_lr_context_pin(ring, request->ctx);
+               intel_lr_context_pin(request);
 
        i915_gem_request_reference(request);
 
@@ -641,14 +623,14 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 {
        int ret;
 
+       request->ringbuf = request->ctx->engine[request->ring->id].ringbuf;
+
        if (request->ctx != request->ring->default_context) {
-               ret = intel_lr_context_pin(request->ring, request->ctx);
+               ret = intel_lr_context_pin(request);
                if (ret)
                        return ret;
        }
 
-       request->ringbuf = request->ctx->engine[request->ring->id].ringbuf;
-
        return 0;
 }
 
@@ -791,8 +773,7 @@ static int logical_ring_prepare(struct drm_i915_gem_request *req, int bytes)
  *
  * Return: non-zero if the ringbuffer is not ready to be written to.
  */
-static int intel_logical_ring_begin(struct drm_i915_gem_request *req,
-                                   int num_dwords)
+int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 {
        struct drm_i915_private *dev_priv;
        int ret;
@@ -958,7 +939,7 @@ void intel_execlists_retire_requests(struct intel_engine_cs *ring)
                                ctx->engine[ring->id].state;
 
                if (ctx_obj && (ctx != ring->default_context))
-                       intel_lr_context_unpin(ring, ctx);
+                       intel_lr_context_unpin(req);
                list_del(&req->execlist_link);
                i915_gem_request_unreference(req);
        }
@@ -1002,15 +983,15 @@ int logical_ring_flush_all_caches(struct drm_i915_gem_request *req)
        return 0;
 }
 
-static int intel_lr_context_pin(struct intel_engine_cs *ring,
-               struct intel_context *ctx)
+static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
 {
-       struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
-       struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf;
+       struct intel_engine_cs *ring = rq->ring;
+       struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
+       struct intel_ringbuffer *ringbuf = rq->ringbuf;
        int ret = 0;
 
        WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
-       if (ctx->engine[ring->id].pin_count++ == 0) {
+       if (rq->ctx->engine[ring->id].pin_count++ == 0) {
                ret = i915_gem_obj_ggtt_pin(ctx_obj,
                                GEN8_LR_CONTEXT_ALIGN, 0);
                if (ret)
@@ -1026,20 +1007,20 @@ static int intel_lr_context_pin(struct intel_engine_cs *ring,
 unpin_ctx_obj:
        i915_gem_object_ggtt_unpin(ctx_obj);
 reset_pin_count:
-       ctx->engine[ring->id].pin_count = 0;
+       rq->ctx->engine[ring->id].pin_count = 0;
 
        return ret;
 }
 
-void intel_lr_context_unpin(struct intel_engine_cs *ring,
-               struct intel_context *ctx)
+void intel_lr_context_unpin(struct drm_i915_gem_request *rq)
 {
-       struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
-       struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf;
+       struct intel_engine_cs *ring = rq->ring;
+       struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
+       struct intel_ringbuffer *ringbuf = rq->ringbuf;
 
        if (ctx_obj) {
                WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
-               if (--ctx->engine[ring->id].pin_count == 0) {
+               if (--rq->ctx->engine[ring->id].pin_count == 0) {
                        intel_unpin_ringbuffer_obj(ringbuf);
                        i915_gem_object_ggtt_unpin(ctx_obj);
                }
@@ -1084,14 +1065,74 @@ static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
        return 0;
 }
 
-#define wa_ctx_emit(batch, cmd)                                                \
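+/*
+ * wa_ctx_emit() appends one dword to the WA batch at the caller-supplied
+ * index (post-incremented); overflowing the single batch page makes the
+ * enclosing function return -ENOSPC.
+ */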
+#define wa_ctx_emit(batch, index, cmd)                                 \
        do {                                                            \
-               if (WARN_ON(index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
+               int __index = (index)++;                                \
+               if (WARN_ON(__index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
                        return -ENOSPC;                                 \
                }                                                       \
-               batch[index++] = (cmd);                                 \
+               batch[__index] = (cmd);                                 \
        } while (0)
 
+
+/*
+ * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
+ * PIPE_CONTROL instruction. This is required for the flush to happen
+ * correctly, but there is a slight complication: the WA batch is only
+ * initialized once, so we cannot read the register value at the start and
+ * reuse it later. Instead we save the register value to memory, upload a
+ * constant value with bit 21 set, and afterwards restore the saved value.
+ * To keep the WA simple, the constant is formed from the default value of
+ * this register. This should not be a problem because we only modify it
+ * for a short period and the batch is non-preemptible. We could of course
+ * use additional instructions that read the actual register value at that
+ * point and set our bit of interest, but that would complicate the WA.
+ *
+ * This WA is also required for Gen9, so it is extracted into a function to
+ * avoid code duplication.
+ */
+static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *ring,
+                                               uint32_t *const batch,
+                                               uint32_t index)
+{
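+       /* 0x40400000 is the default GEN8_L3SQCREG4 value (see comment above) */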
+       uint32_t l3sqc4_flush = (0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES);
+
+       /*
+        * WaDisableLSQCROPERFforOCL:skl
+        * This WA is implemented in skl_init_clock_gating() but since
+        * this batch updates GEN8_L3SQCREG4 with default value we need to
+        * set this bit here to retain the WA during flush.
+        */
+       if (IS_SKYLAKE(ring->dev) && INTEL_REVID(ring->dev) <= SKL_REVID_E0)
+               l3sqc4_flush |= GEN8_LQSC_RO_PERF_DIS;
+
+       wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8(1) |
+                                  MI_SRM_LRM_GLOBAL_GTT));
+       wa_ctx_emit(batch, index, GEN8_L3SQCREG4);
+       wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256);
+       wa_ctx_emit(batch, index, 0);
+
+       wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
+       wa_ctx_emit(batch, index, GEN8_L3SQCREG4);
+       wa_ctx_emit(batch, index, l3sqc4_flush);
+
+       wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
+       wa_ctx_emit(batch, index, (PIPE_CONTROL_CS_STALL |
+                                  PIPE_CONTROL_DC_FLUSH_ENABLE));
+       wa_ctx_emit(batch, index, 0);
+       wa_ctx_emit(batch, index, 0);
+       wa_ctx_emit(batch, index, 0);
+       wa_ctx_emit(batch, index, 0);
+
+       wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8(1) |
+                                  MI_SRM_LRM_GLOBAL_GTT));
+       wa_ctx_emit(batch, index, GEN8_L3SQCREG4);
+       wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256);
+       wa_ctx_emit(batch, index, 0);
+
+       return index;
+}
+
 static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx,
                                    uint32_t offset,
                                    uint32_t start_alignment)
@@ -1148,48 +1189,32 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
        uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
 
        /* WaDisableCtxRestoreArbitration:bdw,chv */
-       wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_DISABLE);
+       wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE);
 
        /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
        if (IS_BROADWELL(ring->dev)) {
-               struct drm_i915_private *dev_priv = to_i915(ring->dev);
-               uint32_t l3sqc4_flush = (I915_READ(GEN8_L3SQCREG4) |
-                                        GEN8_LQSC_FLUSH_COHERENT_LINES);
-
-               wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1));
-               wa_ctx_emit(batch, GEN8_L3SQCREG4);
-               wa_ctx_emit(batch, l3sqc4_flush);
-
-               wa_ctx_emit(batch, GFX_OP_PIPE_CONTROL(6));
-               wa_ctx_emit(batch, (PIPE_CONTROL_CS_STALL |
-                                   PIPE_CONTROL_DC_FLUSH_ENABLE));
-               wa_ctx_emit(batch, 0);
-               wa_ctx_emit(batch, 0);
-               wa_ctx_emit(batch, 0);
-               wa_ctx_emit(batch, 0);
-
-               wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1));
-               wa_ctx_emit(batch, GEN8_L3SQCREG4);
-               wa_ctx_emit(batch, l3sqc4_flush & ~GEN8_LQSC_FLUSH_COHERENT_LINES);
+               int rc = gen8_emit_flush_coherentl3_wa(ring, batch, index);
+               if (rc < 0)
+                       return rc;
+               index = rc;
        }
 
        /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
        /* Actual scratch location is at 128 bytes offset */
        scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
 
-       wa_ctx_emit(batch, GFX_OP_PIPE_CONTROL(6));
-       wa_ctx_emit(batch, (PIPE_CONTROL_FLUSH_L3 |
-                           PIPE_CONTROL_GLOBAL_GTT_IVB |
-                           PIPE_CONTROL_CS_STALL |
-                           PIPE_CONTROL_QW_WRITE));
-       wa_ctx_emit(batch, scratch_addr);
-       wa_ctx_emit(batch, 0);
-       wa_ctx_emit(batch, 0);
-       wa_ctx_emit(batch, 0);
+       wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
+       wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 |
+                                  PIPE_CONTROL_GLOBAL_GTT_IVB |
+                                  PIPE_CONTROL_CS_STALL |
+                                  PIPE_CONTROL_QW_WRITE));
+       wa_ctx_emit(batch, index, scratch_addr);
+       wa_ctx_emit(batch, index, 0);
+       wa_ctx_emit(batch, index, 0);
+       wa_ctx_emit(batch, index, 0);
 
        /* Pad to end of cacheline */
        while (index % CACHELINE_DWORDS)
-               wa_ctx_emit(batch, MI_NOOP);
+               wa_ctx_emit(batch, index, MI_NOOP);
 
        /*
         * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
@@ -1225,9 +1250,64 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
        uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
 
        /* WaDisableCtxRestoreArbitration:bdw,chv */
-       wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE);
+       wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);
+
+       wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
+
+       return wa_ctx_end(wa_ctx, *offset = index, 1);
+}
+
+static int gen9_init_indirectctx_bb(struct intel_engine_cs *ring,
+                                   struct i915_wa_ctx_bb *wa_ctx,
+                                   uint32_t *const batch,
+                                   uint32_t *offset)
+{
+       int ret;
+       struct drm_device *dev = ring->dev;
+       uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
+
+       /* WaDisableCtxRestoreArbitration:skl,bxt */
+       if ((IS_SKYLAKE(dev) && (INTEL_REVID(dev) <= SKL_REVID_D0)) ||
+           (IS_BROXTON(dev) && (INTEL_REVID(dev) == BXT_REVID_A0)))
+               wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE);
 
-       wa_ctx_emit(batch, MI_BATCH_BUFFER_END);
+       /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt */
+       ret = gen8_emit_flush_coherentl3_wa(ring, batch, index);
+       if (ret < 0)
+               return ret;
+       index = ret;
+
+       /* Pad to end of cacheline */
+       while (index % CACHELINE_DWORDS)
+               wa_ctx_emit(batch, index, MI_NOOP);
+
+       return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS);
+}
+
+static int gen9_init_perctx_bb(struct intel_engine_cs *ring,
+                              struct i915_wa_ctx_bb *wa_ctx,
+                              uint32_t *const batch,
+                              uint32_t *offset)
+{
+       struct drm_device *dev = ring->dev;
+       uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
+
+       /* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */
+       if ((IS_SKYLAKE(dev) && (INTEL_REVID(dev) <= SKL_REVID_B0)) ||
+           (IS_BROXTON(dev) && (INTEL_REVID(dev) == BXT_REVID_A0))) {
+               wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
+               wa_ctx_emit(batch, index, GEN9_SLICE_COMMON_ECO_CHICKEN0);
+               wa_ctx_emit(batch, index,
+                           _MASKED_BIT_ENABLE(DISABLE_PIXEL_MASK_CAMMING));
+               wa_ctx_emit(batch, index, MI_NOOP);
+       }
+
+       /* WaDisableCtxRestoreArbitration:skl,bxt */
+       if ((IS_SKYLAKE(dev) && (INTEL_REVID(dev) <= SKL_REVID_D0)) ||
+           (IS_BROXTON(dev) && (INTEL_REVID(dev) == BXT_REVID_A0)))
+               wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);
+
+       wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
 
        return wa_ctx_end(wa_ctx, *offset = index, 1);
 }
@@ -1273,10 +1353,11 @@ static int intel_init_workaround_bb(struct intel_engine_cs *ring)
        WARN_ON(ring->id != RCS);
 
        /* update this when WA for higher Gen are added */
-       if (WARN(INTEL_INFO(ring->dev)->gen > 8,
-                "WA batch buffer is not initialized for Gen%d\n",
-                INTEL_INFO(ring->dev)->gen))
+       if (INTEL_INFO(ring->dev)->gen > 9) {
+               DRM_ERROR("WA batch buffer is not initialized for Gen%d\n",
+                         INTEL_INFO(ring->dev)->gen);
                return 0;
+       }
 
        /* some WA perform writes to scratch page, ensure it is valid */
        if (ring->scratch.obj == NULL) {
@@ -1308,6 +1389,20 @@ static int intel_init_workaround_bb(struct intel_engine_cs *ring)
                                          &offset);
                if (ret)
                        goto out;
+       } else if (INTEL_INFO(ring->dev)->gen == 9) {
+               ret = gen9_init_indirectctx_bb(ring,
+                                              &wa_ctx->indirect_ctx,
+                                              batch,
+                                              &offset);
+               if (ret)
+                       goto out;
+
+               ret = gen9_init_perctx_bb(ring,
+                                         &wa_ctx->per_ctx,
+                                         batch,
+                                         &offset);
+               if (ret)
+                       goto out;
        }
 
 out:
@@ -1326,6 +1421,12 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
        I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask));
        I915_WRITE(RING_HWSTAM(ring->mmio_base), 0xffffffff);
 
+       if (ring->status_page.obj) {
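+               /* Re-program the status page address, e.g. after a reset */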
+               I915_WRITE(RING_HWS_PGA(ring->mmio_base),
+                          (u32)ring->status_page.gfx_addr);
+               POSTING_READ(RING_HWS_PGA(ring->mmio_base));
+       }
+
        I915_WRITE(RING_MODE_GEN7(ring),
                   _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
                   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
@@ -1426,7 +1527,10 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
                return ret;
 
        /* FIXME(BDW): Address space and security selectors. */
-       intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8));
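+       /* Enable the resource streamer for this batch when requested */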
+       intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_START_GEN8 |
+                               (ppgtt<<8) |
+                               (dispatch_flags & I915_DISPATCH_RS ?
+                                MI_BATCH_RESOURCE_STREAMER : 0));
        intel_logical_ring_emit(ringbuf, lower_32_bits(offset));
        intel_logical_ring_emit(ringbuf, upper_32_bits(offset));
        intel_logical_ring_emit(ringbuf, MI_NOOP);
@@ -1651,6 +1755,14 @@ static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
        if (ret)
                return ret;
 
+       ret = intel_rcs_context_init_mocs(req);
+       /*
+        * Failing to program the MOCS is non-fatal: the system will not
+        * run at peak performance, so log an error and carry on.
+        */
+       if (ret)
+               DRM_ERROR("MOCS failed to program: expect performance issues.\n");
+
        return intel_lr_context_render_state_init(req);
 }
 
@@ -2019,7 +2131,8 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
        reg_state[CTX_CONTEXT_CONTROL] = RING_CONTEXT_CONTROL(ring);
        reg_state[CTX_CONTEXT_CONTROL+1] =
                _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
-                               CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
+                                  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
+                                  CTX_CTRL_RS_CTX_ENABLE);
        reg_state[CTX_RING_HEAD] = RING_HEAD(ring->mmio_base);
        reg_state[CTX_RING_HEAD+1] = 0;
        reg_state[CTX_RING_TAIL] = RING_TAIL(ring->mmio_base);